# Let's get started

### Import the required Python libraries

In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import DistanceMetric
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score
# Widen pandas' console/notebook rendering so that the feature matrices printed
# by the cells below are shown without being truncated too early:
# up to 200 rows, up to 50 columns, and a total display width of 1080 characters.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 50
pd.options.display.width = 1080

###Create and fill the feature vector



In [43]:
# Letters that act as features. Two broader candidate alphabets are kept
# (commented out) so they can be swapped in for comparison experiments.
#alphabets = 'ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی'
#alphabets = 'ابپتثجچحخدذرزژسشصضطظعغفقکكگلمنوهیيئءأإآة'
alphabets = 'پچژكگیيئءأإآة'

# One-row, all-zero frame (row label 1) with one column per letter, in reversed
# string order. The feature-building cells in this file iterate over its column
# labels to know which letters to look for.
feature_vector = pd.DataFrame(
    0,
    index=pd.RangeIndex(1, 2),
    columns=list(alphabets[::-1]),
)

# Load the dataset from the Excel workbook.
# Column 0 is read as the sentence text by the feature cells; column 1 is taken
# here as the label of each row.
xlsx_file_path = 'dataset-1.xlsx'
main_df = pd.read_excel(xlsx_file_path)

# Shift the label index by one so it lines up with the feature matrices, whose
# index is shifted by one in the same way.
labels = main_df.iloc[:, 1]
labels.index += 1

## Attention: run exactly one of the feature-vector cells below before each prediction run. Every one of them overwrites `feature_matrix`, so only the cell executed last is used.

###Creating the feature vector using the Binary BoW

In [None]:
# Binary bag-of-letters: feature_matrix[row, letter] is 1 when the letter occurs
# at least once in the row's sentence and 0 otherwise.

# Sentences come from the first column; every cell is passed through str().
sentences = [str(value) for value in main_df.iloc[:, 0]]
letters = list(alphabets)

# Fill the presence flags one letter (column) at a time; the boolean results of
# the membership test are stored as 0/1 integers.
presence = np.zeros((len(sentences), len(letters)), dtype=np.int64)
for position, letter in enumerate(letters):
    presence[:, position] = [letter in text for text in sentences]

# Row labels are shifted by one so the index starts at one instead of zero.
feature_matrix = pd.DataFrame(presence, index=main_df.index + 1, columns=letters)

# Display the resulting feature matrix
feature_matrix

###Creating the feature vector using Weighted BoW

In [None]:
# Weighted bag-of-letters: feature_matrix[row, letter] is the number of times the
# letter occurs in the row's sentence.

# Sentences come from the first column; every cell is passed through str().
sentences = [str(value) for value in main_df.iloc[:, 0]]
letters = list(alphabets)

# One list of per-letter counts for every sentence. The explicit reshape keeps
# the array two-dimensional even when there are no rows.
occurrences = np.array(
    [[text.count(letter) for letter in letters] for text in sentences],
    dtype=np.int64,
).reshape(len(sentences), len(letters))

# Row labels are shifted by one so the index starts at one instead of zero.
feature_matrix = pd.DataFrame(occurrences, index=main_df.index + 1, columns=letters)

# Display the resulting weighted feature matrix
print(feature_matrix)

###Creating the feature vector using Length Normalized Weighted BoW

In [None]:
# Length-normalised weighted bag-of-letters:
# feature_matrix[row, letter] = (occurrences of letter in the sentence) / (sentence length).
#
# The matrix is built as float64 from the start. Writing fractional values into
# a frame that was initialised with integer zeros relies on implicit dtype
# upcasting during item assignment, which recent pandas versions deprecate.

# Sentences come from the first column; every cell is passed through str().
sentences = [str(value) for value in main_df.iloc[:, 0]]
letters = list(alphabets)

# Raw per-letter counts, one row per sentence (reshape keeps it 2-D when empty).
counts = np.array(
    [[text.count(letter) for letter in letters] for text in sentences],
    dtype=np.float64,
).reshape(len(sentences), len(letters))

# Sentence lengths in characters. An empty sentence has all-zero counts, so
# dividing by 1 instead of 0 keeps its row at zero and avoids a division by zero.
lengths = np.array([len(text) for text in sentences], dtype=np.float64)
safe_lengths = np.where(lengths > 0, lengths, 1.0)

# Divide every row by its own sentence length; row labels are shifted by one so
# the index starts at one instead of zero.
feature_matrix = pd.DataFrame(
    counts / safe_lengths[:, np.newaxis],
    index=main_df.index + 1,
    columns=letters,
)

# Display the resulting normalized feature matrix
print(feature_matrix)

###Creating the feature vector using Z-Score Normalized Weighted BoW

In [None]:
# Z-score normalised weighted bag-of-letters: raw letter counts are standardised
# column by column with StandardScaler.
# NOTE(review): the scaler is fitted on every row, including the rows that the
# split cell later holds out as test data — confirm this is intended.

# Sentences come from the first column; every cell is passed through str().
sentences = [str(value) for value in main_df.iloc[:, 0]]
letters = list(alphabets)

# Raw per-letter counts, one row per sentence (reshape keeps it 2-D when empty).
raw_counts = np.array(
    [[text.count(letter) for letter in letters] for text in sentences],
    dtype=np.int64,
).reshape(len(sentences), len(letters))
count_frame = pd.DataFrame(raw_counts, index=main_df.index, columns=letters)

# Standardize the counts and wrap the resulting array back into a labelled
# frame. Row labels are shifted by one so the index starts at one instead of zero.
scaler = StandardScaler()
feature_matrix = pd.DataFrame(
    scaler.fit_transform(count_frame),
    index=count_frame.index + 1,
    columns=count_frame.columns,
)

# Display the resulting Z-score normalized feature matrix
print(feature_matrix)


### Creating the feature vector using the TF-IDF (Term Frequency-Inverse Document Frequency) transformation

In [None]:
# TF-IDF bag-of-letters: every sentence becomes a vector of TF-IDF weights over
# the letters in `alphabets`.
# NOTE(review): the vectorizer is fitted on every row, including the rows that
# the split cell later holds out as test data — confirm this is intended.

# Convert sentences (first column) into a list of strings
sentences = main_df.iloc[:, 0].astype(str).tolist()

# Character-level vectorizer restricted to the chosen letters. Passing the
# vocabulary as a list fixes the output column order to the order of `alphabets`.
tfidf_vectorizer = TfidfVectorizer(analyzer='char', vocabulary=list(alphabets))

# Fit and transform the sentences using TF-IDF (returns a sparse matrix)
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

# Build the float feature matrix directly from the dense TF-IDF array instead of
# assigning it into an integer-initialised frame, which would rely on implicit
# dtype upcasting during assignment. Row labels are shifted by one so the index
# starts at one instead of zero.
feature_matrix = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=main_df.index + 1,
    columns=list(alphabets),
)

# Display the resulting TF-IDF transformed feature matrix
print(feature_matrix)

## Now we have to split the data into training data and test data


In [45]:
# Deterministic hold-out split: every row whose (one-based) label is a multiple
# of 10 goes to the test set, all remaining rows form the training set.
num_rows = len(feature_matrix)

# Labels 10, 20, 30, ... up to and including num_rows
test_indices = list(range(10, num_rows + 1, 10))

# Test portion: pick the selected labels from the features and from the labels
X_test = feature_matrix.loc[test_indices]
y_test = labels.loc[test_indices]

# Training portion: everything that is left after removing the test labels
X_train = feature_matrix.drop(index=test_indices)
y_train = labels.drop(index=test_indices)

##Using k-NN classifiers for k = 1, 3, 5 with Euclidean metric

In [None]:
# Evaluate k-NN with the Euclidean metric on the held-out rows, once for each
# neighbourhood size k in 1, 3, 5.
for k in (1, 3, 5):
    # fit() returns the estimator itself, so construction and training are chained.
    knn_classifier = KNeighborsClassifier(n_neighbors=k, metric='euclidean').fit(X_train, y_train)

    # Predicted labels for the test rows and the share of them that are correct
    y_pred = knn_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Report: header, predictions next to the true labels, accuracy and error rate
    print(
        f"\nResults for k = {k} (Euclidean metric):",
        f"Predicted Labels: {y_pred}",
        f"True Labels     : {y_test.values}",
        f"Accuracy: {accuracy:.2%}",
        f"Error   : {1 - accuracy:.2%}",
        sep="\n",
    )

##Using k-NN classifiers for k = 1, 3, 5 with Cosine metric

In [None]:
# Evaluate k-NN with the Cosine metric on the held-out rows, once for each
# neighbourhood size k in 1, 3, 5.
for k in (1, 3, 5):
    # fit() returns the estimator itself, so construction and training are chained.
    knn_classifier = KNeighborsClassifier(n_neighbors=k, metric='cosine').fit(X_train, y_train)

    # Predicted labels for the test rows and the share of them that are correct
    y_pred = knn_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Report: header, predictions next to the true labels, accuracy and error rate
    print(
        f"\nResults for k = {k} (Cosine metric):",
        f"Predicted Labels: {y_pred}",
        f"True Labels     : {y_test.values}",
        f"Accuracy: {accuracy:.2%}",
        f"Error   : {1 - accuracy:.2%}",
        sep="\n",
    )

##Using k-NN classifiers for k = 1, 3, 5 with Correlation metric

In [None]:
# Evaluate k-NN with the Correlation metric on the held-out rows, once for each
# neighbourhood size k in 1, 3 and 5.
# NOTE(review): correlation distance is undefined for constant rows (for example
# an all-zero feature vector) — confirm how such rows should be treated.
for k in (1, 3, 5):
    # fit() returns the estimator itself, so construction and training are chained.
    knn_classifier = KNeighborsClassifier(n_neighbors=k, metric='correlation').fit(X_train, y_train)

    # Predicted labels for the test rows and the share of them that are correct
    y_pred = knn_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Report: header, predictions next to the true labels, accuracy and error rate
    print(
        f"\nResults for k = {k} (Correlation metric):",
        f"Predicted Labels: {y_pred}",
        f"True Labels     : {y_test.values}",
        f"Accuracy: {accuracy:.2%}",
        f"Error   : {1 - accuracy:.2%}",
        sep="\n",
    )

##Using k-NN classifiers for k = 1, 3, 5 with Manhattan metric

In [None]:
# Evaluate k-NN with the Manhattan metric on the held-out rows, once for each
# neighbourhood size k in 1, 3 and 5.
for k in (1, 3, 5):
    # fit() returns the estimator itself, so construction and training are chained.
    knn_classifier = KNeighborsClassifier(n_neighbors=k, metric='manhattan').fit(X_train, y_train)

    # Predicted labels for the test rows and the share of them that are correct
    y_pred = knn_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Report: header, predictions next to the true labels, accuracy and error rate
    print(
        f"\nResults for k = {k} (Manhattan metric):",
        f"Predicted Labels: {y_pred}",
        f"True Labels     : {y_test.values}",
        f"Accuracy: {accuracy:.2%}",
        f"Error   : {1 - accuracy:.2%}",
        sep="\n",
    )

##Using k-NN classifiers for k = 1, 3, 5 with Chebyshev metric

In [None]:
# Evaluate k-NN with the Chebyshev metric on the held-out rows, once for each
# neighbourhood size k in 1, 3 and 5.
for k in (1, 3, 5):
    # fit() returns the estimator itself, so construction and training are chained.
    knn_classifier = KNeighborsClassifier(n_neighbors=k, metric='chebyshev').fit(X_train, y_train)

    # Predicted labels for the test rows and the share of them that are correct
    y_pred = knn_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Report: header, predictions next to the true labels, accuracy and error rate
    print(
        f"\nResults for k = {k} (Chebyshev metric):",
        f"Predicted Labels: {y_pred}",
        f"True Labels     : {y_test.values}",
        f"Accuracy: {accuracy:.2%}",
        f"Error   : {1 - accuracy:.2%}",
        sep="\n",
    )

##Using k-NN classifiers for k = 1, 3, 5 with Jaccard metric

In [None]:
# Evaluate k-NN with the Jaccard metric on the held-out rows, once for each
# neighbourhood size k in 1, 3 and 5.
# NOTE(review): Jaccard distance is defined on boolean vectors, so this cell is
# presumably only meaningful after the Binary BoW cell — confirm before running
# it on count-based or normalised features.
for k in (1, 3, 5):
    # fit() returns the estimator itself, so construction and training are chained.
    knn_classifier = KNeighborsClassifier(
        n_neighbors=k,
        metric='jaccard',
        algorithm='auto',
    ).fit(X_train, y_train)

    # Predicted labels for the test rows and the share of them that are correct
    y_pred = knn_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Report: header, predictions next to the true labels, accuracy and error rate
    print(
        f"\nResults for k = {k} (Jaccard metric):",
        f"Predicted Labels: {y_pred}",
        f"True Labels     : {y_test.values}",
        f"Accuracy: {accuracy:.2%}",
        f"Error   : {1 - accuracy:.2%}",
        sep="\n",
    )