In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import re
import numpy as np
from collections import Counter
from math import log
from sklearn.model_selection import GridSearchCV
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from hazm import Normalizer, word_tokenize, stopwords_list, Stemmer, Lemmatizer

In [2]:
# Load the data
train = pd.read_csv("train.csv")
val = pd.read_csv("val.csv")
test = pd.read_csv("test.csv")
# Create a normalizer object
normalizer = Normalizer()
def remove_u200c(text):
    return text.replace('\u200c', '')

def preprocess_text(text):
    # Normalize the text
    text = normalizer.normalize(text)
    text = normalizer.remove_specials_chars(text)

    # Tokenize the text
    words = word_tokenize(text)
    words = [remove_u200c(word) for word in words]
    return words

In [3]:
train['content'] = train['content'].apply(preprocess_text)
val['content'] = val['content'].apply(preprocess_text)
test['content'] = test['content'].apply(preprocess_text)

In [4]:
# Create a stemmer object
stemmer = Stemmer()

# Create a lemmatizer object
lemmatizer = Lemmatizer()

# Get the list of Persian stopwords
stopwords = stopwords_list()

def preprocess_text(words):
    # Remove stopwords, apply stemming and lemmatization
    words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words if word not in stopwords]
    return words

train['content'] = train['content'].apply(preprocess_text)
test['content'] = test['content'].apply(preprocess_text)

In [5]:
 # Train Word2Vec model
model = Word2Vec(train['content'], min_count=1, sg=1, vector_size=200)

In [6]:
train_avg_vectors = train['content'].apply(lambda words: np.mean([model.wv[word] for word in words], axis=0))

In [7]:
def check_existence(word, model, vector_size):
    try:
        return model.wv[word]
    except:
        return np.zeros(vector_size)

In [8]:
test_avg_vectors = test['content'].apply(lambda words: np.mean([check_existence(word, model, 200) for word in words], axis=0))
val_avg_vectors = val['content'].apply(lambda words: np.mean([check_existence(word, model, 200) for word in words], axis=0))

In [9]:


# Define the parameter grid for the KNN classifier
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}

# Create a GridSearchCV object
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5,n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(list(train_avg_vectors), train['label'])

# Print the best parameters found by GridSearchCV
print("Best parameters: ", grid_search.best_params_)

# Use the best estimator to make predictions on the test data
best_knn = grid_search.best_estimator_
predictions = best_knn.predict(list(test_avg_vectors))

# Print the classification report for the test data predictions
print(classification_report(test['label'], predictions))

Best parameters:  {'n_neighbors': 11}
              precision    recall  f1-score   support

           0       0.82      0.72      0.77       217
           1       0.84      0.76      0.80       156
           2       0.91      0.87      0.89       197
           3       0.74      0.84      0.79       227
           4       0.88      0.91      0.89       244
           5       0.90      0.91      0.91       256
           6       0.99      0.94      0.97       138
           7       0.81      0.86      0.83       209

    accuracy                           0.85      1644
   macro avg       0.86      0.85      0.86      1644
weighted avg       0.86      0.85      0.85      1644



In [16]:


# Calculate term frequency
def term_frequency(doc):
    counts = Counter(doc)
    return {word: count/len(doc) for word, count in counts.items()}

# Calculate inverse document frequency
def inverse_document_frequency(docs):
    idf = {}
    all_words = set(word for doc in docs for word in doc)
    for word in all_words:
        contains_word = map(lambda doc: word in doc, docs)
        idf[word] = log(len(docs)/(1 + sum(contains_word)))
    return idf

# Calculate TF-IDF
def tf_idf(docs):
    word2weight = {}
    idf = inverse_document_frequency(docs)
    for doc in docs:
        tf = term_frequency(doc)
        for word, freq in tf.items():
            word2weight[word] = freq * idf[word]
    return word2weight

# Apply TF-IDF to the content
word2weight = tf_idf(train['content'])

KeyboardInterrupt: 

## MY IMPLEMENTATION of tf idf need too much time

In [51]:
labels = list(range(0, 8))
train_class_tokens = {}
for i in labels:
    train_label = train[train['label'] == i]
    train_tokens = ''
    for j, content in enumerate(train_label['content']):
        train_tokens += ' '.join(content)
    train_class_tokens[i] = train_tokens

import pandas as pd

# Convert the dictionary to a DataFrame
train_class_content = pd.DataFrame(list(train_class_tokens.items()), columns=['Key', 'Value'])


test_class_tokens = {}
for i in labels:
    test_label = test[test['label'] == i]
    test_tokens = ''
    for j, content in enumerate(test_label['content']):
        test_tokens += ' '.join(content)
    test_class_tokens[i] = test_tokens

import pandas as pd

# Convert the dictionary to a DataFrame
test_class_content = pd.DataFrame(list(test_class_tokens.items()), columns=['Key', 'Value'])


In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')

train_vectors = vectorizer.fit_transform(train_class_content['Value'])

# Transform the content of the test data
test_vectors = vectorizer.transform(test_class_content['Value'])

In [55]:
feature_names = vectorizer.get_feature_names_out()

In [67]:
def weighted_average_vector(words,tfidfmatrix, model, vector_size, content_number):
    weighted_vector = []
    for word in words:
        try:
            weight = tfidfmatrix[content_number, np.where(feature_names == word)].toarray().item()
        except:
            weight = 1
        weighted_vector.append(check_existence(word, model, vector_size)*weight)
    return np.mean(weighted_vector, axis=0)
train_weighted_vectors = []
for i, content in enumerate(train['content']):
    train_weighted_vectors.append(weighted_average_vector(content,train_vectors, model, 200, train['label'][i]))

In [63]:
test_weighted_vectors = []
for i, content in enumerate(test['content']):
    test_weighted_vectors.append(weighted_average_vector(content, test_vectors, model, 200, test['label'][i]))

[array([ 0.17311321, -0.21074666,  0.15216093,  0.25855544,  0.04072796,
         0.02794128,  0.00876193,  0.5096189 ,  0.10478681, -0.07163415,
         0.02585813,  0.01460101,  0.17529735, -0.03987003, -0.27987006,
        -0.33239245, -0.08876501, -0.10198827, -0.09601255,  0.02705641,
        -0.10880709, -0.03821993, -0.0259078 ,  0.06103747,  0.08891745,
        -0.09252746,  0.08767658, -0.04285934, -0.03040332,  0.11496001,
        -0.14304198,  0.04589478,  0.25275224,  0.06991956, -0.16283096,
        -0.03834331,  0.04064745, -0.16521439,  0.0520001 , -0.24579273,
        -0.0571112 ,  0.00807662, -0.01833247,  0.26270252,  0.17364898,
        -0.21147832, -0.06856406, -0.08434906,  0.03338691,  0.17187224,
         0.14407349, -0.18754502,  0.11735018,  0.21128938, -0.07483663,
         0.00832723, -0.09773801, -0.25063473,  0.01623252, -0.0167284 ,
        -0.06777856,  0.08864164,  0.31577352,  0.04271876,  0.00787129,
         0.03864022, -0.0384186 ,  0.27253896, -0.2

In [68]:
# Define the parameter grid for the KNN classifier
param_grid = {'n_neighbors': [3, 5, 7, 9, 11, 20]}

# Create a GridSearchCV object
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(train_weighted_vectors, train['label'])

# Print the best parameters found by GridSearchCV
print("Best parameters: ", grid_search.best_params_)

# Use the best estimator to make predictions on the test data
best_knn = grid_search.best_estimator_
predictions = best_knn.predict(list(test_weighted_vectors))

# Print the classification report for the test data predictions
print(classification_report(test['label'], predictions))

Best parameters:  {'n_neighbors': 11}
              precision    recall  f1-score   support

           0       0.67      0.75      0.71       217
           1       0.83      0.67      0.74       156
           2       0.89      0.81      0.85       197
           3       0.68      0.85      0.76       227
           4       0.85      0.89      0.87       244
           5       0.84      0.81      0.82       256
           6       0.97      0.85      0.91       138
           7       0.88      0.79      0.84       209

    accuracy                           0.81      1644
   macro avg       0.83      0.80      0.81      1644
weighted avg       0.82      0.81      0.81      1644



In [70]:
# P4 implementation of multiclass Accuracy and F1 score
def accuracy(y_true, y_pred):
    correct = np.sum(y_true == y_pred)
    total = len(y_true)
    return correct / total

def f1_score(y_true, y_pred):
    # Calculate precision and recall for each class
    classes = np.unique(y_true)
    f1_scores = []
    for cls in classes:
        tp = np.sum((y_true == cls) & (y_pred == cls))
        fp = np.sum((y_true != cls) & (y_pred == cls))
        fn = np.sum((y_true == cls) & (y_pred != cls))
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1)
    # Calculate the average F1-score
    return np.mean(f1_scores)
print(f"Accuracy of wighted vector with knn {accuracy(test['label'], predictions)}")
print(f"f1 score of wighted vector with knn {f1_score(test['label'], predictions)}")

Accuracy of wighted vector with knn 0.8083941605839416
f1 score of wighted vector with knn 0.8117163323117935
