In [132]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import re
import numpy as np
from collections import Counter
from math import log
from sklearn.model_selection import GridSearchCV
from gensim.models import Word2Vec
from hazm import Normalizer, word_tokenize, stopwords_list, Stemmer, Lemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [133]:
# Load the data
train = pd.read_csv("train.csv")
val = pd.read_csv("val.csv")
test = pd.read_csv("test.csv")
# Create a normalizer object
normalizer = Normalizer()
def remove_u200c(text):
    return text.replace('\u200c', '')

def preprocess_text(text):
    # Normalize the text
    text = normalizer.normalize(text)
    text = normalizer.remove_specials_chars(text)

    # Tokenize the text
    words = word_tokenize(text)
    words = [remove_u200c(word) for word in words]
    return words

In [134]:
train['content'] = train['content'].apply(preprocess_text)
val['content'] = val['content'].apply(preprocess_text)
test['content'] = test['content'].apply(preprocess_text)

In [135]:
# Create a stemmer object
stemmer = Stemmer()

# Create a lemmatizer object
lemmatizer = Lemmatizer()

# Get the list of Persian stopwords
stopwords = stopwords_list()

def preprocess_text(words):
    # Remove stopwords, apply stemming and lemmatization
    words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words if word not in stopwords]
    return words

train['content'] = train['content'].apply(preprocess_text)
test['content'] = test['content'].apply(preprocess_text)

In [136]:
 # Train Word2Vec model
model = Word2Vec(train['content'], min_count=1, sg=1, vector_size=200)

In [6]:
train_avg_vectors = train['content'].apply(lambda words: np.mean([model.wv[word] for word in words], axis=0))

In [7]:
def check_existence(word, model, vector_size):
    try:
        return model.wv[word]
    except:
        return np.zeros(vector_size)

In [8]:
test_avg_vectors = test['content'].apply(lambda words: np.mean([check_existence(word, model, 200) for word in words], axis=0))
val_avg_vectors = val['content'].apply(lambda words: np.mean([check_existence(word, model, 200) for word in words], axis=0))

In [9]:
# Define the parameter grid for the KNN classifier
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}

# Create a GridSearchCV object
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5,n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(list(train_avg_vectors), train['label'])
print("Best parameters: ", grid_search.best_params_)

# Use the best estimator to make predictions on the test data
best_knn = grid_search.best_estimator_
predictions = best_knn.predict(list(test_avg_vectors))

# Print the classification report for the test data predictions
print(classification_report(test['label'], predictions))

Best parameters:  {'n_neighbors': 11}
              precision    recall  f1-score   support

           0       0.82      0.72      0.77       217
           1       0.84      0.76      0.80       156
           2       0.91      0.87      0.89       197
           3       0.74      0.84      0.79       227
           4       0.88      0.91      0.89       244
           5       0.90      0.91      0.91       256
           6       0.99      0.94      0.97       138
           7       0.81      0.86      0.83       209

    accuracy                           0.85      1644
   macro avg       0.86      0.85      0.86      1644
weighted avg       0.86      0.85      0.85      1644



In [16]:


# Calculate term frequency
def term_frequency(doc):
    counts = Counter(doc)
    return {word: count/len(doc) for word, count in counts.items()}

# Calculate inverse document frequency
def inverse_document_frequency(docs):
    idf = {}
    all_words = set(word for doc in docs for word in doc)
    for word in all_words:
        contains_word = map(lambda doc: word in doc, docs)
        idf[word] = log(len(docs)/(1 + sum(contains_word)))
    return idf

# Calculate TF-IDF
def tf_idf(docs):
    word2weight = {}
    idf = inverse_document_frequency(docs)
    for doc in docs:
        tf = term_frequency(doc)
        for word, freq in tf.items():
            word2weight[word] = freq * idf[word]
    return word2weight

# Apply TF-IDF to the content
word2weight = tf_idf(train['content'])

KeyboardInterrupt: 

## MY IMPLEMENTATION of tf idf need too much time

In [137]:
def compute_tfidf(dataframe):
    dataframe_copy = dataframe.copy()
    dataframe_copy['content'] = dataframe_copy['content'].astype(str)

    # Initialize the TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(dataframe_copy['content'])
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
    return tfidf_df


tfidf_dataframe_train = compute_tfidf(train)
tfidf_dataframe_test = compute_tfidf(test)

In [141]:
type(train['content'][1])

list

In [142]:
def weighted_average_vector(words, tfidf_df, model, vector_size, content_index):
    weights = [tfidf_df.at[content_index, word] if word in tfidf_df.columns else 0 for word in words]
    vectors = [check_existence(word[2:-1], model, vector_size) * weight for word, weight in zip(words, weights)]
    weighted_vector = np.sum(vectors, axis=0) / sum(weights) if sum(weights) > 0 else np.zeros(vector_size)
    return weighted_vector

averaged_tfidf_train_vector = pd.DataFrame()
averaged_tfidf_train_vector['weighted_vectors'] = train.apply(lambda row: weighted_average_vector(row['content'], tfidf_dataframe_train, model, 200, row.name), axis=1)

In [144]:
averaged_tfidf_test_vector = pd.DataFrame()
averaged_tfidf_test_vector['weighted_vectors'] = test.apply(lambda row: weighted_average_vector(row['content'], tfidf_dataframe_test, model, 200, row.name), axis=1)

In [148]:
averaged_tfidf_train_vector = pd.DataFrame(averaged_tfidf_train_vector['weighted_vectors'].tolist())
averaged_tfidf_test_vector = pd.DataFrame(averaged_tfidf_test_vector['weighted_vectors'].tolist())

In [150]:
averaged_tfidf_train_vector

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.035862,-0.112808,0.114000,0.088515,-0.078202,0.024686,-0.053142,0.164881,0.021225,-0.023193,...,0.093822,0.045869,-0.092711,-0.035211,0.082029,0.180364,-0.094961,-0.223556,0.008804,0.057009
1,-0.025810,-0.017736,0.053696,0.014416,-0.054654,-0.021744,-0.024179,0.078300,0.004737,0.025706,...,0.091946,0.016544,-0.050197,-0.074023,0.059599,0.089849,-0.007602,-0.063566,0.019655,0.000181
2,-0.024027,-0.122084,0.055143,0.028036,-0.060900,0.005088,-0.015501,0.177120,0.054914,0.030419,...,0.033569,-0.001953,-0.090035,-0.038973,0.060554,0.159910,-0.020167,-0.234681,-0.011592,0.017877
3,-0.002543,-0.018871,0.011171,0.005699,-0.021981,0.003959,-0.001240,0.056898,0.001188,0.012098,...,0.030257,0.008301,-0.030636,-0.006380,0.023158,0.047930,0.002099,-0.064154,0.008579,-0.009091
4,0.016554,-0.048106,0.045708,0.022023,-0.041868,0.014961,-0.004186,0.099906,0.013650,0.006763,...,0.054721,0.002409,-0.040711,-0.024341,0.050985,0.055208,-0.025623,-0.085176,0.013984,-0.011667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13309,-0.026748,-0.077005,0.038115,0.047939,-0.068217,-0.033169,-0.011428,0.104179,-0.006156,0.005152,...,0.078293,0.018336,-0.080581,-0.036076,0.044301,0.088154,-0.030073,-0.142752,0.026356,0.030831
13310,-0.011747,-0.209270,0.100183,0.047660,-0.096377,0.061074,-0.055249,0.084722,0.023599,-0.000888,...,0.010384,0.032084,-0.085715,0.016863,-0.016578,0.158788,-0.075489,-0.103788,0.009507,-0.005941
13311,-0.004446,-0.001938,0.000160,0.034167,-0.064746,-0.022242,-0.007561,0.201843,0.023876,0.010571,...,0.076916,0.025338,-0.112461,-0.097209,0.141591,0.100885,0.009437,-0.214768,-0.009785,-0.019890
13312,0.030782,0.027556,0.032795,0.012134,-0.067947,0.000592,-0.019363,0.135255,0.034717,0.000056,...,0.099718,0.020959,-0.068113,-0.045146,0.080847,0.137774,-0.011096,-0.130354,-0.003231,-0.043431


In [151]:
param_grids = {
    'KNN': {'n_neighbors': [3, 5, 7, 9, 11, 20]},
    'RandomForest': {'n_estimators': [50, 100, 200], 'max_features': ['sqrt', 'log2']},
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
}

classifiers = {
    'KNN': KNeighborsClassifier(),
    'RandomForest': RandomForestClassifier(),
    'SVM': SVC()
}

best_scores = {}
best_params = {}
best_estimators = {}

for name, classifier in classifiers.items():
    grid_search = GridSearchCV(classifier, param_grids[name], cv=5, n_jobs=-1)
    grid_search.fit(averaged_tfidf_train_vector, train['label'])
    best_scores[name] = grid_search.best_score_
    best_params[name] = grid_search.best_params_
    best_estimators[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: ", grid_search.best_params_)
    print(f"Best cross-validation score for {name}: {grid_search.best_score_:.3f}")

best_classifier_name = max(best_scores, key=best_scores.get)
print(f"Best classifier is {best_classifier_name} with a score of {best_scores[best_classifier_name]:.3f}")
best_classifier = best_estimators[best_classifier_name]
predictions = best_classifier.predict(averaged_tfidf_test_vector)
print(classification_report(test['label'], predictions))

Best parameters for KNN:  {'n_neighbors': 20}
Best cross-validation score for KNN: 0.343
Best parameters for RandomForest:  {'max_features': 'sqrt', 'n_estimators': 200}
Best cross-validation score for RandomForest: 0.401
Best parameters for SVM:  {'C': 10, 'kernel': 'rbf'}
Best cross-validation score for SVM: 0.504
Best classifier is SVM with a score of 0.504
              precision    recall  f1-score   support

           0       0.44      0.46      0.45       217
           1       0.48      0.42      0.45       156
           2       0.54      0.53      0.54       197
           3       0.45      0.50      0.47       227
           4       0.53      0.53      0.53       244
           5       0.55      0.62      0.58       256
           6       0.71      0.59      0.65       138
           7       0.54      0.50      0.52       209

    accuracy                           0.52      1644
   macro avg       0.53      0.52      0.52      1644
weighted avg       0.52      0.52      0.

In [152]:
# P4 implementation of multiclass Accuracy and F1 score
def accuracy(y_true, y_pred):
    correct = np.sum(y_true == y_pred)
    total = len(y_true)
    return correct / total

def f1_score(y_true, y_pred):
    # Calculate precision and recall for each class
    classes = np.unique(y_true)
    f1_scores = []
    for cls in classes:
        tp = np.sum((y_true == cls) & (y_pred == cls))
        fp = np.sum((y_true != cls) & (y_pred == cls))
        fn = np.sum((y_true == cls) & (y_pred != cls))
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1)
    # Calculate the average F1-score
    return np.mean(f1_scores)
print(f"Accuracy of wighted vector with knn {accuracy(test['label'], predictions)}")
print(f"f1 score of wighted vector with knn {f1_score(test['label'], predictions)}")

Accuracy of wighted vector with knn 0.5206812652068127
f1 score of wighted vector with knn 0.5233737501517393
