# Traditional ML classifiers

In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score, cohen_kappa_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb

In [2]:
# load all data
dataset_dir = os.path.join('..', 'Dataset')
tfidf_features_data_path = os.path.join(dataset_dir, 'tfidf_features_50.csv')
tfidf_df = pd.read_csv(tfidf_features_data_path)
tfidf_df = tfidf_df.drop(['class'], axis = 1)

ngram_df_features_data_path = os.path.join(dataset_dir, 'ngram_df_features_50.csv')
ngram_df = pd.read_csv(ngram_df_features_data_path)
ngram_df = ngram_df.drop(['class'], axis = 1)

lda_df_features_data_path = os.path.join(dataset_dir, 'lda_df_features_50.csv')
lda_df = pd.read_csv(lda_df_features_data_path)
# lda_df = lda_df.drop(['class'], axis = 1)

data = pd.concat([tfidf_df, ngram_df, lda_df], axis=1)
data

Unnamed: 0,also,always,anymore,anyone,anything,around,back,bad,better,cant,...,way,work,would,year,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,class
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.367680,0.000000,0.000000,0.000000,...,1,0,0,0,0.008016,0.289972,0.008001,0.008242,0.685770,1
1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.025000,0.898189,0.025003,0.026267,0.025541,0
2,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.484986,0.000000,0.000000,...,0,0,0,1,0.033333,0.034386,0.033334,0.237204,0.661744,0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.050321,0.051255,0.050010,0.796905,0.051509,1
4,0.0,0.000000,0.064171,0.064511,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,3,0.002778,0.269270,0.002778,0.002833,0.722341,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232045,0.0,0.000000,0.000000,0.000000,0.499350,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.028572,0.029708,0.028575,0.603239,0.309907,0
232046,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.040000,0.838003,0.040002,0.040844,0.041150,0
232047,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.066667,0.731766,0.066676,0.067825,0.067066,0
232048,0.0,0.000000,0.000000,0.000000,0.081052,0.186823,0.084033,0.000000,0.084835,0.065762,...,0,0,4,0,0.002667,0.002757,0.002667,0.329361,0.662549,1


In [3]:
texts, labels = data.drop(['class'], axis = 1), data['class']

In [4]:
# Split Data
x_train, x_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42, stratify=labels)

In [5]:
# Initialize evaluation metrics dictionary
evaluation_metrics = {}

# Function to evaluate models
def evaluate_model(model_name, y_true, y_pred):
    report = classification_report(y_true, y_pred, output_dict=True)
    evaluation_metrics[model_name] = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred),
        'kappa': cohen_kappa_score(y_true, y_pred),
        'classification_report': report
    }
    
    print(f"{model_name} - Classification Report:\n", classification_report(y_true, y_pred))
    print(f"Accuracy : {accuracy_score(y_true, y_pred)} \n")
    print(f"Loss : {1 - accuracy_score(y_true, y_pred)} \n")
    print(f"Cohen Kappa Score : {cohen_kappa_score(y_true, y_pred)} \n")


### Traditional ML Models

In [6]:
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)
y_pred = log_reg.predict(x_val)
evaluate_model("Logistic Regression", y_val, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression - Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87     23204
           1       0.89      0.84      0.86     23206

    accuracy                           0.87     46410
   macro avg       0.87      0.87      0.87     46410
weighted avg       0.87      0.87      0.87     46410

Accuracy : 0.8681534152122388 

Loss : 0.13184658478776123 

Cohen Kappa Score : 0.7363074268854618 



In [7]:
# Naive Bayes
nb = MultinomialNB()
nb.fit(x_train, y_train)
y_pred = nb.predict(x_val)
evaluate_model("Naive Bayes", y_val, y_pred)

Naive Bayes - Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.78      0.83     23204
           1       0.80      0.89      0.85     23206

    accuracy                           0.84     46410
   macro avg       0.84      0.84      0.84     46410
weighted avg       0.84      0.84      0.84     46410

Accuracy : 0.8370609782374488 

Loss : 0.16293902176255115 

Cohen Kappa Score : 0.6741203932608677 



In [8]:
# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
y_pred = dt.predict(x_val)
evaluate_model("Decision Tree", y_val, y_pred)

Decision Tree - Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.83      0.82     23204
           1       0.82      0.80      0.81     23206

    accuracy                           0.81     46410
   macro avg       0.81      0.81      0.81     46410
weighted avg       0.81      0.81      0.81     46410

Accuracy : 0.8127774186597716 

Loss : 0.1872225813402284 

Cohen Kappa Score : 0.6255554075340606 



In [9]:
# Random Forest
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
y_pred = rf.predict(x_val)
evaluate_model("Random Forest", y_val, y_pred)

Random Forest - Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.88      0.87     23204
           1       0.87      0.85      0.86     23206

    accuracy                           0.86     46410
   macro avg       0.86      0.86      0.86     46410
weighted avg       0.86      0.86      0.86     46410

Accuracy : 0.8645335057099763 

Loss : 0.13546649429002366 

Cohen Kappa Score : 0.7290672861403743 



In [10]:
# Support Vector Machine
svm = SVC()
svm.fit(x_train, y_train)
y_pred = svm.predict(x_val)
evaluate_model("SVM", y_val, y_pred)

SVM - Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.93      0.87     23204
           1       0.91      0.79      0.85     23206

    accuracy                           0.86     46410
   macro avg       0.86      0.86      0.86     46410
weighted avg       0.86      0.86      0.86     46410

Accuracy : 0.8562809739280327 

Loss : 0.1437190260719673 

Cohen Kappa Score : 0.7125636864423426 



In [16]:
texts, labels = data.drop(['class'], axis = 1), data['class']
texts, labels = texts.values, labels.values

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42, stratify=labels)

# XGBoost
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
evaluate_model("XGBoost", y_test, y_pred)

XGBoost - Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.90      0.87     23204
           1       0.89      0.85      0.87     23206

    accuracy                           0.87     46410
   macro avg       0.87      0.87      0.87     46410
weighted avg       0.87      0.87      0.87     46410

Accuracy : 0.8713639301874596 

Loss : 0.1286360698125404 

Cohen Kappa Score : 0.7427284427896291 



In [17]:
evaluation_metrics

{'Logistic Regression': {'accuracy': 0.8681534152122388,
  'precision': np.float64(0.8885705189430118),
  'recall': np.float64(0.841894337671292),
  'f1_score': np.float64(0.8646029252317837),
  'kappa': np.float64(0.7363074268854618),
  'classification_report': {'0': {'precision': 0.8497727551897801,
    'recall': 0.8944147560765385,
    'f1-score': 0.8715224557498897,
    'support': 23204.0},
   '1': {'precision': 0.8885705189430118,
    'recall': 0.841894337671292,
    'f1-score': 0.8646029252317837,
    'support': 23206.0},
   'accuracy': 0.8681534152122388,
   'macro avg': {'precision': 0.869171637066396,
    'recall': 0.8681545468739152,
    'f1-score': 0.8680626904908367,
    'support': 46410.0},
   'weighted avg': {'precision': 0.8691724730449297,
    'recall': 0.8681534152122388,
    'f1-score': 0.8680625413951566,
    'support': 46410.0}}},
 'Naive Bayes': {'accuracy': 0.8370609782374488,
  'precision': np.float64(0.8033193733519467),
  'recall': np.float64(0.8927001637507541