In [1]:
try:
    import pandas as pd
    import numpy as np
    import os,sys
    import re
    # importing algorithms
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.naive_bayes import BernoulliNB
    from sklearn import svm
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix, classification_report
    from sklearn.preprocessing import MinMaxScaler
except Exception as e:
    print("Error is due to",e)
pwd = os.getcwd()
labels_df = pd.read_csv(pwd+"//Datasets//Kabita//Input//kabita_dataset_labels.csv")

In [2]:
# Function of Train-test split, MinMax Scaling
def minmax_scaling(x_data, y_data):
    x_train,x_test,y_train,y_test = train_test_split(x_data,y_data,test_size=0.30,random_state=21,stratify=y_data)
    # MinMax scaling of train data
    minmax_model = MinMaxScaler(feature_range=(0,5))
    np.set_printoptions(precision=3)
    scaled_data_train = minmax_model.fit_transform(x_train)
    # MinMax scaling of test data
    scaled_data_test = minmax_model.fit_transform(x_test)
    return scaled_data_train, scaled_data_test, y_train, y_test

In [3]:
# Function for Modelling and extracting Metrics
def ml_training(ml_model, x_train, x_test, y_train, y_test, model_name):
    ml_model.fit(x_train, y_train)
    ml_pred_val = ml_model.predict(x_test)
    print("Accuracy of "+model_name+" after MinMax Scaling is:", ml_model.score(x_test,y_test))
    print("Confusion Matrix of "+model_name+" is:\n", confusion_matrix(y_test,ml_pred_val))
    print("Classification Report of "+model_name+" is:\n", classification_report(y_test,ml_pred_val))
    print(70*"=")

### Bag of words Models

In [4]:
# TFIDF vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//BagOfWords//tfidf_500_vectors.csv")

x_train,x_test,y_train,y_test = minmax_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_train,x_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after MinMax Scaling is: 0.7306122448979592
Confusion Matrix of Logistic Regression is:
 [[160   1   1   4  29  12   3]
 [  0 158  11  10  12  19   0]
 [  1   2 181  13   0  13   0]
 [  1  10  16 151  11  17   4]
 [ 25  13   9  17 139   2   5]
 [  5   9   5  30   4 119  38]
 [  1   1   0   4   0  38 166]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.83      0.76      0.79       210
           2       0.81      0.75      0.78       210
           3       0.81      0.86      0.84       210
           4       0.66      0.72      0.69       210
           5       0.71      0.66      0.69       210
           6       0.54      0.57      0.55       210
           7       0.77      0.79      0.78       210

    accuracy                           0.73      1470
   macro avg       0.73      0.73      0.73      1470
weighted avg       0.73      0.73      0.73      1470

KNN with 3 Nei

Confusion Matrix of Bernoulli Naive Bayes is:
 [[163   1  10   2  30   4   0]
 [  1 161  24   7  13   4   0]
 [  0   1 198  10   0   1   0]
 [  7  11  32 143   6   8   3]
 [ 28  15  17  12 134   0   4]
 [  6  12  44  18   5  97  28]
 [  3   5   7   2   3  37 153]]
Classification Report of Bernoulli Naive Bayes is:
               precision    recall  f1-score   support

           1       0.78      0.78      0.78       210
           2       0.78      0.77      0.77       210
           3       0.60      0.94      0.73       210
           4       0.74      0.68      0.71       210
           5       0.70      0.64      0.67       210
           6       0.64      0.46      0.54       210
           7       0.81      0.73      0.77       210

    accuracy                           0.71      1470
   macro avg       0.72      0.71      0.71      1470
weighted avg       0.72      0.71      0.71      1470

Working on SVM Kernal: linear
Accuracy of SVM after MinMax Scaling is: 0.7176870748299

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after MinMax Scaling is: 0.3122448979591837
Confusion Matrix of Decision Tree is:
 [[112   0   0   0  43  55   0]
 [  1   0   2   0   1 206   0]
 [  0   0  95   0   0 115   0]
 [  0   0   1   0   1 208   0]
 [ 78   0   2   0  42  88   0]
 [  0   0   0   0   0 210   0]
 [  2   0   0   0   0 208   0]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.58      0.53      0.56       210
           2       0.00      0.00      0.00       210
           3       0.95      0.45      0.61       210
           4       0.00      0.00      0.00       210
           5       0.48      0.20      0.28       210
           6       0.19      1.00      0.32       210
           7       0.00      0.00      0.00       210

    accuracy                           0.31      1470
   macro avg       0.32      0.31      0.25      1470
weighted avg       0.32      0.31      0.25      1470

Decision Tree with 4 max_depth
A

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after MinMax Scaling is: 0.3979591836734694
Confusion Matrix of Decision Tree is:
 [[109   2   0  53  46   0   0]
 [  0  54   2 152   2   0   0]
 [  0   0  95 115   0   0   0]
 [  1   6   1 202   0   0   0]
 [ 40   4   2  81  80   0   3]
 [  0   3   0 203   0   0   4]
 [  0   0   0 163   2   0  45]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.73      0.52      0.61       210
           2       0.78      0.26      0.39       210
           3       0.95      0.45      0.61       210
           4       0.21      0.96      0.34       210
           5       0.62      0.38      0.47       210
           6       0.00      0.00      0.00       210
           7       0.87      0.21      0.34       210

    accuracy                           0.40      1470
   macro avg       0.59      0.40      0.39      1470
weighted avg       0.59      0.40      0.39      1470

Decision Tree with 6 max_depth
A

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after MinMax Scaling is: 0.45374149659863944
Confusion Matrix of Decision Tree is:
 [[155   2   0   0   0  53   0]
 [  0 122   2   0   3  83   0]
 [  0   0  95   0   0 115   0]
 [  1   6   1   1   0 201   0]
 [ 74  20   2   0  47  65   2]
 [  0   3   0   0   0 203   4]
 [  2   1   0   0   0 163  44]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.67      0.74      0.70       210
           2       0.79      0.58      0.67       210
           3       0.95      0.45      0.61       210
           4       1.00      0.00      0.01       210
           5       0.94      0.22      0.36       210
           6       0.23      0.97      0.37       210
           7       0.88      0.21      0.34       210

    accuracy                           0.45      1470
   macro avg       0.78      0.45      0.44      1470
weighted avg       0.78      0.45      0.44      1470

Decision Tree with 8 max_depth


Accuracy of Decision Tree after MinMax Scaling is: 0.5653061224489796
Confusion Matrix of Decision Tree is:
 [[140   1   0  42  27   0   0]
 [  0 136   6  62   6   0   0]
 [  0  13 144  49   4   0   0]
 [  1  10   5 184   6   0   4]
 [ 33  13   6  43 114   1   0]
 [  0   3   1 179   2   2  23]
 [  0   1   0  93   4   1 111]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.80      0.67      0.73       210
           2       0.77      0.65      0.70       210
           3       0.89      0.69      0.77       210
           4       0.28      0.88      0.43       210
           5       0.70      0.54      0.61       210
           6       0.50      0.01      0.02       210
           7       0.80      0.53      0.64       210

    accuracy                           0.57      1470
   macro avg       0.68      0.57      0.56      1470
weighted avg       0.68      0.57      0.56      1470

Decision Tree with 16 max_depth


Accuracy of Random Forest after MinMax Scaling is: 0.6353741496598639
Confusion Matrix of Random Forest is:
 [[155   5   2  18   6  21   3]
 [  1 126   6  11  23  43   0]
 [  0   0 152  27   0  30   1]
 [  1   7   9 141   7  40   5]
 [ 60  19   6  25  71  13  16]
 [  0   2   5  21   4 139  39]
 [  0   0   1  14   0  45 150]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.71      0.74      0.73       210
           2       0.79      0.60      0.68       210
           3       0.84      0.72      0.78       210
           4       0.55      0.67      0.60       210
           5       0.64      0.34      0.44       210
           6       0.42      0.66      0.51       210
           7       0.70      0.71      0.71       210

    accuracy                           0.64      1470
   macro avg       0.67      0.64      0.64      1470
weighted avg       0.67      0.64      0.64      1470

Random Forest with 3 max_depth
A

Accuracy of Random Forest after MinMax Scaling is: 0.6789115646258503
Confusion Matrix of Random Forest is:
 [[157   2   0   9  12  30   0]
 [  0 142  11  10  10  37   0]
 [  0   1 166  14   0  29   0]
 [  0   6  10 134   5  51   4]
 [ 48  15   7  19  97  15   9]
 [  0   3   4  15   2 149  37]
 [  0   0   1   5   0  51 153]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.77      0.75      0.76       210
           2       0.84      0.68      0.75       210
           3       0.83      0.79      0.81       210
           4       0.65      0.64      0.64       210
           5       0.77      0.46      0.58       210
           6       0.41      0.71      0.52       210
           7       0.75      0.73      0.74       210

    accuracy                           0.68      1470
   macro avg       0.72      0.68      0.69      1470
weighted avg       0.72      0.68      0.69      1470

Random Forest with 11 max_depth


Accuracy of Random Forest after MinMax Scaling is: 0.6938775510204082
Confusion Matrix of Random Forest is:
 [[153   2   0   9  16  30   0]
 [  0 152   8  10  10  30   0]
 [  0   3 170  10   0  27   0]
 [  0   9  10 142   4  41   4]
 [ 40  14   7  20 104  15  10]
 [  0   3   3  20   3 144  37]
 [  1   0   0   5   1  48 155]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.79      0.73      0.76       210
           2       0.83      0.72      0.77       210
           3       0.86      0.81      0.83       210
           4       0.66      0.68      0.67       210
           5       0.75      0.50      0.60       210
           6       0.43      0.69      0.53       210
           7       0.75      0.74      0.75       210

    accuracy                           0.69      1470
   macro avg       0.72      0.69      0.70      1470
weighted avg       0.72      0.69      0.70      1470

Random Forest with 19 max_depth


In [5]:
# Count Vectorizer vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//BagOfWords//cv_500_vectors.csv")

x_train,x_test,y_train,y_test = minmax_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_train,x_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after MinMax Scaling is: 0.6931972789115646
Confusion Matrix of Logistic Regression is:
 [[155   2   1   8  26  16   2]
 [  1 117  41  11  24  16   0]
 [  1   1 182   9   3  14   0]
 [  1   9  19 139  20  18   4]
 [ 23  11  10   7 150   6   3]
 [  4  12   5  22   3 119  45]
 [  1   3   0   5   2  42 157]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.83      0.74      0.78       210
           2       0.75      0.56      0.64       210
           3       0.71      0.87      0.78       210
           4       0.69      0.66      0.68       210
           5       0.66      0.71      0.68       210
           6       0.52      0.57      0.54       210
           7       0.74      0.75      0.75       210

    accuracy                           0.69      1470
   macro avg       0.70      0.69      0.69      1470
weighted avg       0.70      0.69      0.69      1470

KNN with 3 Nei

Accuracy of SVM after MinMax Scaling is: 0.7170068027210884
Confusion Matrix of SVM is:
 [[157   2   2   7  22  18   2]
 [  2 155  18   8  11  16   0]
 [  2   1 185   6   1  15   0]
 [  5  13  22 131  15  20   4]
 [ 30  10  10   7 146   4   3]
 [  4  11   5  15   6 132  37]
 [  1   5   0  10   4  42 148]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.78      0.75      0.76       210
           2       0.79      0.74      0.76       210
           3       0.76      0.88      0.82       210
           4       0.71      0.62      0.66       210
           5       0.71      0.70      0.70       210
           6       0.53      0.63      0.58       210
           7       0.76      0.70      0.73       210

    accuracy                           0.72      1470
   macro avg       0.72      0.72      0.72      1470
weighted avg       0.72      0.72      0.72      1470

Working on SVM Kernal: poly
Accuracy of SVM after MinMax Scali

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after MinMax Scaling is: 0.35034013605442177
Confusion Matrix of Decision Tree is:
 [[155   0   1   0   0  54   0]
 [  0   0  52   0   2 156   0]
 [  0   0 132   0   0  78   0]
 [  1   0  17   0   0 192   0]
 [100   0  26   0  20  64   0]
 [  0   0   2   0   0 208   0]
 [  2   0   0   0   0 208   0]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.60      0.74      0.66       210
           2       0.00      0.00      0.00       210
           3       0.57      0.63      0.60       210
           4       0.00      0.00      0.00       210
           5       0.91      0.10      0.17       210
           6       0.22      0.99      0.36       210
           7       0.00      0.00      0.00       210

    accuracy                           0.35      1470
   macro avg       0.33      0.35      0.26      1470
weighted avg       0.33      0.35      0.26      1470

Decision Tree with 4 max_depth


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after MinMax Scaling is: 0.4312925170068027
Confusion Matrix of Decision Tree is:
 [[137   0   1   0  18  54   0]
 [  0  70  17   0   2 121   0]
 [  0   0 132   0   0  78   0]
 [  0   5  13   0   1 191   0]
 [ 75  13  22   0  46  54   0]
 [  0   1   1   0   0 204   4]
 [  2   1   0   0   0 162  45]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.64      0.65      0.65       210
           2       0.78      0.33      0.47       210
           3       0.71      0.63      0.67       210
           4       0.00      0.00      0.00       210
           5       0.69      0.22      0.33       210
           6       0.24      0.97      0.38       210
           7       0.92      0.21      0.35       210

    accuracy                           0.43      1470
   macro avg       0.57      0.43      0.41      1470
weighted avg       0.57      0.43      0.41      1470

Decision Tree with 6 max_depth
A

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after MinMax Scaling is: 0.4816326530612245
Confusion Matrix of Decision Tree is:
 [[155   0   1   0   0  54   0]
 [  0 106  17   3   4  80   0]
 [  0   0 132   0   0  78   0]
 [  1   4   8   6   1 188   2]
 [ 75  18  22   0  50  45   0]
 [  0   2   1   1   0 188  18]
 [  2   1   0   0   0 136  71]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.67      0.74      0.70       210
           2       0.81      0.50      0.62       210
           3       0.73      0.63      0.68       210
           4       0.60      0.03      0.05       210
           5       0.91      0.24      0.38       210
           6       0.24      0.90      0.38       210
           7       0.78      0.34      0.47       210

    accuracy                           0.48      1470
   macro avg       0.68      0.48      0.47      1470
weighted avg       0.68      0.48      0.47      1470

Decision Tree with 8 max_depth
A

Accuracy of Decision Tree after MinMax Scaling is: 0.5666666666666667
Confusion Matrix of Decision Tree is:
 [[159   0   1   0   6  44   0]
 [  0 115  15   4   7  69   0]
 [  0   0 150   0   0  60   0]
 [  0   5  18  37   2 144   4]
 [ 55  19  16   3  81  35   1]
 [  0   2   3   1   0 178  26]
 [  2   1   1   2   3  88 113]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.74      0.76      0.75       210
           2       0.81      0.55      0.65       210
           3       0.74      0.71      0.72       210
           4       0.79      0.18      0.29       210
           5       0.82      0.39      0.52       210
           6       0.29      0.85      0.43       210
           7       0.78      0.54      0.64       210

    accuracy                           0.57      1470
   macro avg       0.71      0.57      0.57      1470
weighted avg       0.71      0.57      0.57      1470

Decision Tree with 16 max_depth


Accuracy of Random Forest after MinMax Scaling is: 0.5795918367346938
Confusion Matrix of Random Forest is:
 [[147   5   2  11  12  27   6]
 [  1  97  50  12  17  33   0]
 [  0   0 156   7   0  47   0]
 [  0  14  24 103   3  62   4]
 [ 56  23  28  19  57  13  14]
 [  0   8   5  16   2 138  41]
 [  0   2   0   4   0  50 154]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.72      0.70      0.71       210
           2       0.65      0.46      0.54       210
           3       0.59      0.74      0.66       210
           4       0.60      0.49      0.54       210
           5       0.63      0.27      0.38       210
           6       0.37      0.66      0.48       210
           7       0.70      0.73      0.72       210

    accuracy                           0.58      1470
   macro avg       0.61      0.58      0.57      1470
weighted avg       0.61      0.58      0.57      1470

Random Forest with 3 max_depth
A

Accuracy of Random Forest after MinMax Scaling is: 0.645578231292517
Confusion Matrix of Random Forest is:
 [[158   5   1   9   7  29   1]
 [  0 141  22   9   5  33   0]
 [  0   0 158   7   0  45   0]
 [  0  15  20 107   5  60   3]
 [ 53  21  19  15  85   7  10]
 [  0  10   3  12   1 141  43]
 [  1   2   0   5   0  43 159]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.75      0.75      0.75       210
           2       0.73      0.67      0.70       210
           3       0.71      0.75      0.73       210
           4       0.65      0.51      0.57       210
           5       0.83      0.40      0.54       210
           6       0.39      0.67      0.50       210
           7       0.74      0.76      0.75       210

    accuracy                           0.65      1470
   macro avg       0.68      0.65      0.65      1470
weighted avg       0.68      0.65      0.65      1470

Random Forest with 11 max_depth
A

Accuracy of Random Forest after MinMax Scaling is: 0.6537414965986394
Confusion Matrix of Random Forest is:
 [[158   5   2   9   7  29   0]
 [  0 144  18   8   8  32   0]
 [  0   0 165   8   0  37   0]
 [  0  13  19 118   4  53   3]
 [ 53  23  19  20  78   9   8]
 [  0   9   4  12   2 144  39]
 [  2   2   0   6   0  46 154]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.74      0.75      0.75       210
           2       0.73      0.69      0.71       210
           3       0.73      0.79      0.76       210
           4       0.65      0.56      0.60       210
           5       0.79      0.37      0.50       210
           6       0.41      0.69      0.51       210
           7       0.75      0.73      0.74       210

    accuracy                           0.65      1470
   macro avg       0.69      0.65      0.65      1470
weighted avg       0.69      0.65      0.65      1470

Random Forest with 19 max_depth


In [6]:
# Term Frequency vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//BagOfWords//tf_500_vectors.csv")

x_train,x_test,y_train,y_test = minmax_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_train,x_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after MinMax Scaling is: 0.7421768707482993
Confusion Matrix of Logistic Regression is:
 [[159   1   1   4  35   9   1]
 [  0 161  10  11   9  18   1]
 [  1   1 183  12   0  13   0]
 [  1   9  17 151  12  16   4]
 [ 21  11   9  15 148   0   6]
 [  5  10   5  24   6 122  38]
 [  2   1   0   5   2  33 167]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.84      0.76      0.80       210
           2       0.83      0.77      0.80       210
           3       0.81      0.87      0.84       210
           4       0.68      0.72      0.70       210
           5       0.70      0.70      0.70       210
           6       0.58      0.58      0.58       210
           7       0.77      0.80      0.78       210

    accuracy                           0.74      1470
   macro avg       0.74      0.74      0.74      1470
weighted avg       0.74      0.74      0.74      1470

KNN with 3 Nei

Accuracy of SVM after MinMax Scaling is: 0.726530612244898
Confusion Matrix of SVM is:
 [[158   3   0   8  29   9   3]
 [  1 167  10   9   6  17   0]
 [  1   2 185   5   1  16   0]
 [  6  16  18 140  10  18   2]
 [ 27  13  12  16 135   3   4]
 [  6  12   4  20   6 124  38]
 [  4   3   0   7   2  35 159]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.78      0.75      0.77       210
           2       0.77      0.80      0.78       210
           3       0.81      0.88      0.84       210
           4       0.68      0.67      0.67       210
           5       0.71      0.64      0.68       210
           6       0.56      0.59      0.57       210
           7       0.77      0.76      0.76       210

    accuracy                           0.73      1470
   macro avg       0.73      0.73      0.73      1470
weighted avg       0.73      0.73      0.73      1470

Working on SVM Kernal: poly
Accuracy of SVM after MinMax Scalin

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after MinMax Scaling is: 0.3047619047619048
Confusion Matrix of Decision Tree is:
 [[100   0   0   0  55  55   0]
 [  1   0   2   0   1 206   0]
 [  0   0  86   0   0 124   0]
 [  0   0   1   0   1 208   0]
 [ 68   0   1   0  52  89   0]
 [  0   0   0   0   0 210   0]
 [  2   0   0   0   0 208   0]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.58      0.48      0.52       210
           2       0.00      0.00      0.00       210
           3       0.96      0.41      0.57       210
           4       0.00      0.00      0.00       210
           5       0.48      0.25      0.33       210
           6       0.19      1.00      0.32       210
           7       0.00      0.00      0.00       210

    accuracy                           0.30      1470
   macro avg       0.32      0.30      0.25      1470
weighted avg       0.32      0.30      0.25      1470

Decision Tree with 4 max_depth
A

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after MinMax Scaling is: 0.4061224489795918
Confusion Matrix of Decision Tree is:
 [[133   5   0   0  22  50   0]
 [  0  58   2   0   2 148   0]
 [  0   0  86   0   0 124   0]
 [  1   6   1   0   0 202   0]
 [ 49   5   1   0  72  81   2]
 [  0   3   0   0   0 203   4]
 [  0   0   0   0   2 163  45]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.73      0.63      0.68       210
           2       0.75      0.28      0.40       210
           3       0.96      0.41      0.57       210
           4       0.00      0.00      0.00       210
           5       0.73      0.34      0.47       210
           6       0.21      0.97      0.34       210
           7       0.88      0.21      0.34       210

    accuracy                           0.41      1470
   macro avg       0.61      0.41      0.40      1470
weighted avg       0.61      0.41      0.40      1470

Decision Tree with 6 max_depth
A

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after MinMax Scaling is: 0.4557823129251701
Confusion Matrix of Decision Tree is:
 [[119   5   0   0  36  50   0]
 [  0 131   2   0   2  75   0]
 [  0   0  86   0   0 124   0]
 [  0   6   1   0   1 202   0]
 [ 34  22   1   0  87  64   2]
 [  0   3   0   0   0 203   4]
 [  0   1   0   0   2 163  44]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.78      0.57      0.66       210
           2       0.78      0.62      0.69       210
           3       0.96      0.41      0.57       210
           4       0.00      0.00      0.00       210
           5       0.68      0.41      0.51       210
           6       0.23      0.97      0.37       210
           7       0.88      0.21      0.34       210

    accuracy                           0.46      1470
   macro avg       0.61      0.46      0.45      1470
weighted avg       0.61      0.46      0.45      1470

Decision Tree with 8 max_depth
A

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after MinMax Scaling is: 0.49183673469387756
Confusion Matrix of Decision Tree is:
 [[119   6   0   0  36  49   0]
 [  0 147   3   1   3  56   0]
 [  0  30 102   0   0  78   0]
 [  0  22   1   0   1 184   2]
 [ 27  34   4   0  97  45   3]
 [  0   4   1   0   0 188  17]
 [  0   1   0   0   2 137  70]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.82      0.57      0.67       210
           2       0.60      0.70      0.65       210
           3       0.92      0.49      0.64       210
           4       0.00      0.00      0.00       210
           5       0.70      0.46      0.56       210
           6       0.26      0.90      0.40       210
           7       0.76      0.33      0.46       210

    accuracy                           0.49      1470
   macro avg       0.58      0.49      0.48      1470
weighted avg       0.58      0.49      0.48      1470

Decision Tree with 10 max_depth

Accuracy of Decision Tree after MinMax Scaling is: 0.5605442176870749
Confusion Matrix of Decision Tree is:
 [[146   5   0  40  19   0   0]
 [  0 137   7  53  13   0   0]
 [  0   8 152  45   5   0   0]
 [  0  13   9 175   9   0   4]
 [ 40  17   6  40 102   1   4]
 [  0   5   2 173   1   1  28]
 [  1   1   0  86   7   4 111]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.78      0.70      0.74       210
           2       0.74      0.65      0.69       210
           3       0.86      0.72      0.79       210
           4       0.29      0.83      0.43       210
           5       0.65      0.49      0.56       210
           6       0.17      0.00      0.01       210
           7       0.76      0.53      0.62       210

    accuracy                           0.56      1470
   macro avg       0.61      0.56      0.55      1470
weighted avg       0.61      0.56      0.55      1470

Decision Tree with 18 max_depth


Accuracy of Random Forest after MinMax Scaling is: 0.638095238095238
Confusion Matrix of Random Forest is:
 [[151   5   1   9  13  28   3]
 [  0 133  16  12  12  36   1]
 [  0   0 160  11   0  39   0]
 [  1  10  11 121  10  53   4]
 [ 55  20  10  22  79  11  13]
 [  0   6   6  18   3 134  43]
 [  0   0   1   8   1  40 160]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.73      0.72      0.72       210
           2       0.76      0.63      0.69       210
           3       0.78      0.76      0.77       210
           4       0.60      0.58      0.59       210
           5       0.67      0.38      0.48       210
           6       0.39      0.64      0.49       210
           7       0.71      0.76      0.74       210

    accuracy                           0.64      1470
   macro avg       0.66      0.64      0.64      1470
weighted avg       0.66      0.64      0.64      1470

Random Forest with 5 max_depth
Ac

Accuracy of Random Forest after MinMax Scaling is: 0.6775510204081633
Confusion Matrix of Random Forest is:
 [[151   5   1   9  16  28   0]
 [  0 143  11   9  15  32   0]
 [  0   1 172  10   0  27   0]
 [  0  10   8 135   8  46   3]
 [ 44  18   9  21  98  11   9]
 [  0   6   2  16   4 143  39]
 [  1   0   0   5   2  48 154]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.77      0.72      0.74       210
           2       0.78      0.68      0.73       210
           3       0.85      0.82      0.83       210
           4       0.66      0.64      0.65       210
           5       0.69      0.47      0.56       210
           6       0.43      0.68      0.52       210
           7       0.75      0.73      0.74       210

    accuracy                           0.68      1470
   macro avg       0.70      0.68      0.68      1470
weighted avg       0.70      0.68      0.68      1470

Random Forest with 13 max_depth


Accuracy of Random Forest after MinMax Scaling is: 0.6952380952380952
Confusion Matrix of Random Forest is:
 [[144   5   1   7  24  29   0]
 [  0 151  12   9   7  31   0]
 [  0   2 178   6   0  24   0]
 [  0   8  10 138   8  43   3]
 [ 35  19   8  16 113  11   8]
 [  0   5   4  16   5 142  38]
 [  1   0   0   5   2  46 156]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.80      0.69      0.74       210
           2       0.79      0.72      0.76       210
           3       0.84      0.85      0.84       210
           4       0.70      0.66      0.68       210
           5       0.71      0.54      0.61       210
           6       0.44      0.68      0.53       210
           7       0.76      0.74      0.75       210

    accuracy                           0.70      1470
   macro avg       0.72      0.70      0.70      1470
weighted avg       0.72      0.70      0.70      1470

Accuracy of Random Forest after 

### Sentence Transformer Models

In [7]:
# BERT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//bert_vectorized_kabita_dataset.csv")

x_train,x_test,y_train,y_test = minmax_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=5000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_train,x_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after MinMax Scaling is: 0.6938775510204082
Confusion Matrix of Logistic Regression is:
 [[183   2   2   8   2  13   0]
 [  3 186   7   7   0   7   0]
 [  2   6 195   2   0   5   0]
 [  4  15   1 168   0  18   4]
 [ 95  43   8  25  29   8   2]
 [  5  12   3  18   0 153  19]
 [ 10   1   2   7   1  83 106]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.61      0.87      0.71       210
           2       0.70      0.89      0.78       210
           3       0.89      0.93      0.91       210
           4       0.71      0.80      0.76       210
           5       0.91      0.14      0.24       210
           6       0.53      0.73      0.62       210
           7       0.81      0.50      0.62       210

    accuracy                           0.69      1470
   macro avg       0.74      0.69      0.66      1470
weighted avg       0.74      0.69      0.66      1470

KNN with 3 Nei

KeyboardInterrupt: 

In [None]:
# GKB BERT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//bert_vectorized_kabita_dataset_gkb.csv")

x_train,x_test,y_train,y_test = minmax_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=5000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_train,x_test,y_train,y_test,"Multinomial Naive Bayes")

In [None]:
# N Distill BERT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//bert_vectorized_kabita_dataset_ndisbert.csv")

x_train,x_test,y_train,y_test = minmax_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=5000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_train,x_test,y_train,y_test,"Multinomial Naive Bayes")

In [None]:
# V BERT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//bert_vectorized_kabita_dataset_vbert.csv")

x_train,x_test,y_train,y_test = minmax_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=2000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_train,x_test,y_train,y_test,"Multinomial Naive Bayes")

In [None]:
# GPT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//gpt_vectorized_kabita_dataset.csv")

x_train,x_test,y_train,y_test = minmax_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=5000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_train,x_test,y_train,y_test,"Multinomial Naive Bayes")

In [None]:
# XLM vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//xlm_vectorized_kabita_dataset.csv")

x_train,x_test,y_train,y_test = minmax_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=5000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_train,x_test,y_train,y_test,"Multinomial Naive Bayes")

### Fine Tuned Transformers Models

In [None]:
# BERT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//bert_base_finetuned_vectorized_kabita_dataset.csv")

x_train,x_test,y_train,y_test = minmax_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=5000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_train,x_test,y_train,y_test,"Multinomial Naive Bayes")

In [None]:
# Hinglish BERT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//vbert_hinglish_finetuned_vectorized_kabita_dataset.csv")

x_train,x_test,y_train,y_test = minmax_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=5000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_train,x_test,y_train,y_test,"Multinomial Naive Bayes")

In [None]:
# GPT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//gpt_base_finetuned_vectorized_kabita_dataset.csv")

x_train,x_test,y_train,y_test = minmax_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=2000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_train,x_test,y_train,y_test,"Multinomial Naive Bayes")

In [None]:
# Hinglish GPT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//gpt_hinglish_finetuned_vectorized_kabita_dataset.csv")

x_train,x_test,y_train,y_test = minmax_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=2000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_train,x_test,y_train,y_test,"Multinomial Naive Bayes")

In [None]:
# XLM vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//xlm_base_finetuned_vectorized_kabita_dataset.csv")

x_train,x_test,y_train,y_test = minmax_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=5000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_train,x_test,y_train,y_test,"Multinomial Naive Bayes")