In [1]:
try:
    import pandas as pd
    import numpy as np
    import os,sys
    import re
    # importing algorithms
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.naive_bayes import BernoulliNB
    from sklearn import svm
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix, classification_report
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import Normalizer
except Exception as e:
    # Report which import failed, then re-raise. Swallowing the error here
    # would only defer the failure to a confusing NameError on the next
    # statement (or in a later cell) that uses the missing name.
    print("Error is due to",e)
    raise
# Every dataset path in this notebook is built relative to the kernel's
# current working directory.
pwd = os.getcwd()
# Label table; later cells read its 'kabita_labels' column as the target.
labels_df = pd.read_csv(pwd+"//Datasets//Kabita//Input//kabita_dataset_labels.csv")

In [2]:
# Function of Train-test split, Normalize Scaling
def normalize_scaling(x_data, y_data, test_size=0.30, random_state=21):
    """Split the data into train/test sets and normalize the feature rows.

    The split is stratified on ``y_data``. Features are then passed through
    scikit-learn's ``Normalizer`` (per-sample normalization with the
    library's default norm).

    Parameters
    ----------
    x_data : array-like or DataFrame
        Feature matrix, one row per sample.
    y_data : array-like or Series
        Target labels; also used as the stratification key.
    test_size : float, default 0.30
        Fraction of samples held out for testing.
    random_state : int, default 21
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(scaled_data_train, scaled_data_test, y_train, y_test)``.

    Notes
    -----
    As a side effect this sets numpy's global print precision to 3.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x_data,
        y_data,
        test_size=test_size,
        random_state=random_state,
        stratify=y_data,
    )
    # Kept from the original implementation: downstream printed arrays rely
    # on this global numpy print setting.
    np.set_printoptions(precision=3)
    # Normalize scaling of train data
    normalize_model = Normalizer()
    scaled_data_train = normalize_model.fit_transform(x_train)
    # Normalize scaling of test data: reuse the transformer fitted on the
    # train split instead of re-fitting on test data. Normalizer is stateless
    # so the numbers are unchanged, but this keeps the function leak-free if
    # the scaler is ever swapped for a stateful one.
    scaled_data_test = normalize_model.transform(x_test)
    return scaled_data_train, scaled_data_test, y_train, y_test

In [3]:
# Function for Modelling and extracting Metrics
def ml_training(ml_model, x_train, x_test, y_train, y_test, model_name,
                scaling_name="Normalize Scaling"):
    """Fit a classifier, predict on the test split and print its metrics.

    Prints the accuracy, the confusion matrix and the classification report,
    followed by a 70-character ``=`` separator line. Nothing is returned.

    Parameters
    ----------
    ml_model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    x_train, x_test : array-like
        Train and test feature matrices.
    y_train, y_test : array-like
        Train and test labels.
    model_name : str
        Human-readable model name used in the printed headings.
    scaling_name : str, default "Normalize Scaling"
        Name of the preprocessing step shown in the accuracy line. The default
        reproduces the original hard-coded wording.
    """
    ml_model.fit(x_train, y_train)
    ml_pred_val = ml_model.predict(x_test)
    # Accuracy is the fraction of exact label matches. Computing it from the
    # predictions we already have avoids the second full predict() pass that
    # ml_model.score(x_test, y_test) would trigger.
    accuracy = float(np.mean(np.asarray(ml_pred_val) == np.asarray(y_test)))
    print("Accuracy of {} after {} is:".format(model_name, scaling_name), accuracy)
    print("Confusion Matrix of "+model_name+" is:\n", confusion_matrix(y_test,ml_pred_val))
    print("Classification Report of "+model_name+" is:\n", classification_report(y_test,ml_pred_val))
    print(70*"=")

### Bag-of-Words Models

In [4]:
# TFIDF vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//BagOfWords//tfidf_500_vectors.csv")

# Train/test split plus Normalizer scaling of both feature sets
x_train,x_test,y_train,y_test = normalize_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model: one run per neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one run per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier: depth-limited trees for max_depth 1..20
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# ...followed by one tree with no depth limit
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# Random Forest: depth-limited forests for max_depth 1..20
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# ...followed by one forest with no depth limit
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# scaling using MinMax scaler
mms_scale=MinMaxScaler(feature_range=(0,10))
m_train=mms_scale.fit_transform(x_train)
# Transform the test split with the ranges learned from the train split.
# Re-fitting on the test data would leak test statistics and put train and
# test on different scales. Values outside the train range are clipped back
# into [0, 10]; MultinomialNB requires non-negative features.
m_test=np.clip(mms_scale.transform(x_test),0,10)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
# NOTE: the accuracy line printed by ml_training says "Normalize Scaling";
# the input here has additionally been MinMax-scaled.
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_train,m_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after Normalize Scaling is: 0.753061224489796
Confusion Matrix of Logistic Regression is:
 [[169   0   0   5  14  21   1]
 [  0 163  12   6   8  21   0]
 [  0   1 181  10   0  18   0]
 [  1   8  14 160   5  19   3]
 [ 23  13  16  21 129   2   6]
 [  1   4   4  24   2 139  36]
 [  0   2   0   4   1  37 166]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.87      0.80      0.84       210
           2       0.85      0.78      0.81       210
           3       0.80      0.86      0.83       210
           4       0.70      0.76      0.73       210
           5       0.81      0.61      0.70       210
           6       0.54      0.66      0.60       210
           7       0.78      0.79      0.79       210

    accuracy                           0.75      1470
   macro avg       0.76      0.75      0.76      1470
weighted avg       0.76      0.75      0.76      1470

KNN with 3 N

Accuracy of Bernoulli Naive Bayes after Normalize Scaling is: 0.7136054421768707
Confusion Matrix of Bernoulli Naive Bayes is:
 [[163   1  10   2  30   4   0]
 [  1 161  24   7  13   4   0]
 [  0   1 198  10   0   1   0]
 [  7  11  32 143   6   8   3]
 [ 28  15  17  12 134   0   4]
 [  6  12  44  18   5  97  28]
 [  3   5   7   2   3  37 153]]
Classification Report of Bernoulli Naive Bayes is:
               precision    recall  f1-score   support

           1       0.78      0.78      0.78       210
           2       0.78      0.77      0.77       210
           3       0.60      0.94      0.73       210
           4       0.74      0.68      0.71       210
           5       0.70      0.64      0.67       210
           6       0.64      0.46      0.54       210
           7       0.81      0.73      0.77       210

    accuracy                           0.71      1470
   macro avg       0.72      0.71      0.71      1470
weighted avg       0.72      0.71      0.71      1470

Worki

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.3979591836734694
Confusion Matrix of Decision Tree is:
 [[109   2   0  53  46   0   0]
 [  0  54   2 152   2   0   0]
 [  0   0  95 115   0   0   0]
 [  1   6   1 202   0   0   0]
 [ 40   4   2  81  80   0   3]
 [  0   3   0 203   0   0   4]
 [  0   0   0 163   2   0  45]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.73      0.52      0.61       210
           2       0.78      0.26      0.39       210
           3       0.95      0.45      0.61       210
           4       0.21      0.96      0.34       210
           5       0.62      0.38      0.47       210
           6       0.00      0.00      0.00       210
           7       0.87      0.21      0.34       210

    accuracy                           0.40      1470
   macro avg       0.59      0.40      0.39      1470
weighted avg       0.59      0.40      0.39      1470

Decision Tree with 6 max_dept

Confusion Matrix of Decision Tree is:
 [[156   2   0  42  10   0   0]
 [  0 134   4  63   9   0   0]
 [  0  17 125  64   4   0   0]
 [  0  10   4 183   9   0   4]
 [ 53  12   5  43  96   1   0]
 [  0   4   1 179   2   1  23]
 [  2   1   0  96   0   1 110]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.74      0.74      0.74       210
           2       0.74      0.64      0.69       210
           3       0.90      0.60      0.72       210
           4       0.27      0.87      0.42       210
           5       0.74      0.46      0.56       210
           6       0.33      0.00      0.01       210
           7       0.80      0.52      0.63       210

    accuracy                           0.55      1470
   macro avg       0.65      0.55      0.54      1470
weighted avg       0.65      0.55      0.54      1470

Decision Tree with 14 max_depth
Accuracy of Decision Tree after Normalize Scaling is: 0.55646258503401

Accuracy of Random Forest after Normalize Scaling is: 0.6061224489795919
Confusion Matrix of Random Forest is:
 [[156   3   0  19   5  19   8]
 [  1 111   7  13  35  43   0]
 [  0   0 151  23   0  35   1]
 [  0  12   7 123  11  50   7]
 [ 62  15   5  28  73  13  14]
 [  0   7   4  24   4 129  42]
 [  0   1   0  16   0  45 148]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.71      0.74      0.73       210
           2       0.74      0.53      0.62       210
           3       0.87      0.72      0.79       210
           4       0.50      0.59      0.54       210
           5       0.57      0.35      0.43       210
           6       0.39      0.61      0.47       210
           7       0.67      0.70      0.69       210

    accuracy                           0.61      1470
   macro avg       0.64      0.61      0.61      1470
weighted avg       0.64      0.61      0.61      1470

Random Forest with 2 max_dept

Accuracy of Random Forest after Normalize Scaling is: 0.6700680272108843
Confusion Matrix of Random Forest is:
 [[157   2   0  17  10  23   1]
 [  0 139   7  13  15  36   0]
 [  0   0 161  13   0  35   1]
 [  0   6   8 142   4  46   4]
 [ 50  16   6  21  89  17  11]
 [  0   4   4  17   2 142  41]
 [  0   0   0   8   1  46 155]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.76      0.75      0.75       210
           2       0.83      0.66      0.74       210
           3       0.87      0.77      0.81       210
           4       0.61      0.68      0.64       210
           5       0.74      0.42      0.54       210
           6       0.41      0.68      0.51       210
           7       0.73      0.74      0.73       210

    accuracy                           0.67      1470
   macro avg       0.71      0.67      0.68      1470
weighted avg       0.71      0.67      0.68      1470

Random Forest with 10 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.691156462585034
Confusion Matrix of Random Forest is:
 [[152   2   0  10  17  28   1]
 [  0 148   7   7  16  32   0]
 [  0   5 161  10   0  33   1]
 [  0   9   8 139   7  43   4]
 [ 31  14   6  21 113  15  10]
 [  0   3   4  14   2 145  42]
 [  0   0   0   5   2  45 158]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.83      0.72      0.77       210
           2       0.82      0.70      0.76       210
           3       0.87      0.77      0.81       210
           4       0.67      0.66      0.67       210
           5       0.72      0.54      0.62       210
           6       0.43      0.69      0.53       210
           7       0.73      0.75      0.74       210

    accuracy                           0.69      1470
   macro avg       0.72      0.69      0.70      1470
weighted avg       0.72      0.69      0.70      1470

Random Forest with 18 max_dept

In [5]:
# Count Vectorizer vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//BagOfWords//cv_500_vectors.csv")

# Train/test split plus Normalizer scaling of both feature sets
x_train,x_test,y_train,y_test = normalize_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model: one run per neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one run per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier: depth-limited trees for max_depth 1..20
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# ...followed by one tree with no depth limit
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# Random Forest: depth-limited forests for max_depth 1..20
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# ...followed by one forest with no depth limit
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# scaling using MinMax scaler
mms_scale=MinMaxScaler(feature_range=(0,10))
m_train=mms_scale.fit_transform(x_train)
# Transform the test split with the ranges learned from the train split.
# Re-fitting on the test data would leak test statistics and put train and
# test on different scales. Values outside the train range are clipped back
# into [0, 10]; MultinomialNB requires non-negative features.
m_test=np.clip(mms_scale.transform(x_test),0,10)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
# NOTE: the accuracy line printed by ml_training says "Normalize Scaling";
# the input here has additionally been MinMax-scaled.
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_train,m_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after Normalize Scaling is: 0.7496598639455783
Confusion Matrix of Logistic Regression is:
 [[165   3   1   9  15  15   2]
 [  0 163  15   6   6  20   0]
 [  0   1 181   9   0  19   0]
 [  1   9  17 162   2  16   3]
 [ 24  18  18  17 125   2   6]
 [  0   7   4  23   2 139  35]
 [  0   3   0   6   1  33 167]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.87      0.79      0.82       210
           2       0.80      0.78      0.79       210
           3       0.77      0.86      0.81       210
           4       0.70      0.77      0.73       210
           5       0.83      0.60      0.69       210
           6       0.57      0.66      0.61       210
           7       0.78      0.80      0.79       210

    accuracy                           0.75      1470
   macro avg       0.76      0.75      0.75      1470
weighted avg       0.76      0.75      0.75      1470

KNN with 3 

Accuracy of SVM after Normalize Scaling is: 0.7619047619047619
Confusion Matrix of SVM is:
 [[174   1   0   4  13  16   2]
 [  0 173  11   7   2  17   0]
 [  0   1 183  10   0  16   0]
 [  0  12  18 150   2  24   4]
 [ 27  17  16  18 127   2   3]
 [  0   4   4  19   1 143  39]
 [  1   3   0   6   0  30 170]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.86      0.83      0.84       210
           2       0.82      0.82      0.82       210
           3       0.79      0.87      0.83       210
           4       0.70      0.71      0.71       210
           5       0.88      0.60      0.72       210
           6       0.58      0.68      0.62       210
           7       0.78      0.81      0.79       210

    accuracy                           0.76      1470
   macro avg       0.77      0.76      0.76      1470
weighted avg       0.77      0.76      0.76      1470

Working on SVM Kernal: poly
Accuracy of SVM after Normalize

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.4061224489795918
Confusion Matrix of Decision Tree is:
 [[133   5   0   0  22  50   0]
 [  0  58   2   0   2 148   0]
 [  0   0  86   0   0 124   0]
 [  1   6   1   0   0 202   0]
 [ 49   5   1   0  72  81   2]
 [  0   3   0   0   0 203   4]
 [  0   0   0   0   2 163  45]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.73      0.63      0.68       210
           2       0.75      0.28      0.40       210
           3       0.96      0.41      0.57       210
           4       0.00      0.00      0.00       210
           5       0.73      0.34      0.47       210
           6       0.21      0.97      0.34       210
           7       0.88      0.21      0.34       210

    accuracy                           0.41      1470
   macro avg       0.61      0.41      0.40      1470
weighted avg       0.61      0.41      0.40      1470

Decision Tree with 6 max_depth
Accuracy of Decision Tree after Normalize Scaling is

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Confusion Matrix of Decision Tree is:
 [[119   5   1   0  36  49   0]
 [  0 130  21   1   2  56   0]
 [  0   0 132   0   0  78   0]
 [  0   6  17   0   1 186   0]
 [ 29  22  19   0  92  46   2]
 [  0   3   2   0   0 201   4]
 [  0   1   0   0   2 163  44]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.80      0.57      0.66       210
           2       0.78      0.62      0.69       210
           3       0.69      0.63      0.66       210
           4       0.00      0.00      0.00       210
           5       0.69      0.44      0.54       210
           6       0.26      0.96      0.41       210
           7       0.88      0.21      0.34       210

    accuracy                           0.49      1470
   macro avg       0.59      0.49      0.47      1470
weighted avg       0.59      0.49      0.47      1470

Decision Tree with 9 max_depth
Accuracy of Decision Tree after Normalize Scaling is: 0.491156462585034

Accuracy of Decision Tree after Normalize Scaling is: 0.5591836734693878
Confusion Matrix of Decision Tree is:
 [[147   5   0  40  18   0   0]
 [  0 137   7  53  13   0   0]
 [  0   8 151  45   6   0   0]
 [  0  13   9 175   9   0   4]
 [ 43  17   5  40 100   1   4]
 [  0   5   1 173   2   1  28]
 [  1   1   0  88   5   4 111]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.77      0.70      0.73       210
           2       0.74      0.65      0.69       210
           3       0.87      0.72      0.79       210
           4       0.29      0.83      0.42       210
           5       0.65      0.48      0.55       210
           6       0.17      0.00      0.01       210
           7       0.76      0.53      0.62       210

    accuracy                           0.56      1470
   macro avg       0.61      0.56      0.55      1470
weighted avg       0.61      0.56      0.55      1470

Decision Tree with 18 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.6414965986394557
Confusion Matrix of Random Forest is:
 [[151   4   0  11  13  27   4]
 [  0 129  17  14  16  33   1]
 [  0   0 163  13   0  34   0]
 [  1  11  10 123   8  53   4]
 [ 51  19   8  24  85  11  12]
 [  0   7   5  19   2 133  44]
 [  0   1   0   8   1  41 159]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.74      0.72      0.73       210
           2       0.75      0.61      0.68       210
           3       0.80      0.78      0.79       210
           4       0.58      0.59      0.58       210
           5       0.68      0.40      0.51       210
           6       0.40      0.63      0.49       210
           7       0.71      0.76      0.73       210

    accuracy                           0.64      1470
   macro avg       0.67      0.64      0.64      1470
weighted avg       0.67      0.64      0.64      1470

Random Forest with 6 max_dept

Accuracy of Random Forest after Normalize Scaling is: 0.680952380952381
Confusion Matrix of Random Forest is:
 [[145   5   0   9  21  29   1]
 [  0 141  15  10  14  30   0]
 [  0   4 174   9   0  23   0]
 [  0   9  12 131  11  43   4]
 [ 34  17   8  17 112  13   9]
 [  0   5   3  16   4 140  42]
 [  0   1   0   6   2  43 158]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.81      0.69      0.75       210
           2       0.77      0.67      0.72       210
           3       0.82      0.83      0.82       210
           4       0.66      0.62      0.64       210
           5       0.68      0.53      0.60       210
           6       0.44      0.67      0.53       210
           7       0.74      0.75      0.75       210

    accuracy                           0.68      1470
   macro avg       0.70      0.68      0.69      1470
weighted avg       0.70      0.68      0.69      1470

Random Forest with 14 max_dept

In [6]:
# Term Frequency vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//BagOfWords//tf_500_vectors.csv")

# Train/test split plus Normalizer scaling of both feature sets
x_train,x_test,y_train,y_test = normalize_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model: one run per neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one run per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier: depth-limited trees for max_depth 1..20
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# ...followed by one tree with no depth limit
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# Random Forest: depth-limited forests for max_depth 1..20
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# ...followed by one forest with no depth limit
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# scaling using MinMax scaler
mms_scale=MinMaxScaler(feature_range=(0,10))
m_train=mms_scale.fit_transform(x_train)
# Transform the test split with the ranges learned from the train split.
# Re-fitting on the test data would leak test statistics and put train and
# test on different scales. Values outside the train range are clipped back
# into [0, 10]; MultinomialNB requires non-negative features.
m_test=np.clip(mms_scale.transform(x_test),0,10)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
# NOTE: the accuracy line printed by ml_training says "Normalize Scaling";
# the input here has additionally been MinMax-scaled.
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_train,m_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after Normalize Scaling is: 0.7496598639455783
Confusion Matrix of Logistic Regression is:
 [[165   3   1   9  15  15   2]
 [  0 163  15   6   6  20   0]
 [  0   1 181   9   0  19   0]
 [  1   9  17 162   2  16   3]
 [ 24  18  18  17 125   2   6]
 [  0   7   4  23   2 139  35]
 [  0   3   0   6   1  33 167]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.87      0.79      0.82       210
           2       0.80      0.78      0.79       210
           3       0.77      0.86      0.81       210
           4       0.70      0.77      0.73       210
           5       0.83      0.60      0.69       210
           6       0.57      0.66      0.61       210
           7       0.78      0.80      0.79       210

    accuracy                           0.75      1470
   macro avg       0.76      0.75      0.75      1470
weighted avg       0.76      0.75      0.75      1470

KNN with 3 

Accuracy of SVM after Normalize Scaling is: 0.7619047619047619
Confusion Matrix of SVM is:
 [[174   1   0   4  13  16   2]
 [  0 173  11   7   2  17   0]
 [  0   1 183  10   0  16   0]
 [  0  12  18 150   2  24   4]
 [ 27  17  16  18 127   2   3]
 [  0   4   4  19   1 143  39]
 [  1   3   0   6   0  30 170]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.86      0.83      0.84       210
           2       0.82      0.82      0.82       210
           3       0.79      0.87      0.83       210
           4       0.70      0.71      0.71       210
           5       0.88      0.60      0.72       210
           6       0.58      0.68      0.62       210
           7       0.78      0.81      0.79       210

    accuracy                           0.76      1470
   macro avg       0.77      0.76      0.76      1470
weighted avg       0.77      0.76      0.76      1470

Working on SVM Kernal: poly
Accuracy of SVM after Normalize

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Confusion Matrix of Decision Tree is:
 [[133   5   0   0  22  50   0]
 [  0  58   2   0   2 148   0]
 [  0   0  86   0   0 124   0]
 [  1   6   1   0   0 202   0]
 [ 49   5   1   0  72  81   2]
 [  0   3   0   0   0 203   4]
 [  0   0   0   0   2 163  45]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.73      0.63      0.68       210
           2       0.75      0.28      0.40       210
           3       0.96      0.41      0.57       210
           4       0.00      0.00      0.00       210
           5       0.73      0.34      0.47       210
           6       0.21      0.97      0.34       210
           7       0.88      0.21      0.34       210

    accuracy                           0.41      1470
   macro avg       0.61      0.41      0.40      1470
weighted avg       0.61      0.41      0.40      1470

Decision Tree with 6 max_depth
Accuracy of Decision Tree after Normalize Scaling is: 0.431972789115646

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.4884353741496599
Confusion Matrix of Decision Tree is:
 [[119   5   1   0  36  49   0]
 [  0 130  21   1   2  56   0]
 [  0   0 132   0   0  78   0]
 [  0   6  17   0   1 186   0]
 [ 29  22  19   0  92  46   2]
 [  0   3   2   0   0 201   4]
 [  0   1   0   0   2 163  44]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.80      0.57      0.66       210
           2       0.78      0.62      0.69       210
           3       0.69      0.63      0.66       210
           4       0.00      0.00      0.00       210
           5       0.69      0.44      0.54       210
           6       0.26      0.96      0.41       210
           7       0.88      0.21      0.34       210

    accuracy                           0.49      1470
   macro avg       0.59      0.49      0.47      1470
weighted avg       0.59      0.49      0.47      1470

Decision Tree with 9 max_dept

Accuracy of Decision Tree after Normalize Scaling is: 0.5591836734693878
Confusion Matrix of Decision Tree is:
 [[147   5   0  40  18   0   0]
 [  0 137   7  53  13   0   0]
 [  0   8 151  45   6   0   0]
 [  0  13   9 175   9   0   4]
 [ 43  17   5  40 100   1   4]
 [  0   5   1 173   2   1  28]
 [  1   1   0  88   5   4 111]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.77      0.70      0.73       210
           2       0.74      0.65      0.69       210
           3       0.87      0.72      0.79       210
           4       0.29      0.83      0.42       210
           5       0.65      0.48      0.55       210
           6       0.17      0.00      0.01       210
           7       0.76      0.53      0.62       210

    accuracy                           0.56      1470
   macro avg       0.61      0.56      0.55      1470
weighted avg       0.61      0.56      0.55      1470

Decision Tree with 18 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.6414965986394557
Confusion Matrix of Random Forest is:
 [[151   4   0  11  13  27   4]
 [  0 129  17  14  16  33   1]
 [  0   0 163  13   0  34   0]
 [  1  11  10 123   8  53   4]
 [ 51  19   8  24  85  11  12]
 [  0   7   5  19   2 133  44]
 [  0   1   0   8   1  41 159]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.74      0.72      0.73       210
           2       0.75      0.61      0.68       210
           3       0.80      0.78      0.79       210
           4       0.58      0.59      0.58       210
           5       0.68      0.40      0.51       210
           6       0.40      0.63      0.49       210
           7       0.71      0.76      0.73       210

    accuracy                           0.64      1470
   macro avg       0.67      0.64      0.64      1470
weighted avg       0.67      0.64      0.64      1470

Random Forest with 6 max_dept

Accuracy of Random Forest after Normalize Scaling is: 0.680952380952381
Confusion Matrix of Random Forest is:
 [[145   5   0   9  21  29   1]
 [  0 141  15  10  14  30   0]
 [  0   4 174   9   0  23   0]
 [  0   9  12 131  11  43   4]
 [ 34  17   8  17 112  13   9]
 [  0   5   3  16   4 140  42]
 [  0   1   0   6   2  43 158]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.81      0.69      0.75       210
           2       0.77      0.67      0.72       210
           3       0.82      0.83      0.82       210
           4       0.66      0.62      0.64       210
           5       0.68      0.53      0.60       210
           6       0.44      0.67      0.53       210
           7       0.74      0.75      0.75       210

    accuracy                           0.68      1470
   macro avg       0.70      0.68      0.69      1470
weighted avg       0.70      0.68      0.69      1470

Random Forest with 14 max_dept

### Sentence Transformer Models

In [7]:
# BERT sentence-transformer vectors of the Kabita dataset
x_df = pd.read_csv(
    pwd + "//Datasets//Kabita//SentenceTransformers//bert_vectorized_kabita_dataset.csv"
)

# Stratified 70/30 split plus Normalizer scaling (both done inside normalize_scaling)
x_train, x_test, y_train, y_test = normalize_scaling(
    x_data=x_df,
    y_data=labels_df['kabita_labels'],
)

# --- Logistic regression -------------------------------------------------
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(
    tv_lr_model, x_train, x_test, y_train, y_test,
    model_name="Logistic Regression",
)

# --- K-nearest neighbours: sweep k = 3..8 --------------------------------
neighbors_list = list(range(3, 9))
for x in neighbors_list:
    print("KNN with", x, "Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(
        tv_knn_model, x_train, x_test, y_train, y_test,
        model_name="KNN Model",
    )

# --- Gaussian Naive Bayes ------------------------------------------------
tv_gnb_model = GaussianNB()
ml_training(
    tv_gnb_model, x_train, x_test, y_train, y_test,
    model_name="Gaussian Naive Bayes",
)

# --- Bernoulli Naive Bayes -----------------------------------------------
tv_bnb_model = BernoulliNB()
ml_training(
    tv_bnb_model, x_train, x_test, y_train, y_test,
    model_name="Bernoulli Naive Bayes",
)

# --- Support vector classifier: one fit per kernel -----------------------
svm_kernels = [
    'linear',
    'poly',
    'rbf',
    'sigmoid',
]
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(
        tv_svm_model, x_train, x_test, y_train, y_test,
        model_name="SVM",
    )

# --- Decision tree: sweep max_depth = 1..20 with a fixed seed -------------
for x in range(1, 21):
    print("Decision Tree with", x, "max_depth")
    tv_dt_model = DecisionTreeClassifier(max_depth=x, random_state=3)
    ml_training(
        tv_dt_model, x_train, x_test, y_train, y_test,
        model_name="Decision Tree",
    )

# Same seed, but with no depth limit at all
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(
    tv_dt_model, x_train, x_test, y_train, y_test,
    model_name="Decision Tree",
)

# --- Random forest: sweep max_depth = 1..20 with a fixed seed -------------
for x in range(1, 21):
    print("Random Forest with", x, "max_depth")
    tv_rf_model = RandomForestClassifier(random_state=3, max_depth=x)
    ml_training(
        tv_rf_model, x_train, x_test, y_train, y_test,
        model_name="Random Forest",
    )

# Same seed, default (unlimited) depth
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(
    tv_rf_model, x_train, x_test, y_train, y_test,
    model_name="Random Forest",
)
    
# MultinomialNB rejects negative feature values, so the Normalizer-scaled
# vectors are rescaled into the [0, 10] range before fitting it.
mms_scale = MinMaxScaler(feature_range=(0, 10))
# Learn the per-feature min/max from the training split only.
m_train = mms_scale.fit_transform(x_train)
# Reuse the training min/max for the test split. Calling fit_transform on
# x_test would re-fit the scaler, so train and test would be scaled
# differently and test-set statistics would leak into the evaluation.
m_test = mms_scale.transform(x_test)
# Test values outside the training min/max fall outside [0, 10]; clip them
# back into the feature range so no negative value reaches MultinomialNB.
m_test = np.clip(m_test, 0, 10)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
# NOTE: ml_training's accuracy banner reads "after Normalize Scaling"; for this
# model the inputs have additionally been MinMax-scaled as described above.
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model, m_train, m_test, y_train, y_test, "Multinomial Naive Bayes")

Accuracy of Logistic Regression after Normalize Scaling is: 0.7047619047619048
Confusion Matrix of Logistic Regression is:
 [[147   2   1   8  29  20   3]
 [  2 160   8  12  13  13   2]
 [  1   2 192   6   2   7   0]
 [  5  11  23 112  19  25  15]
 [ 24  20   5   8 146   3   4]
 [ 12   7   7  14   4 104  62]
 [  2   1   0   3   1  28 175]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.76      0.70      0.73       210
           2       0.79      0.76      0.77       210
           3       0.81      0.91      0.86       210
           4       0.69      0.53      0.60       210
           5       0.68      0.70      0.69       210
           6       0.52      0.50      0.51       210
           7       0.67      0.83      0.74       210

    accuracy                           0.70      1470
   macro avg       0.70      0.70      0.70      1470
weighted avg       0.70      0.70      0.70      1470

KNN with 3 

Accuracy of SVM after Normalize Scaling is: 0.7299319727891157
Confusion Matrix of SVM is:
 [[154   0   0   6  30  19   1]
 [  1 157   9  16  15   9   3]
 [  1   6 189   5   2   7   0]
 [  3  10  20 121  16  30  10]
 [ 22  17   5   7 153   2   4]
 [  4   8   6  11   7 120  54]
 [  0   0   0   0   3  28 179]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.83      0.73      0.78       210
           2       0.79      0.75      0.77       210
           3       0.83      0.90      0.86       210
           4       0.73      0.58      0.64       210
           5       0.68      0.73      0.70       210
           6       0.56      0.57      0.56       210
           7       0.71      0.85      0.78       210

    accuracy                           0.73      1470
   macro avg       0.73      0.73      0.73      1470
weighted avg       0.73      0.73      0.73      1470

Working on SVM Kernal: poly
Accuracy of SVM after Normalize

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.37891156462585035
Confusion Matrix of Decision Tree is:
 [[  0   5   0   0 169   0  36]
 [  0  25   5   0 155   0  25]
 [  0  12 145   0  41   0  12]
 [  0  10  14   0 132   0  54]
 [  0   2   1   0 192   0  15]
 [  0  10   5   0  47   0 148]
 [  0   0   3   0  12   0 195]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       210
           2       0.39      0.12      0.18       210
           3       0.84      0.69      0.76       210
           4       0.00      0.00      0.00       210
           5       0.26      0.91      0.40       210
           6       0.00      0.00      0.00       210
           7       0.40      0.93      0.56       210

    accuracy                           0.38      1470
   macro avg       0.27      0.38      0.27      1470
weighted avg       0.27      0.38      0.27      1470

Decision Tree with 3 max_dep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.4850340136054422
Confusion Matrix of Decision Tree is:
 [[146  23   5   0   0  21  15]
 [  8 168   8   1   0  22   3]
 [  4  37 156   1   0  12   0]
 [ 23 109  15   9   0  27  27]
 [ 53 141   1   0   0   4  11]
 [ 21  26  12   3   0  64  84]
 [  7   5   2   1   0  25 170]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.56      0.70      0.62       210
           2       0.33      0.80      0.47       210
           3       0.78      0.74      0.76       210
           4       0.60      0.04      0.08       210
           5       0.00      0.00      0.00       210
           6       0.37      0.30      0.33       210
           7       0.55      0.81      0.65       210

    accuracy                           0.49      1470
   macro avg       0.46      0.49      0.42      1470
weighted avg       0.46      0.49      0.42      1470

Decision Tree with 4 max_dept

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.5435374149659864
Confusion Matrix of Decision Tree is:
 [[146   4   5   0  19  26  10]
 [  8 129   5   3  39  25   1]
 [  4  32 158   0   5  11   0]
 [ 23  30  10  12  79  40  16]
 [ 53  22   2   0 118  10   5]
 [ 21   6   8   3  20  94  58]
 [  7   0   0   3   5  53 142]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.56      0.70      0.62       210
           2       0.58      0.61      0.60       210
           3       0.84      0.75      0.79       210
           4       0.57      0.06      0.10       210
           5       0.41      0.56      0.48       210
           6       0.36      0.45      0.40       210
           7       0.61      0.68      0.64       210

    accuracy                           0.54      1470
   macro avg       0.56      0.54      0.52      1470
weighted avg       0.56      0.54      0.52      1470

Decision Tree with 5 max_dept

Accuracy of Decision Tree after Normalize Scaling is: 0.5625850340136055
Confusion Matrix of Decision Tree is:
 [[123   4   3  14  35  18  13]
 [  8 134   6  19  24  15   4]
 [  8   7 181   5   2   7   0]
 [ 18  23   8  86  33  23  19]
 [ 34  25   1  29 104  11   6]
 [ 15  14   4  28  19  71  59]
 [  6   2   0  13  10  51 128]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.58      0.59      0.58       210
           2       0.64      0.64      0.64       210
           3       0.89      0.86      0.88       210
           4       0.44      0.41      0.43       210
           5       0.46      0.50      0.48       210
           6       0.36      0.34      0.35       210
           7       0.56      0.61      0.58       210

    accuracy                           0.56      1470
   macro avg       0.56      0.56      0.56      1470
weighted avg       0.56      0.56      0.56      1470

Decision Tree with 13 max_dep

Accuracy of Decision Tree after Normalize Scaling is: 0.573469387755102
Confusion Matrix of Decision Tree is:
 [[123   8   3  16  28  23   9]
 [  6 138   5  18  24  15   4]
 [  7   8 179   7   2   7   0]
 [ 23  20   5  93  31  26  12]
 [ 36  25   2  27 102  13   5]
 [ 12  17   3  31  17  80  50]
 [  6   2   1  16   7  50 128]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.58      0.59      0.58       210
           2       0.63      0.66      0.64       210
           3       0.90      0.85      0.88       210
           4       0.45      0.44      0.44       210
           5       0.48      0.49      0.48       210
           6       0.37      0.38      0.38       210
           7       0.62      0.61      0.61       210

    accuracy                           0.57      1470
   macro avg       0.58      0.57      0.57      1470
weighted avg       0.58      0.57      0.57      1470

Random Forest with 1 max_depth

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Random Forest after Normalize Scaling is: 0.5680272108843537
Confusion Matrix of Random Forest is:
 [[129   2   1   0  43  15  20]
 [  3 147   6   1  30  16   7]
 [  4  21 168   0   4  13   0]
 [ 15  36  23   6  71  24  35]
 [ 17  36   3   0 136   2  16]
 [  8  14   9   2  12  46 119]
 [  0   0   0   0   3   4 203]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.73      0.61      0.67       210
           2       0.57      0.70      0.63       210
           3       0.80      0.80      0.80       210
           4       0.67      0.03      0.05       210
           5       0.45      0.65      0.53       210
           6       0.38      0.22      0.28       210
           7       0.51      0.97      0.67       210

    accuracy                           0.57      1470
   macro avg       0.59      0.57      0.52      1470
weighted avg       0.59      0.57      0.52      1470

Random Forest with 4 max_dept

Accuracy of Random Forest after Normalize Scaling is: 0.7224489795918367
Confusion Matrix of Random Forest is:
 [[140   1   0   9  30  25   5]
 [  2 150   7   8  29  11   3]
 [  2   6 187   3   4   8   0]
 [  7   3   2 127  31  28  12]
 [ 19  16   0  13 150   4   8]
 [  3   2   0  12  11 126  56]
 [  0   0   0   1   3  24 182]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.81      0.67      0.73       210
           2       0.84      0.71      0.77       210
           3       0.95      0.89      0.92       210
           4       0.73      0.60      0.66       210
           5       0.58      0.71      0.64       210
           6       0.56      0.60      0.58       210
           7       0.68      0.87      0.76       210

    accuracy                           0.72      1470
   macro avg       0.74      0.72      0.72      1470
weighted avg       0.74      0.72      0.72      1470

Random Forest with 12 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.7258503401360544
Confusion Matrix of Random Forest is:
 [[140   2   0   8  32  23   5]
 [  1 153   7  14  21  13   1]
 [  2   4 187   5   4   8   0]
 [  5   5   3 136  20  30  11]
 [ 14  19   0  15 152   5   5]
 [  3   6   0  17   3 120  61]
 [  1   0   0   2   2  26 179]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.84      0.67      0.74       210
           2       0.81      0.73      0.77       210
           3       0.95      0.89      0.92       210
           4       0.69      0.65      0.67       210
           5       0.65      0.72      0.68       210
           6       0.53      0.57      0.55       210
           7       0.68      0.85      0.76       210

    accuracy                           0.73      1470
   macro avg       0.74      0.73      0.73      1470
weighted avg       0.74      0.73      0.73      1470

Random Forest with 20 max_dep

In [8]:
# GKB BERT sentence-transformer vectors of the Kabita dataset
x_df = pd.read_csv(
    pwd + "//Datasets//Kabita//SentenceTransformers//bert_vectorized_kabita_dataset_gkb.csv"
)

# Stratified 70/30 split plus Normalizer scaling (both done inside normalize_scaling)
x_train, x_test, y_train, y_test = normalize_scaling(
    x_data=x_df,
    y_data=labels_df['kabita_labels'],
)

# --- Logistic regression -------------------------------------------------
tv_lr_model = LogisticRegression(max_iter=2000)
ml_training(
    tv_lr_model, x_train, x_test, y_train, y_test,
    model_name="Logistic Regression",
)

# --- K-nearest neighbours: sweep k = 3..8 --------------------------------
neighbors_list = list(range(3, 9))
for x in neighbors_list:
    print("KNN with", x, "Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(
        tv_knn_model, x_train, x_test, y_train, y_test,
        model_name="KNN Model",
    )

# --- Gaussian Naive Bayes ------------------------------------------------
tv_gnb_model = GaussianNB()
ml_training(
    tv_gnb_model, x_train, x_test, y_train, y_test,
    model_name="Gaussian Naive Bayes",
)

# --- Bernoulli Naive Bayes -----------------------------------------------
tv_bnb_model = BernoulliNB()
ml_training(
    tv_bnb_model, x_train, x_test, y_train, y_test,
    model_name="Bernoulli Naive Bayes",
)

# --- Support vector classifier: one fit per kernel -----------------------
svm_kernels = [
    'linear',
    'poly',
    'rbf',
    'sigmoid',
]
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(
        tv_svm_model, x_train, x_test, y_train, y_test,
        model_name="SVM",
    )

# --- Decision tree: sweep max_depth = 1..20 with a fixed seed -------------
for x in range(1, 21):
    print("Decision Tree with", x, "max_depth")
    tv_dt_model = DecisionTreeClassifier(max_depth=x, random_state=3)
    ml_training(
        tv_dt_model, x_train, x_test, y_train, y_test,
        model_name="Decision Tree",
    )

# Same seed, but with no depth limit at all
tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(
    tv_dt_model, x_train, x_test, y_train, y_test,
    model_name="Decision Tree",
)

# --- Random forest: sweep max_depth = 1..20 with a fixed seed -------------
for x in range(1, 21):
    print("Random Forest with", x, "max_depth")
    tv_rf_model = RandomForestClassifier(random_state=3, max_depth=x)
    ml_training(
        tv_rf_model, x_train, x_test, y_train, y_test,
        model_name="Random Forest",
    )

# Same seed, default (unlimited) depth
tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(
    tv_rf_model, x_train, x_test, y_train, y_test,
    model_name="Random Forest",
)
    
# MultinomialNB rejects negative feature values, so the Normalizer-scaled
# vectors are rescaled into the [0, 10] range before fitting it.
mms_scale = MinMaxScaler(feature_range=(0, 10))
# Learn the per-feature min/max from the training split only.
m_train = mms_scale.fit_transform(x_train)
# Reuse the training min/max for the test split. Calling fit_transform on
# x_test would re-fit the scaler, so train and test would be scaled
# differently and test-set statistics would leak into the evaluation.
m_test = mms_scale.transform(x_test)
# Test values outside the training min/max fall outside [0, 10]; clip them
# back into the feature range so no negative value reaches MultinomialNB.
m_test = np.clip(m_test, 0, 10)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
# NOTE: ml_training's accuracy banner reads "after Normalize Scaling"; for this
# model the inputs have additionally been MinMax-scaled as described above.
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model, m_train, m_test, y_train, y_test, "Multinomial Naive Bayes")

Accuracy of Logistic Regression after Normalize Scaling is: 0.3333333333333333
Confusion Matrix of Logistic Regression is:
 [[ 89   0  69   8   9  19  16]
 [ 25   0 121   5   8  26  25]
 [  9   0 179   6   1  15   0]
 [ 23   0 113  13  24  10  27]
 [ 23   0 131   8  16   9  23]
 [ 36   0  31   8   7  58  70]
 [ 25   0  10   7   0  33 135]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.39      0.42      0.40       210
           2       0.00      0.00      0.00       210
           3       0.27      0.85      0.41       210
           4       0.24      0.06      0.10       210
           5       0.25      0.08      0.12       210
           6       0.34      0.28      0.31       210
           7       0.46      0.64      0.53       210

    accuracy                           0.33      1470
   macro avg       0.28      0.33      0.27      1470
weighted avg       0.28      0.33      0.27      1470

KNN with 3 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of KNN Model after Normalize Scaling is: 0.44829931972789117
Confusion Matrix of KNN Model is:
 [[117  20   0  20  39   7   7]
 [ 29 121   7  16  28   3   6]
 [  6  26 161   6   7   4   0]
 [ 57  43  14  48  26   9  13]
 [ 45  39   1  24  86   4  11]
 [ 45  39   4  24  22  40  36]
 [ 25  28   4  19  17  31  86]]
Classification Report of KNN Model is:
               precision    recall  f1-score   support

           1       0.36      0.56      0.44       210
           2       0.38      0.58      0.46       210
           3       0.84      0.77      0.80       210
           4       0.31      0.23      0.26       210
           5       0.38      0.41      0.40       210
           6       0.41      0.19      0.26       210
           7       0.54      0.41      0.47       210

    accuracy                           0.45      1470
   macro avg       0.46      0.45      0.44      1470
weighted avg       0.46      0.45      0.44      1470

KNN with 4 Neighbors
Accuracy of KNN Mod

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of SVM after Normalize Scaling is: 0.3149659863945578
Confusion Matrix of SVM is:
 [[ 85   0  80   3   5   8  29]
 [ 26   0 129   1   3   1  50]
 [ 10   0 184   0   1  13   2]
 [ 30   0 142   2   3   4  29]
 [ 21   0 149   1   7   4  28]
 [ 37   0  42   1   2  46  82]
 [ 31   0   9   1   1  29 139]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.35      0.40      0.38       210
           2       0.00      0.00      0.00       210
           3       0.25      0.88      0.39       210
           4       0.22      0.01      0.02       210
           5       0.32      0.03      0.06       210
           6       0.44      0.22      0.29       210
           7       0.39      0.66      0.49       210

    accuracy                           0.31      1470
   macro avg       0.28      0.31      0.23      1470
weighted avg       0.28      0.31      0.23      1470

Working on SVM Kernal: poly


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of SVM after Normalize Scaling is: 0.3292517006802721
Confusion Matrix of SVM is:
 [[ 90   0  75   3   5  19  18]
 [ 26   0 122   7   3  28  24]
 [  8   0 181   4   1  16   0]
 [ 26   0 131  12   7   8  26]
 [ 20   0 144   8   6   8  24]
 [ 35   0  35   6   5  61  68]
 [ 27   0   8   3   1  37 134]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.39      0.43      0.41       210
           2       0.00      0.00      0.00       210
           3       0.26      0.86      0.40       210
           4       0.28      0.06      0.09       210
           5       0.21      0.03      0.05       210
           6       0.34      0.29      0.32       210
           7       0.46      0.64      0.53       210

    accuracy                           0.33      1470
   macro avg       0.28      0.33      0.26      1470
weighted avg       0.28      0.33      0.26      1470

Working on SVM Kernal: rbf


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of SVM after Normalize Scaling is: 0.3251700680272109
Confusion Matrix of SVM is:
 [[ 89   0  77   3   4  18  19]
 [ 26   0 122   3   7  14  38]
 [  8   0 181   4   1  16   0]
 [ 27   0 137   7   5   6  28]
 [ 20   0 145   6   7   7  25]
 [ 37   0  40   2   3  57  71]
 [ 29   0   9   2   0  33 137]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.38      0.42      0.40       210
           2       0.00      0.00      0.00       210
           3       0.25      0.86      0.39       210
           4       0.26      0.03      0.06       210
           5       0.26      0.03      0.06       210
           6       0.38      0.27      0.32       210
           7       0.43      0.65      0.52       210

    accuracy                           0.33      1470
   macro avg       0.28      0.33      0.25      1470
weighted avg       0.28      0.33      0.25      1470

Working on SVM Kernal: sigmoid


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of SVM after Normalize Scaling is: 0.28095238095238095
Confusion Matrix of SVM is:
 [[ 52   1  83  30   5  18  21]
 [ 24   1 133   5   1   1  45]
 [  5   0 185   0   2  17   1]
 [ 25   0 143   9   1   7  25]
 [ 21   2 153   4   1   4  25]
 [ 27   3  45  12   1  42  80]
 [ 29   1  14  14   0  29 123]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.28      0.25      0.26       210
           2       0.12      0.00      0.01       210
           3       0.24      0.88      0.38       210
           4       0.12      0.04      0.06       210
           5       0.09      0.00      0.01       210
           6       0.36      0.20      0.26       210
           7       0.38      0.59      0.46       210

    accuracy                           0.28      1470
   macro avg       0.23      0.28      0.21      1470
weighted avg       0.23      0.28      0.21      1470

Decision Tree with 1 max_depth
Accuracy of Decision Tree a

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.3040816326530612
Confusion Matrix of Decision Tree is:
 [[  0   3   0   0 168   0  39]
 [  0   6   1   0 150   0  53]
 [  0   2 101   0  91   0  16]
 [  0   0   2   0 176   0  32]
 [  0   1   0   0 178   0  31]
 [  0   1   0   0  84   0 125]
 [  0   1   0   0  47   0 162]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       210
           2       0.43      0.03      0.05       210
           3       0.97      0.48      0.64       210
           4       0.00      0.00      0.00       210
           5       0.20      0.85      0.32       210
           6       0.00      0.00      0.00       210
           7       0.35      0.77      0.49       210

    accuracy                           0.30      1470
   macro avg       0.28      0.30      0.21      1470
weighted avg       0.28      0.30      0.21      1470

Decision Tree with 3 max_dept

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.3585034013605442
Confusion Matrix of Decision Tree is:
 [[ 94   1   2   0  74  27  12]
 [ 29   2   5   0 121  42  11]
 [ 19   0 103   0  72   4  12]
 [ 36   0   2   0 140  11  21]
 [ 26   1   0   0 152  13  18]
 [ 44   1   0   0  40  52  73]
 [ 33   0   1   0  14  38 124]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.33      0.45      0.38       210
           2       0.40      0.01      0.02       210
           3       0.91      0.49      0.64       210
           4       0.00      0.00      0.00       210
           5       0.25      0.72      0.37       210
           6       0.28      0.25      0.26       210
           7       0.46      0.59      0.52       210

    accuracy                           0.36      1470
   macro avg       0.38      0.36      0.31      1470
weighted avg       0.38      0.36      0.31      1470

Decision Tree with 4 max_dept

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.40476190476190477
Confusion Matrix of Decision Tree is:
 [[ 94  17   0   0  67  20  12]
 [ 29  98   0   0  64   8  11]
 [  8  51 122   0  24   3   2]
 [ 36  27   2   0 116   8  21]
 [ 26  33   0   0 128   5  18]
 [ 44  28   0   0  36  29  73]
 [ 33  20   1   0  14  18 124]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.35      0.45      0.39       210
           2       0.36      0.47      0.40       210
           3       0.98      0.58      0.73       210
           4       0.00      0.00      0.00       210
           5       0.29      0.61      0.39       210
           6       0.32      0.14      0.19       210
           7       0.48      0.59      0.53       210

    accuracy                           0.40      1470
   macro avg       0.39      0.40      0.38      1470
weighted avg       0.39      0.40      0.38      1470

Decision Tree with 5 max_dep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.45782312925170066
Confusion Matrix of Decision Tree is:
 [[ 95  12   0  25  42  25  11]
 [ 30  90   3  27  37  14   9]
 [  9  23 147  14  10   6   1]
 [ 36  15  12  65  51  11  20]
 [ 26  25   5  27 101  13  13]
 [ 44  13   2  17  19  73  42]
 [ 33   4   1   8   6  56 102]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.35      0.45      0.39       210
           2       0.49      0.43      0.46       210
           3       0.86      0.70      0.77       210
           4       0.36      0.31      0.33       210
           5       0.38      0.48      0.42       210
           6       0.37      0.35      0.36       210
           7       0.52      0.49      0.50       210

    accuracy                           0.46      1470
   macro avg       0.48      0.46      0.46      1470
weighted avg       0.48      0.46      0.46      1470

Decision Tree with 6 max_dep

Accuracy of Decision Tree after Normalize Scaling is: 0.45034013605442175
Confusion Matrix of Decision Tree is:
 [[ 88  16   3  33  32  25  13]
 [ 20 100  10  27  32  14   7]
 [  6  14 166  12   8   3   1]
 [ 28  25  12  60  36  23  26]
 [ 35  17   3  28  94  15  18]
 [ 21  25   2  26  25  62  49]
 [ 14   9   2  20  13  60  92]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.42      0.42      0.42       210
           2       0.49      0.48      0.48       210
           3       0.84      0.79      0.81       210
           4       0.29      0.29      0.29       210
           5       0.39      0.45      0.42       210
           6       0.31      0.30      0.30       210
           7       0.45      0.44      0.44       210

    accuracy                           0.45      1470
   macro avg       0.45      0.45      0.45      1470
weighted avg       0.45      0.45      0.45      1470

Decision Tree with 14 max_de

Accuracy of Random Forest after Normalize Scaling is: 0.3442176870748299
Confusion Matrix of Random Forest is:
 [[  5   3  10   3 110   0  79]
 [  3  12  48   7  84   0  56]
 [  0   6 152   4  21   0  27]
 [  2   3  31   4 126   0  44]
 [  2   2  20   0 150   0  36]
 [  3   0  15   1  43   0 148]
 [  4   0   1   0  22   0 183]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.26      0.02      0.04       210
           2       0.46      0.06      0.10       210
           3       0.55      0.72      0.62       210
           4       0.21      0.02      0.03       210
           5       0.27      0.71      0.39       210
           6       0.00      0.00      0.00       210
           7       0.32      0.87      0.47       210

    accuracy                           0.34      1470
   macro avg       0.30      0.34      0.24      1470
weighted avg       0.30      0.34      0.24      1470

Random Forest with 2 max_dept

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Random Forest after Normalize Scaling is: 0.37210884353741497
Confusion Matrix of Random Forest is:
 [[ 46   2   2   0 119   1  40]
 [  6  28  20   0 102   1  53]
 [  0  14 127   1  41   0  27]
 [  6   5   7   4 147   0  41]
 [  8   2   1   0 167   0  32]
 [ 16   5   5   1  48   0 135]
 [ 13   0   0   1  21   0 175]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.48      0.22      0.30       210
           2       0.50      0.13      0.21       210
           3       0.78      0.60      0.68       210
           4       0.57      0.02      0.04       210
           5       0.26      0.80      0.39       210
           6       0.00      0.00      0.00       210
           7       0.35      0.83      0.49       210

    accuracy                           0.37      1470
   macro avg       0.42      0.37      0.30      1470
weighted avg       0.42      0.37      0.30      1470

Random Forest with 3 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.5326530612244897
Confusion Matrix of Random Forest is:
 [[110   4   1  20  42  22  11]
 [ 21 106   8  19  30  17   9]
 [  3  15 169  12   4   6   1]
 [ 27  17   5  79  46  15  21]
 [ 28  14   0  23 115   6  24]
 [ 26   5   1  23  14  72  69]
 [ 17   1   0  10   4  46 132]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.47      0.52      0.50       210
           2       0.65      0.50      0.57       210
           3       0.92      0.80      0.86       210
           4       0.42      0.38      0.40       210
           5       0.45      0.55      0.49       210
           6       0.39      0.34      0.37       210
           7       0.49      0.63      0.55       210

    accuracy                           0.53      1470
   macro avg       0.54      0.53      0.53      1470
weighted avg       0.54      0.53      0.53      1470

Random Forest with 11 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.5489795918367347
Confusion Matrix of Random Forest is:
 [[104   4   1  23  46  19  13]
 [  9 112   9  27  31  13   9]
 [  1  15 169  10   8   6   1]
 [ 17  15   2  89  48  14  25]
 [ 20  12   1  20 127   7  23]
 [ 15   5   3  25  18  71  73]
 [ 12   4   1   8   4  46 135]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.58      0.50      0.54       210
           2       0.67      0.53      0.59       210
           3       0.91      0.80      0.85       210
           4       0.44      0.42      0.43       210
           5       0.45      0.60      0.52       210
           6       0.40      0.34      0.37       210
           7       0.48      0.64      0.55       210

    accuracy                           0.55      1470
   macro avg       0.56      0.55      0.55      1470
weighted avg       0.56      0.55      0.55      1470

Random Forest with 19 max_dep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# N Distill BERT vectorized data
# Benchmarks every classifier family on the N-DistilBERT sentence embeddings.
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//bert_vectorized_kabita_dataset_ndisbert.csv")

# Stratified 70/30 train/test split followed by Normalizer scaling (see normalize_scaling).
x_train,x_test,y_train,y_test = normalize_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
# max_iter is raised to 2000 so the solver has room to converge on the embedding features.
tv_lr_model = LogisticRegression(max_iter=2000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
# Sweep the neighbourhood size; each setting prints its own metrics block.
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
# One SVC per kernel, all other hyper-parameters left at their defaults.
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
# Depth-limited trees for max_depth 1..20, then one tree with no depth limit as reference.
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# Random Forest
# Same depth sweep as the decision tree, followed by a forest with no depth limit.
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# scaling using MinMax scaler
# MultinomialNB rejects negative feature values, so the features are rescaled into [0, 10].
mms_scale=MinMaxScaler(feature_range=(0,10))
# Learn the per-feature min/max from the training split only.
m_train=mms_scale.fit_transform(x_train)
# Apply the training-split scaling to the test split instead of re-fitting on it:
# re-fitting would scale train and test differently and leak test statistics.
# Test values outside the training range would land outside [0, 10] (possibly
# below zero), so clip them back into the range MultinomialNB accepts.
m_test=np.clip(mms_scale.transform(x_test),0,10)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_train,m_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after Normalize Scaling is: 0.6965986394557823
Confusion Matrix of Logistic Regression is:
 [[143   4   2  10  33  15   3]
 [  4 141  16  13  21  13   2]
 [  4   5 182  12   5   2   0]
 [  9   6  22 132  23  13   5]
 [ 24   7  10  12 140  10   7]
 [ 13   9   6  21   4  98  59]
 [  1   2   0   0   3  16 188]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.72      0.68      0.70       210
           2       0.81      0.67      0.73       210
           3       0.76      0.87      0.81       210
           4       0.66      0.63      0.64       210
           5       0.61      0.67      0.64       210
           6       0.59      0.47      0.52       210
           7       0.71      0.90      0.79       210

    accuracy                           0.70      1470
   macro avg       0.70      0.70      0.69      1470
weighted avg       0.70      0.70      0.69      1470

KNN with 3 

Accuracy of SVM after Normalize Scaling is: 0.7278911564625851
Confusion Matrix of SVM is:
 [[146   3   3  10  35  12   1]
 [  3 153   8  12  18  14   2]
 [  2   6 180  10  10   2   0]
 [  4   7  12 146  24  14   3]
 [ 17   5   7  12 153   9   7]
 [ 14   5   6  20   3 100  62]
 [  1   0   0   1   2  14 192]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.78      0.70      0.74       210
           2       0.85      0.73      0.79       210
           3       0.83      0.86      0.85       210
           4       0.69      0.70      0.69       210
           5       0.62      0.73      0.67       210
           6       0.61      0.48      0.53       210
           7       0.72      0.91      0.81       210

    accuracy                           0.73      1470
   macro avg       0.73      0.73      0.72      1470
weighted avg       0.73      0.73      0.72      1470

Working on SVM Kernal: poly
Accuracy of SVM after Normalize

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.3489795918367347
Confusion Matrix of Decision Tree is:
 [[186   0   3   0   0  17   4]
 [174   0   2   0   0  26   8]
 [107   0 103   0   0   0   0]
 [173   0  14   0   0  14   9]
 [183   0   5   0   0  18   4]
 [ 74   0   5   0   0  63  68]
 [  7   0   0   0   0  42 161]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.21      0.89      0.33       210
           2       0.00      0.00      0.00       210
           3       0.78      0.49      0.60       210
           4       0.00      0.00      0.00       210
           5       0.00      0.00      0.00       210
           6       0.35      0.30      0.32       210
           7       0.63      0.77      0.69       210

    accuracy                           0.35      1470
   macro avg       0.28      0.35      0.28      1470
weighted avg       0.28      0.35      0.28      1470

Decision Tree with 3 max_dept

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.4163265306122449
Confusion Matrix of Decision Tree is:
 [[130   0   1  58   0   7  14]
 [ 20   0   0 156   0   3  31]
 [ 25   0  94  91   0   0   0]
 [ 22   0   4 161   0   4  19]
 [ 33   0   1 154   0   7  15]
 [ 37   0   0  42   0  33  98]
 [  6   0   0   1   0   9 194]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.48      0.62      0.54       210
           2       0.00      0.00      0.00       210
           3       0.94      0.45      0.61       210
           4       0.24      0.77      0.37       210
           5       0.00      0.00      0.00       210
           6       0.52      0.16      0.24       210
           7       0.52      0.92      0.67       210

    accuracy                           0.42      1470
   macro avg       0.39      0.42      0.35      1470
weighted avg       0.39      0.42      0.35      1470

Decision Tree with 4 max_dept

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.45850340136054424
Confusion Matrix of Decision Tree is:
 [[130   9   1  56   0   7   7]
 [ 20  93   0  84   0   8   5]
 [ 25  13  94  78   0   0   0]
 [ 22  15   4 152   0   8   9]
 [ 33  15   1 147   0   7   7]
 [ 37  27   0  34   0  47  65]
 [  6  15   0   0   0  31 158]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.48      0.62      0.54       210
           2       0.50      0.44      0.47       210
           3       0.94      0.45      0.61       210
           4       0.28      0.72      0.40       210
           5       0.00      0.00      0.00       210
           6       0.44      0.22      0.30       210
           7       0.63      0.75      0.69       210

    accuracy                           0.46      1470
   macro avg       0.46      0.46      0.43      1470
weighted avg       0.46      0.46      0.43      1470

Decision Tree with 5 max_dep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.4993197278911565
Confusion Matrix of Decision Tree is:
 [[124   0   2   9  58  12   5]
 [ 19  75   1   8  76  27   4]
 [  8   7 114  12  69   0   0]
 [ 19   8   7  60  94  18   4]
 [ 34   5   3  10 141  11   6]
 [ 35   5  11  10  26  64  59]
 [  9   2   0   0   0  43 156]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.50      0.59      0.54       210
           2       0.74      0.36      0.48       210
           3       0.83      0.54      0.66       210
           4       0.55      0.29      0.38       210
           5       0.30      0.67      0.42       210
           6       0.37      0.30      0.33       210
           7       0.67      0.74      0.70       210

    accuracy                           0.50      1470
   macro avg       0.56      0.50      0.50      1470
weighted avg       0.56      0.50      0.50      1470

Decision Tree with 6 max_dept

Accuracy of Decision Tree after Normalize Scaling is: 0.5659863945578232
Confusion Matrix of Decision Tree is:
 [[121  10   5  18  39  12   5]
 [  7 125   6  23  29  17   3]
 [ 10   5 164  17  11   3   0]
 [ 19  15  14 102  34  20   6]
 [ 28  23   5  32 107  10   5]
 [ 22  26  11  16  14  84  37]
 [  6   5   1   7   9  53 129]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.57      0.58      0.57       210
           2       0.60      0.60      0.60       210
           3       0.80      0.78      0.79       210
           4       0.47      0.49      0.48       210
           5       0.44      0.51      0.47       210
           6       0.42      0.40      0.41       210
           7       0.70      0.61      0.65       210

    accuracy                           0.57      1470
   macro avg       0.57      0.57      0.57      1470
weighted avg       0.57      0.57      0.57      1470

Decision Tree with 14 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.29183673469387755
Confusion Matrix of Random Forest is:
 [[  7   1 171   1   1   2  27]
 [  1   9 133   0  12   5  50]
 [  1   0 199   0   0   0  10]
 [  3   1 168   0   2   0  36]
 [  5   2 171   0   7   0  25]
 [  5   1  36   0   4   1 163]
 [  1   0   0   0   1   2 206]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.30      0.03      0.06       210
           2       0.64      0.04      0.08       210
           3       0.23      0.95      0.37       210
           4       0.00      0.00      0.00       210
           5       0.26      0.03      0.06       210
           6       0.10      0.00      0.01       210
           7       0.40      0.98      0.57       210

    accuracy                           0.29      1470
   macro avg       0.28      0.29      0.16      1470
weighted avg       0.28      0.29      0.16      1470

Random Forest with 2 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.6693877551020408
Confusion Matrix of Random Forest is:
 [[134   4   2  11  36  20   3]
 [  2 134   5  21  29  17   2]
 [  9   5 167  14  10   5   0]
 [  9   8   2 130  32  24   5]
 [ 19   9   3  23 137  11   8]
 [  9   5   0  20   7  97  72]
 [  0   0   0   0   4  21 185]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.74      0.64      0.68       210
           2       0.81      0.64      0.71       210
           3       0.93      0.80      0.86       210
           4       0.59      0.62      0.61       210
           5       0.54      0.65      0.59       210
           6       0.50      0.46      0.48       210
           7       0.67      0.88      0.76       210

    accuracy                           0.67      1470
   macro avg       0.68      0.67      0.67      1470
weighted avg       0.68      0.67      0.67      1470

Random Forest with 10 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.689795918367347
Confusion Matrix of Random Forest is:
 [[135   3   1  12  34  22   3]
 [  1 148   5  17  19  17   3]
 [  4   7 174  11   6   8   0]
 [  3   7   6 140  28  22   4]
 [ 22  12   2  26 128  13   7]
 [ 11   6   1  20   5 103  64]
 [  1   0   0   0   1  22 186]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.76      0.64      0.70       210
           2       0.81      0.70      0.75       210
           3       0.92      0.83      0.87       210
           4       0.62      0.67      0.64       210
           5       0.58      0.61      0.59       210
           6       0.50      0.49      0.49       210
           7       0.70      0.89      0.78       210

    accuracy                           0.69      1470
   macro avg       0.70      0.69      0.69      1470
weighted avg       0.70      0.69      0.69      1470

Random Forest with 18 max_dept

In [10]:
# V BERT vectorized data
# Benchmarks every classifier family on the V-BERT sentence embeddings.
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//bert_vectorized_kabita_dataset_vbert.csv")

# Stratified 70/30 train/test split followed by Normalizer scaling (see normalize_scaling).
x_train,x_test,y_train,y_test = normalize_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
# max_iter is raised to 2000 so the solver has room to converge on the embedding features.
tv_lr_model = LogisticRegression(max_iter=2000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
# Sweep the neighbourhood size; each setting prints its own metrics block.
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
# One SVC per kernel, all other hyper-parameters left at their defaults.
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
# Depth-limited trees for max_depth 1..20, then one tree with no depth limit as reference.
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# Random Forest
# Same depth sweep as the decision tree, followed by a forest with no depth limit.
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# scaling using MinMax scaler
# MultinomialNB rejects negative feature values, so the features are rescaled into [0, 10].
mms_scale=MinMaxScaler(feature_range=(0,10))
# Learn the per-feature min/max from the training split only.
m_train=mms_scale.fit_transform(x_train)
# Apply the training-split scaling to the test split instead of re-fitting on it:
# re-fitting would scale train and test differently and leak test statistics.
# Test values outside the training range would land outside [0, 10] (possibly
# below zero), so clip them back into the range MultinomialNB accepts.
m_test=np.clip(mms_scale.transform(x_test),0,10)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_train,m_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after Normalize Scaling is: 0.7537414965986394
Confusion Matrix of Logistic Regression is:
 [[149   2   0   0  44  12   3]
 [  0 156   9  10  19  14   2]
 [  1   2 189   7   6   4   1]
 [  4   4  17 145  23  10   7]
 [ 10   7   3  14 160   5  11]
 [  1   7  14  13   4 125  46]
 [  0   0   0   1   1  24 184]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.90      0.71      0.79       210
           2       0.88      0.74      0.80       210
           3       0.81      0.90      0.86       210
           4       0.76      0.69      0.72       210
           5       0.62      0.76      0.69       210
           6       0.64      0.60      0.62       210
           7       0.72      0.88      0.79       210

    accuracy                           0.75      1470
   macro avg       0.76      0.75      0.75      1470
weighted avg       0.76      0.75      0.75      1470

KNN with 3 

Accuracy of SVM after Normalize Scaling is: 0.7687074829931972
Confusion Matrix of SVM is:
 [[144   2   0   2  47  13   2]
 [  0 162   8  14  13  12   1]
 [  0   3 190   7   5   4   1]
 [  3   5  16 145  22  13   6]
 [ 12  11   2   9 161   7   8]
 [  3   4   9  13   3 139  39]
 [  0   0   0   0   1  20 189]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.89      0.69      0.77       210
           2       0.87      0.77      0.82       210
           3       0.84      0.90      0.87       210
           4       0.76      0.69      0.72       210
           5       0.64      0.77      0.70       210
           6       0.67      0.66      0.67       210
           7       0.77      0.90      0.83       210

    accuracy                           0.77      1470
   macro avg       0.78      0.77      0.77      1470
weighted avg       0.78      0.77      0.77      1470

Working on SVM Kernal: poly
Accuracy of SVM after Normalize

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.3414965986394558
Confusion Matrix of Decision Tree is:
 [[188   2   1   0   0   0  19]
 [157  27   8   0   0   0  18]
 [ 52  17 135   0   0   0   6]
 [157  23  15   0   0   0  15]
 [195   5   1   0   0   0   9]
 [ 92  11   8   0   0   0  99]
 [ 55   2   1   0   0   0 152]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.21      0.90      0.34       210
           2       0.31      0.13      0.18       210
           3       0.80      0.64      0.71       210
           4       0.00      0.00      0.00       210
           5       0.00      0.00      0.00       210
           6       0.00      0.00      0.00       210
           7       0.48      0.72      0.58       210

    accuracy                           0.34      1470
   macro avg       0.26      0.34      0.26      1470
weighted avg       0.26      0.34      0.26      1470

Decision Tree with 3 max_dept

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.4061224489795918
Confusion Matrix of Decision Tree is:
 [[ 67   0   1   2 121  13   6]
 [  7  25   5   2 150  19   2]
 [  7   2 126  15  45  14   1]
 [ 11   3  12  20 146  12   6]
 [ 11   1   1   4 184   3   6]
 [ 10   3   1   8  82  65  41]
 [  0   2   0   0  55  43 110]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.59      0.32      0.41       210
           2       0.69      0.12      0.20       210
           3       0.86      0.60      0.71       210
           4       0.39      0.10      0.15       210
           5       0.23      0.88      0.37       210
           6       0.38      0.31      0.34       210
           7       0.64      0.52      0.58       210

    accuracy                           0.41      1470
   macro avg       0.54      0.41      0.40      1470
weighted avg       0.54      0.41      0.40      1470

Decision Tree with 4 max_dept

Accuracy of Decision Tree after Normalize Scaling is: 0.5285714285714286
Confusion Matrix of Decision Tree is:
 [[109   7   1  24  41  23   5]
 [  8 125   5  26  23  16   7]
 [  2  10 163  23   7   4   1]
 [ 22  14   8 105  31  22   8]
 [ 34  24   1  42  86  14   9]
 [ 21  15   6  35  18  67  48]
 [  7   7   3  12  16  43 122]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.54      0.52      0.53       210
           2       0.62      0.60      0.61       210
           3       0.87      0.78      0.82       210
           4       0.39      0.50      0.44       210
           5       0.39      0.41      0.40       210
           6       0.35      0.32      0.34       210
           7       0.61      0.58      0.60       210

    accuracy                           0.53      1470
   macro avg       0.54      0.53      0.53      1470
weighted avg       0.54      0.53      0.53      1470

Decision Tree with 12 max_dep

Accuracy of Decision Tree after Normalize Scaling is: 0.5129251700680272
Confusion Matrix of Decision Tree is:
 [[102   8   4  18  45  22  11]
 [ 14 130  10  17  22  11   6]
 [  1   8 178  12   5   4   2]
 [ 25  22  11  82  35  25  10]
 [ 35  25   9  30  81  16  14]
 [ 23  22   9  33  17  61  45]
 [  9   5   1   9  15  51 120]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.49      0.49      0.49       210
           2       0.59      0.62      0.60       210
           3       0.80      0.85      0.82       210
           4       0.41      0.39      0.40       210
           5       0.37      0.39      0.38       210
           6       0.32      0.29      0.31       210
           7       0.58      0.57      0.57       210

    accuracy                           0.51      1470
   macro avg       0.51      0.51      0.51      1470
weighted avg       0.51      0.51      0.51      1470

Decision Tree with 20 max_dep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Random Forest after Normalize Scaling is: 0.4707482993197279
Confusion Matrix of Random Forest is:
 [[ 66   5   3   0  95   0  41]
 [  2  98  18   0  70   0  22]
 [  0  15 163   0  29   0   3]
 [  5  16  28  15  86   0  60]
 [  2  18   0   0 145   0  45]
 [  4   3  24   4  22   3 150]
 [  1   3   2   0   2   0 202]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.82      0.31      0.46       210
           2       0.62      0.47      0.53       210
           3       0.68      0.78      0.73       210
           4       0.79      0.07      0.13       210
           5       0.32      0.69      0.44       210
           6       1.00      0.01      0.03       210
           7       0.39      0.96      0.55       210

    accuracy                           0.47      1470
   macro avg       0.66      0.47      0.41      1470
weighted avg       0.66      0.47      0.41      1470

Random Forest with 3 max_dept

Accuracy of Random Forest after Normalize Scaling is: 0.7061224489795919
Confusion Matrix of Random Forest is:
 [[121   3   0   6  54  19   7]
 [  0 156   7  11  19  14   3]
 [  2   3 177  13  10   5   0]
 [  6   5   6 139  29  16   9]
 [  8  13   1  16 150  10  12]
 [  4   4   9  16  10 102  65]
 [  0   0   0   3   4  10 193]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.86      0.58      0.69       210
           2       0.85      0.74      0.79       210
           3       0.89      0.84      0.86       210
           4       0.68      0.66      0.67       210
           5       0.54      0.71      0.62       210
           6       0.58      0.49      0.53       210
           7       0.67      0.92      0.77       210

    accuracy                           0.71      1470
   macro avg       0.72      0.71      0.71      1470
weighted avg       0.72      0.71      0.71      1470

Random Forest with 11 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.717687074829932
Confusion Matrix of Random Forest is:
 [[130   2   1   4  48  19   6]
 [  0 153   9  11  22  13   2]
 [  2   2 186   9   5   6   0]
 [  2   1   7 142  33  18   7]
 [ 14  15   0  23 142   5  11]
 [  5   5   8  13   8 118  53]
 [  0   1   0   3   3  19 184]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.85      0.62      0.72       210
           2       0.85      0.73      0.79       210
           3       0.88      0.89      0.88       210
           4       0.69      0.68      0.68       210
           5       0.54      0.68      0.60       210
           6       0.60      0.56      0.58       210
           7       0.70      0.88      0.78       210

    accuracy                           0.72      1470
   macro avg       0.73      0.72      0.72      1470
weighted avg       0.73      0.72      0.72      1470

Random Forest with 19 max_dept

In [11]:
# GPT vectorized data
# Benchmarks every classifier family on the GPT sentence embeddings.
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//gpt_vectorized_kabita_dataset.csv")

# Stratified 70/30 train/test split followed by Normalizer scaling (see normalize_scaling).
x_train,x_test,y_train,y_test = normalize_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
# max_iter is raised to 2000 so the solver has room to converge on the embedding features.
tv_lr_model = LogisticRegression(max_iter=2000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
# Sweep the neighbourhood size; each setting prints its own metrics block.
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
# One SVC per kernel, all other hyper-parameters left at their defaults.
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
# Depth-limited trees for max_depth 1..20, then one tree with no depth limit as reference.
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# Random Forest
# Same depth sweep as the decision tree, followed by a forest with no depth limit.
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# scaling using MinMax scaler
# MultinomialNB rejects negative feature values, so the features are rescaled into [0, 10].
mms_scale=MinMaxScaler(feature_range=(0,10))
# Learn the per-feature min/max from the training split only.
m_train=mms_scale.fit_transform(x_train)
# Apply the training-split scaling to the test split instead of re-fitting on it:
# re-fitting would scale train and test differently and leak test statistics.
# Test values outside the training range would land outside [0, 10] (possibly
# below zero), so clip them back into the range MultinomialNB accepts.
m_test=np.clip(mms_scale.transform(x_test),0,10)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_train,m_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after Normalize Scaling is: 0.7210884353741497
Confusion Matrix of Logistic Regression is:
 [[148   2   0   7  32  13   8]
 [  1 160  15   9  15   9   1]
 [  2   2 188   9   3   6   0]
 [  3   5  10 152  13  16  11]
 [ 21  18   0  20 131   9  11]
 [ 10  13  10  17   4 104  52]
 [  1   1   0   1   4  26 177]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.80      0.70      0.75       210
           2       0.80      0.76      0.78       210
           3       0.84      0.90      0.87       210
           4       0.71      0.72      0.72       210
           5       0.65      0.62      0.64       210
           6       0.57      0.50      0.53       210
           7       0.68      0.84      0.75       210

    accuracy                           0.72      1470
   macro avg       0.72      0.72      0.72      1470
weighted avg       0.72      0.72      0.72      1470

KNN with 3 

Accuracy of SVM after Normalize Scaling is: 0.7272108843537415
Confusion Matrix of SVM is:
 [[148   1   0   8  30  17   6]
 [  0 162  11  10  14  12   1]
 [  1   2 191   7   1   8   0]
 [  3   4   9 152  14  19   9]
 [ 24  19   1  21 129   7   9]
 [  9  12  12  15   2 111  49]
 [  2   0   0   1   6  25 176]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.79      0.70      0.75       210
           2       0.81      0.77      0.79       210
           3       0.85      0.91      0.88       210
           4       0.71      0.72      0.72       210
           5       0.66      0.61      0.64       210
           6       0.56      0.53      0.54       210
           7       0.70      0.84      0.77       210

    accuracy                           0.73      1470
   macro avg       0.73      0.73      0.73      1470
weighted avg       0.73      0.73      0.73      1470

Working on SVM Kernal: poly
Accuracy of SVM after Normalize

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.32040816326530613
Confusion Matrix of Decision Tree is:
 [[  0   0  10   0  87   0 113]
 [  0   0  16   0 153   0  41]
 [  0   0 143   0  34   0  33]
 [  0   0  26   0 100   0  84]
 [  0   0   4   0 161   0  45]
 [  0   0  20   0  48   0 142]
 [  0   0   7   0  36   0 167]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       210
           2       0.00      0.00      0.00       210
           3       0.63      0.68      0.66       210
           4       0.00      0.00      0.00       210
           5       0.26      0.77      0.39       210
           6       0.00      0.00      0.00       210
           7       0.27      0.80      0.40       210

    accuracy                           0.32      1470
   macro avg       0.17      0.32      0.21      1470
weighted avg       0.17      0.32      0.21      1470

Decision Tree with 3 max_dep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.408843537414966
Confusion Matrix of Decision Tree is:
 [[ 53   8   6   0  79   0  64]
 [ 14 108  15   0  45   0  28]
 [ 22  20 141   0  14   0  13]
 [  3  29  25   0  71   0  82]
 [  1  27   4   0 134   0  44]
 [ 19  10  20   0  38   0 123]
 [  3   7   6   0  29   0 165]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.46      0.25      0.33       210
           2       0.52      0.51      0.52       210
           3       0.65      0.67      0.66       210
           4       0.00      0.00      0.00       210
           5       0.33      0.64      0.43       210
           6       0.00      0.00      0.00       210
           7       0.32      0.79      0.45       210

    accuracy                           0.41      1470
   macro avg       0.32      0.41      0.34      1470
weighted avg       0.32      0.41      0.34      1470

Decision Tree with 4 max_depth

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.4204081632653061
Confusion Matrix of Decision Tree is:
 [[ 53   3   1  50  34  60   9]
 [ 14  88   8  34  31  29   6]
 [ 20   2 140  28   6  14   0]
 [  3   5  22  64  33  77   6]
 [  1  19   4  48  94  36   8]
 [ 19   4   8  28  16 106  29]
 [  3   6   3  20  10  95  73]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.47      0.25      0.33       210
           2       0.69      0.42      0.52       210
           3       0.75      0.67      0.71       210
           4       0.24      0.30      0.27       210
           5       0.42      0.45      0.43       210
           6       0.25      0.50      0.34       210
           7       0.56      0.35      0.43       210

    accuracy                           0.42      1470
   macro avg       0.48      0.42      0.43      1470
weighted avg       0.48      0.42      0.43      1470

Decision Tree with 5 max_dept

Accuracy of Decision Tree after Normalize Scaling is: 0.5095238095238095
Confusion Matrix of Decision Tree is:
 [[107   8   5  18  38  25   9]
 [ 14 122  11  11  37  11   4]
 [  9   6 163   9   4  15   4]
 [ 11  11   9  80  36  43  20]
 [ 31  27   4  28  93  13  14]
 [ 22  23   7  21  19  73  45]
 [ 15   8   5  12  12  47 111]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.51      0.51      0.51       210
           2       0.60      0.58      0.59       210
           3       0.80      0.78      0.79       210
           4       0.45      0.38      0.41       210
           5       0.39      0.44      0.41       210
           6       0.32      0.35      0.33       210
           7       0.54      0.53      0.53       210

    accuracy                           0.51      1470
   macro avg       0.51      0.51      0.51      1470
weighted avg       0.51      0.51      0.51      1470

Decision Tree with 13 max_dep

Accuracy of Decision Tree after Normalize Scaling is: 0.5163265306122449
Confusion Matrix of Decision Tree is:
 [[117   6   1  15  35  28   8]
 [ 11 125  11  12  39   6   6]
 [  7   5 166  10   5  15   2]
 [ 12  16  13  89  25  40  15]
 [ 26  25   5  28  92  20  14]
 [ 23  19  10  26  24  65  43]
 [ 13  12   4  15  23  38 105]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.56      0.56      0.56       210
           2       0.60      0.60      0.60       210
           3       0.79      0.79      0.79       210
           4       0.46      0.42      0.44       210
           5       0.38      0.44      0.41       210
           6       0.31      0.31      0.31       210
           7       0.54      0.50      0.52       210

    accuracy                           0.52      1470
   macro avg       0.52      0.52      0.52      1470
weighted avg       0.52      0.52      0.52      1470

Random Forest with 1 max_dept

Accuracy of Random Forest after Normalize Scaling is: 0.6748299319727891
Confusion Matrix of Random Forest is:
 [[119   2   0   7  43  21  18]
 [  0 148   7  11  29  11   4]
 [  3   2 177  10   4  14   0]
 [  4   4   3 145  13  17  24]
 [ 17  16   1  20 121  13  22]
 [  7   9   5  12  10  95  72]
 [  0   1   0   0   4  18 187]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.79      0.57      0.66       210
           2       0.81      0.70      0.76       210
           3       0.92      0.84      0.88       210
           4       0.71      0.69      0.70       210
           5       0.54      0.58      0.56       210
           6       0.50      0.45      0.48       210
           7       0.57      0.89      0.70       210

    accuracy                           0.67      1470
   macro avg       0.69      0.67      0.67      1470
weighted avg       0.69      0.67      0.67      1470

Random Forest with 9 max_dept

Accuracy of Random Forest after Normalize Scaling is: 0.6863945578231293
Confusion Matrix of Random Forest is:
 [[123   2   0   8  39  30   8]
 [  0 160   6  12  24   6   2]
 [  4   3 181  10   2   9   1]
 [  5   6   3 141  21  19  15]
 [ 19  18   0  15 129  13  16]
 [  8   8   2  20  10 103  59]
 [  0   1   0   2   4  31 172]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.77      0.59      0.67       210
           2       0.81      0.76      0.78       210
           3       0.94      0.86      0.90       210
           4       0.68      0.67      0.67       210
           5       0.56      0.61      0.59       210
           6       0.49      0.49      0.49       210
           7       0.63      0.82      0.71       210

    accuracy                           0.69      1470
   macro avg       0.70      0.69      0.69      1470
weighted avg       0.70      0.69      0.69      1470

Random Forest with 17 max_dep

In [12]:
# XLM vectorized data
# Benchmark every classical classifier on the XLM sentence-embedding features.
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//xlm_vectorized_kabita_dataset.csv")

# Stratified train/test split followed by Normalizer scaling (see normalize_scaling).
x_train,x_test,y_train,y_test = normalize_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=2000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model: sweep the neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one run per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier: depth-limited trees (1..20), then an unrestricted tree
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# Random Forest: depth-limited forests (1..20), then an unrestricted forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# scaling using MinMax scaler
# MultinomialNB needs non-negative features, so rescale into [0, 10].
mms_scale=MinMaxScaler(feature_range=(0,10))
# Learn the per-feature min/max from the training split only ...
m_train=mms_scale.fit_transform(x_train)
# ... and reuse those parameters on the test split. Re-fitting on the test data
# would map train and test with different scales and leak test statistics.
# Test values below the training minimum would come out negative, so clamp at 0.
m_test=np.maximum(mms_scale.transform(x_test), 0)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_train,m_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after Normalize Scaling is: 0.7380952380952381
Confusion Matrix of Logistic Regression is:
 [[149   0   0   9  28  21   3]
 [  0 167   9  13  10  11   0]
 [  0   7 184   5   4   9   1]
 [  3   8  15 143  12  23   6]
 [ 27  21   2   9 145   3   3]
 [  5   8   4  17   4 117  55]
 [  0   2   0   5   2  21 180]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.81      0.71      0.76       210
           2       0.78      0.80      0.79       210
           3       0.86      0.88      0.87       210
           4       0.71      0.68      0.70       210
           5       0.71      0.69      0.70       210
           6       0.57      0.56      0.56       210
           7       0.73      0.86      0.79       210

    accuracy                           0.74      1470
   macro avg       0.74      0.74      0.74      1470
weighted avg       0.74      0.74      0.74      1470

KNN with 3 

Accuracy of SVM after Normalize Scaling is: 0.7653061224489796
Confusion Matrix of SVM is:
 [[153   0   0   6  29  19   3]
 [  1 169   7  13  11   9   0]
 [  0   6 187   2   4  11   0]
 [  2   9  15 150  10  21   3]
 [ 21  20   3   7 154   3   2]
 [  3   7   3  17   3 126  51]
 [  0   1   0   2   1  20 186]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.85      0.73      0.78       210
           2       0.80      0.80      0.80       210
           3       0.87      0.89      0.88       210
           4       0.76      0.71      0.74       210
           5       0.73      0.73      0.73       210
           6       0.60      0.60      0.60       210
           7       0.76      0.89      0.82       210

    accuracy                           0.77      1470
   macro avg       0.77      0.77      0.76      1470
weighted avg       0.77      0.77      0.76      1470

Working on SVM Kernal: poly
Accuracy of SVM after Normalize

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.34965986394557824
Confusion Matrix of Decision Tree is:
 [[  0   0   5   0 172   0  33]
 [  0   0  10   0 174   0  26]
 [  0   0 137   0  56   0  17]
 [  0   0  15   0 148   0  47]
 [  0   0   3   0 189   0  18]
 [  0   0   9   0  49   0 152]
 [  0   0   2   0  20   0 188]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       210
           2       0.00      0.00      0.00       210
           3       0.76      0.65      0.70       210
           4       0.00      0.00      0.00       210
           5       0.23      0.90      0.37       210
           6       0.00      0.00      0.00       210
           7       0.39      0.90      0.54       210

    accuracy                           0.35      1470
   macro avg       0.20      0.35      0.23      1470
weighted avg       0.20      0.35      0.23      1470

Decision Tree with 3 max_dep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.4775510204081633
Confusion Matrix of Decision Tree is:
 [[162  10   3   2   0  24   9]
 [ 15 159   9   1   0  18   8]
 [ 15  41 137   0   0  16   1]
 [ 58  90   7   8   0  40   7]
 [124  65   1   2   0   6  12]
 [ 23  26   9   0   0  91  61]
 [ 14   6   2   0   0  43 145]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.39      0.77      0.52       210
           2       0.40      0.76      0.52       210
           3       0.82      0.65      0.72       210
           4       0.62      0.04      0.07       210
           5       0.00      0.00      0.00       210
           6       0.38      0.43      0.41       210
           7       0.60      0.69      0.64       210

    accuracy                           0.48      1470
   macro avg       0.46      0.48      0.41      1470
weighted avg       0.46      0.48      0.41      1470

Decision Tree with 4 max_dept

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.5231292517006803
Confusion Matrix of Decision Tree is:
 [[ 87   1   0  10  76  32   4]
 [  2 128   5  32  13  28   2]
 [  2  11 127  32  13  25   0]
 [  2  26   0  72  56  48   6]
 [ 13  29   0  36 113  11   8]
 [  1   9   1  17  22 118  42]
 [  0   0   1   6  14  65 124]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.81      0.41      0.55       210
           2       0.63      0.61      0.62       210
           3       0.95      0.60      0.74       210
           4       0.35      0.34      0.35       210
           5       0.37      0.54      0.44       210
           6       0.36      0.56      0.44       210
           7       0.67      0.59      0.63       210

    accuracy                           0.52      1470
   macro avg       0.59      0.52      0.54      1470
weighted avg       0.59      0.52      0.54      1470

Decision Tree with 5 max_dept

Accuracy of Decision Tree after Normalize Scaling is: 0.5748299319727891
Confusion Matrix of Decision Tree is:
 [[113   8   2  17  50  16   4]
 [  5 140  13  16  21   9   6]
 [  4  13 173  11   2   5   2]
 [ 12  17   8 102  32  26  13]
 [ 26  31   7  33 102   5   6]
 [ 16  13   7  23  17  85  49]
 [  9   4   1  12  13  41 130]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.61      0.54      0.57       210
           2       0.62      0.67      0.64       210
           3       0.82      0.82      0.82       210
           4       0.48      0.49      0.48       210
           5       0.43      0.49      0.46       210
           6       0.45      0.40      0.43       210
           7       0.62      0.62      0.62       210

    accuracy                           0.57      1470
   macro avg       0.58      0.57      0.57      1470
weighted avg       0.58      0.57      0.57      1470

Decision Tree with 13 max_dep

Accuracy of Decision Tree after Normalize Scaling is: 0.5741496598639456
Confusion Matrix of Decision Tree is:
 [[120   9   4  12  40  19   6]
 [  7 133  12  20  23  11   4]
 [  4  14 174   9   1   6   2]
 [ 11  17  13 105  27  26  11]
 [ 35  35   4  27  96   9   4]
 [ 12  17   5  23   6  87  60]
 [  6   5   1  10   9  50 129]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.62      0.57      0.59       210
           2       0.58      0.63      0.60       210
           3       0.82      0.83      0.82       210
           4       0.51      0.50      0.50       210
           5       0.48      0.46      0.47       210
           6       0.42      0.41      0.42       210
           7       0.60      0.61      0.61       210

    accuracy                           0.57      1470
   macro avg       0.57      0.57      0.57      1470
weighted avg       0.57      0.57      0.57      1470

Random Forest with 1 max_dept

Accuracy of Random Forest after Normalize Scaling is: 0.7027210884353742
Confusion Matrix of Random Forest is:
 [[128   0   0   5  42  22  13]
 [  1 153   5  18  18  14   1]
 [  0   4 177  10   6  13   0]
 [  1   5   2 132  23  32  15]
 [ 18  23   0  11 147   3   8]
 [  2   4   1  13   8 109  73]
 [  0   0   0   2   2  19 187]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.85      0.61      0.71       210
           2       0.81      0.73      0.77       210
           3       0.96      0.84      0.90       210
           4       0.69      0.63      0.66       210
           5       0.60      0.70      0.64       210
           6       0.51      0.52      0.52       210
           7       0.63      0.89      0.74       210

    accuracy                           0.70      1470
   macro avg       0.72      0.70      0.70      1470
weighted avg       0.72      0.70      0.70      1470

Random Forest with 9 max_dept

Accuracy of Random Forest after Normalize Scaling is: 0.7319727891156462
Confusion Matrix of Random Forest is:
 [[139   0   0   9  32  24   6]
 [  1 161   6  19  16   7   0]
 [  0   3 186  10   2   8   1]
 [  4   4   1 146  16  29  10]
 [ 19  21   0  14 145   6   5]
 [  3   4   2  17   5 122  57]
 [  0   0   0   0   1  32 177]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.84      0.66      0.74       210
           2       0.83      0.77      0.80       210
           3       0.95      0.89      0.92       210
           4       0.68      0.70      0.69       210
           5       0.67      0.69      0.68       210
           6       0.54      0.58      0.56       210
           7       0.69      0.84      0.76       210

    accuracy                           0.73      1470
   macro avg       0.74      0.73      0.73      1470
weighted avg       0.74      0.73      0.73      1470

Random Forest with 17 max_dep

### Fine Tuned Transformers Models

In [13]:
# BERT vectorized data
# Benchmark every classical classifier on the fine-tuned BERT-base features.
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//bert_base_finetuned_vectorized_kabita_dataset.csv")

# Stratified train/test split followed by Normalizer scaling (see normalize_scaling).
x_train,x_test,y_train,y_test = normalize_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=2000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model: sweep the neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one run per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier: depth-limited trees (1..20), then an unrestricted tree
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# Random Forest: depth-limited forests (1..20), then an unrestricted forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# scaling using MinMax scaler
# MultinomialNB needs non-negative features, so rescale into [0, 10].
mms_scale=MinMaxScaler(feature_range=(0,10))
# Learn the per-feature min/max from the training split only ...
m_train=mms_scale.fit_transform(x_train)
# ... and reuse those parameters on the test split. Re-fitting on the test data
# would map train and test with different scales and leak test statistics.
# Test values below the training minimum would come out negative, so clamp at 0.
m_test=np.maximum(mms_scale.transform(x_test), 0)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_train,m_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after Normalize Scaling is: 0.6462585034013606
Confusion Matrix of Logistic Regression is:
 [[129   6   0  10  42  12  11]
 [  2 157  15   8  19   5   4]
 [  4  14 171   6   5   8   2]
 [  8   8  16 105  24  33  16]
 [ 21  17   3   9 144   2  14]
 [  8  11  12  19  10  82  68]
 [  4   3   0   4   5  32 162]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.73      0.61      0.67       210
           2       0.73      0.75      0.74       210
           3       0.79      0.81      0.80       210
           4       0.65      0.50      0.57       210
           5       0.58      0.69      0.63       210
           6       0.47      0.39      0.43       210
           7       0.58      0.77      0.67       210

    accuracy                           0.65      1470
   macro avg       0.65      0.65      0.64      1470
weighted avg       0.65      0.65      0.64      1470

KNN with 3 

Accuracy of Bernoulli Naive Bayes after Normalize Scaling is: 0.46530612244897956
Confusion Matrix of Bernoulli Naive Bayes is:
 [[ 75   7   5  10  61  13  39]
 [  1 134   8  13  43   4   7]
 [  5  47 135   5   5  10   3]
 [  5  27   9  52  49  17  51]
 [ 10  25   1  10 125   2  37]
 [  6  29   9  27  26  35  78]
 [  4   9   1  16  27  25 128]]
Classification Report of Bernoulli Naive Bayes is:
               precision    recall  f1-score   support

           1       0.71      0.36      0.47       210
           2       0.48      0.64      0.55       210
           3       0.80      0.64      0.71       210
           4       0.39      0.25      0.30       210
           5       0.37      0.60      0.46       210
           6       0.33      0.17      0.22       210
           7       0.37      0.61      0.46       210

    accuracy                           0.47      1470
   macro avg       0.49      0.47      0.45      1470
weighted avg       0.49      0.47      0.45      1470

Work

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.2979591836734694
Confusion Matrix of Decision Tree is:
 [[ 13 115   0   0   0   0  82]
 [  2 170   9   0   0   0  29]
 [  0  68 113   0   0   0  29]
 [  0 135   3   0   0   0  72]
 [  1 153   1   0   0   0  55]
 [  4 102   6   0   0   0  98]
 [  1  67   0   0   0   0 142]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.62      0.06      0.11       210
           2       0.21      0.81      0.33       210
           3       0.86      0.54      0.66       210
           4       0.00      0.00      0.00       210
           5       0.00      0.00      0.00       210
           6       0.00      0.00      0.00       210
           7       0.28      0.68      0.40       210

    accuracy                           0.30      1470
   macro avg       0.28      0.30      0.21      1470
weighted avg       0.28      0.30      0.21      1470

Decision Tree with 3 max_dept

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.3333333333333333
Confusion Matrix of Decision Tree is:
 [[ 68   7   0   0 110   0  25]
 [ 21  62   4   0 115   0   8]
 [ 25  10 112   0  59   0   4]
 [ 39  10   3   0 125   0  33]
 [ 25  14   0   0 141   0  30]
 [ 37   8   4   0 100   0  61]
 [ 35   2   0   0  66   0 107]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.27      0.32      0.30       210
           2       0.55      0.30      0.38       210
           3       0.91      0.53      0.67       210
           4       0.00      0.00      0.00       210
           5       0.20      0.67      0.30       210
           6       0.00      0.00      0.00       210
           7       0.40      0.51      0.45       210

    accuracy                           0.33      1470
   macro avg       0.33      0.33      0.30      1470
weighted avg       0.33      0.33      0.30      1470

Decision Tree with 4 max_dept

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.35034013605442177
Confusion Matrix of Decision Tree is:
 [[ 68   7   0  83  27   0  25]
 [ 21  62   2  99  16   2   8]
 [ 12  10 123  56   3   2   4]
 [ 39  10   3 111  14   0  33]
 [ 25  14   0  99  42   0  30]
 [ 37   5   4  96   5   2  61]
 [ 35   1   0  60   6   1 107]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.29      0.32      0.30       210
           2       0.57      0.30      0.39       210
           3       0.93      0.59      0.72       210
           4       0.18      0.53      0.27       210
           5       0.37      0.20      0.26       210
           6       0.29      0.01      0.02       210
           7       0.40      0.51      0.45       210

    accuracy                           0.35      1470
   macro avg       0.43      0.35      0.34      1470
weighted avg       0.43      0.35      0.34      1470

Decision Tree with 5 max_dep

Accuracy of Decision Tree after Normalize Scaling is: 0.42925170068027213
Confusion Matrix of Decision Tree is:
 [[ 79  14   3  32  37  28  17]
 [ 16 100   6  37  27  14  10]
 [ 12  19 145  12  11  10   1]
 [ 21  16   7  82  29  37  18]
 [ 28  16   4  39  91  19  13]
 [ 16  22   6  42  30  56  38]
 [ 18  14   1  34  19  46  78]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.42      0.38      0.40       210
           2       0.50      0.48      0.49       210
           3       0.84      0.69      0.76       210
           4       0.29      0.39      0.34       210
           5       0.37      0.43      0.40       210
           6       0.27      0.27      0.27       210
           7       0.45      0.37      0.41       210

    accuracy                           0.43      1470
   macro avg       0.45      0.43      0.44      1470
weighted avg       0.45      0.43      0.44      1470

Decision Tree with 13 max_de

Accuracy of Decision Tree after Normalize Scaling is: 0.4204081632653061
Confusion Matrix of Decision Tree is:
 [[ 79  25   7  26  32  24  17]
 [ 17 102   8  29  24  17  13]
 [ 11  12 160   9   7   8   3]
 [ 26  17  14  61  30  39  23]
 [ 26  27  12  29  75  22  19]
 [ 25  19  10  38  29  59  30]
 [ 22  10   6  28  26  36  82]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.38      0.38      0.38       210
           2       0.48      0.49      0.48       210
           3       0.74      0.76      0.75       210
           4       0.28      0.29      0.28       210
           5       0.34      0.36      0.35       210
           6       0.29      0.28      0.28       210
           7       0.44      0.39      0.41       210

    accuracy                           0.42      1470
   macro avg       0.42      0.42      0.42      1470
weighted avg       0.42      0.42      0.42      1470

Random Forest with 1 max_dept

Accuracy of Random Forest after Normalize Scaling is: 0.5537414965986395
Confusion Matrix of Random Forest is:
 [[ 91   2   0  11  64  14  28]
 [  1 134   4  19  38   7   7]
 [  1  21 164   7   7   5   5]
 [  5  14   6  70  39  35  41]
 [ 10  18   2  11 135   2  32]
 [  5  19   2  24  28  55  77]
 [  2   7   0   6  14  16 165]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.79      0.43      0.56       210
           2       0.62      0.64      0.63       210
           3       0.92      0.78      0.85       210
           4       0.47      0.33      0.39       210
           5       0.42      0.64      0.50       210
           6       0.41      0.26      0.32       210
           7       0.46      0.79      0.58       210

    accuracy                           0.55      1470
   macro avg       0.59      0.55      0.55      1470
weighted avg       0.59      0.55      0.55      1470

Random Forest with 9 max_dept

Accuracy of Random Forest after Normalize Scaling is: 0.5693877551020409
Confusion Matrix of Random Forest is:
 [[ 97   3   0  13  53  24  20]
 [  4 133   4  10  36  15   8]
 [  5  15 163   7  10  10   0]
 [  6  12   3  90  35  32  32]
 [  9  22   0  11 135   5  28]
 [  9  14   3  24  18  72  70]
 [  1   3   0  11  15  33 147]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.74      0.46      0.57       210
           2       0.66      0.63      0.65       210
           3       0.94      0.78      0.85       210
           4       0.54      0.43      0.48       210
           5       0.45      0.64      0.53       210
           6       0.38      0.34      0.36       210
           7       0.48      0.70      0.57       210

    accuracy                           0.57      1470
   macro avg       0.60      0.57      0.57      1470
weighted avg       0.60      0.57      0.57      1470

Random Forest with 17 max_dep

In [14]:
# Hinglish BERT vectorized data
# Benchmark every classical classifier on the fine-tuned Hinglish BERT features.
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//vbert_hinglish_finetuned_vectorized_kabita_dataset.csv")

# Stratified train/test split followed by Normalizer scaling (see normalize_scaling).
x_train,x_test,y_train,y_test = normalize_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=2000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model: sweep the neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one run per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier: depth-limited trees (1..20), then an unrestricted tree
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# Random Forest: depth-limited forests (1..20), then an unrestricted forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# scaling using MinMax scaler
# MultinomialNB needs non-negative features, so rescale into [0, 10].
mms_scale=MinMaxScaler(feature_range=(0,10))
# Learn the per-feature min/max from the training split only ...
m_train=mms_scale.fit_transform(x_train)
# ... and reuse those parameters on the test split. Re-fitting on the test data
# would map train and test with different scales and leak test statistics.
# Test values below the training minimum would come out negative, so clamp at 0.
m_test=np.maximum(mms_scale.transform(x_test), 0)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_train,m_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after Normalize Scaling is: 0.6666666666666666
Confusion Matrix of Logistic Regression is:
 [[107  11   9  14  50  16   3]
 [  2 159   4  11  21   9   4]
 [ 10   5 174  10   8   3   0]
 [ 16   4  11 129  27  19   4]
 [ 13  10   8  13 148   5  13]
 [ 16  15  11  15  11  92  50]
 [  1   4   1   1   4  28 171]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.65      0.51      0.57       210
           2       0.76      0.76      0.76       210
           3       0.80      0.83      0.81       210
           4       0.67      0.61      0.64       210
           5       0.55      0.70      0.62       210
           6       0.53      0.44      0.48       210
           7       0.70      0.81      0.75       210

    accuracy                           0.67      1470
   macro avg       0.67      0.67      0.66      1470
weighted avg       0.67      0.67      0.66      1470

KNN with 3 

Accuracy of Bernoulli Naive Bayes after Normalize Scaling is: 0.463265306122449
Confusion Matrix of Bernoulli Naive Bayes is:
 [[ 46  25   3  13  81  34   8]
 [  6 112   0   5  53  21  13]
 [ 10   8  96  13  65  18   0]
 [ 18   7   6  37 101  31  10]
 [  6  18   0   3 153  13  17]
 [ 12  18   6  13  13  65  83]
 [  1   7   0   1   2  27 172]]
Classification Report of Bernoulli Naive Bayes is:
               precision    recall  f1-score   support

           1       0.46      0.22      0.30       210
           2       0.57      0.53      0.55       210
           3       0.86      0.46      0.60       210
           4       0.44      0.18      0.25       210
           5       0.33      0.73      0.45       210
           6       0.31      0.31      0.31       210
           7       0.57      0.82      0.67       210

    accuracy                           0.46      1470
   macro avg       0.51      0.46      0.45      1470
weighted avg       0.51      0.46      0.45      1470

Workin

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.3360544217687075
Confusion Matrix of Decision Tree is:
 [[  7   0   2   0 189   0  12]
 [  3   0   6   0 173   0  28]
 [  0   0 130   0  78   0   2]
 [  0   0   9   0 186   0  15]
 [  0   0   0   0 191   0  19]
 [  1   0   7   0 101   0 101]
 [  0   0   0   0  44   0 166]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.64      0.03      0.06       210
           2       0.00      0.00      0.00       210
           3       0.84      0.62      0.71       210
           4       0.00      0.00      0.00       210
           5       0.20      0.91      0.33       210
           6       0.00      0.00      0.00       210
           7       0.48      0.79      0.60       210

    accuracy                           0.34      1470
   macro avg       0.31      0.34      0.24      1470
weighted avg       0.31      0.34      0.24      1470

Decision Tree with 3 max_dept

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.4
Confusion Matrix of Decision Tree is:
 [[  7 107   2   0  82   6   6]
 [  0 151   6   0  25  21   7]
 [  0  50 130   0  28   2   0]
 [  0  85   9   0 101  13   2]
 [  0  60   0   0 131   5  14]
 [  1  87   7   0  14  50  51]
 [  0  40   0   0   4  47 119]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.88      0.03      0.06       210
           2       0.26      0.72      0.38       210
           3       0.84      0.62      0.71       210
           4       0.00      0.00      0.00       210
           5       0.34      0.62      0.44       210
           6       0.35      0.24      0.28       210
           7       0.60      0.57      0.58       210

    accuracy                           0.40      1470
   macro avg       0.47      0.40      0.35      1470
weighted avg       0.47      0.40      0.35      1470

Decision Tree with 4 max_depth


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.4170068027210884
Confusion Matrix of Decision Tree is:
 [[104  11   0  47  35   8   5]
 [ 88  78   6   7  18   8   5]
 [ 48   3 127  22   6   4   0]
 [ 77  10   4  67  34  16   2]
 [ 54   6   0  48  83  10   9]
 [ 84  17   4   7   7  44  47]
 [ 39  10   0   2   2  47 110]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.21      0.50      0.30       210
           2       0.58      0.37      0.45       210
           3       0.90      0.60      0.72       210
           4       0.34      0.32      0.33       210
           5       0.45      0.40      0.42       210
           6       0.32      0.21      0.25       210
           7       0.62      0.52      0.57       210

    accuracy                           0.42      1470
   macro avg       0.49      0.42      0.43      1470
weighted avg       0.49      0.42      0.43      1470

Decision Tree with 5 max_dept

Accuracy of Decision Tree after Normalize Scaling is: 0.49183673469387756
Confusion Matrix of Decision Tree is:
 [[ 76   8   7  36  41  28  14]
 [ 12 115  10  22  27  16   8]
 [ 11   5 167  14   5   6   2]
 [ 36   8  10  88  35  28   5]
 [ 25  10   8  33 104  18  12]
 [ 27  22  10  20  12  55  64]
 [ 25  13   2   7   6  39 118]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.36      0.36      0.36       210
           2       0.64      0.55      0.59       210
           3       0.78      0.80      0.79       210
           4       0.40      0.42      0.41       210
           5       0.45      0.50      0.47       210
           6       0.29      0.26      0.28       210
           7       0.53      0.56      0.55       210

    accuracy                           0.49      1470
   macro avg       0.49      0.49      0.49      1470
weighted avg       0.49      0.49      0.49      1470

Decision Tree with 13 max_de

Accuracy of Decision Tree after Normalize Scaling is: 0.49115646258503404
Confusion Matrix of Decision Tree is:
 [[ 81  14   9  29  36  24  17]
 [ 15 122  12  13  26  15   7]
 [ 12   6 172  10   3   5   2]
 [ 40  17   7  86  30  19  11]
 [ 28  18   6  36  87  19  16]
 [ 21  24  15  22  14  53  61]
 [ 16  11   4  12   9  37 121]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.38      0.39      0.38       210
           2       0.58      0.58      0.58       210
           3       0.76      0.82      0.79       210
           4       0.41      0.41      0.41       210
           5       0.42      0.41      0.42       210
           6       0.31      0.25      0.28       210
           7       0.51      0.58      0.54       210

    accuracy                           0.49      1470
   macro avg       0.48      0.49      0.49      1470
weighted avg       0.48      0.49      0.49      1470

Random Forest with 1 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.6074829931972789
Confusion Matrix of Random Forest is:
 [[ 89  14   0  21  55  24   7]
 [  0 144   3  14  22  19   8]
 [ 10   1 166  15   9   8   1]
 [ 14   3   0 104  47  35   7]
 [ 15  15   1  17 135   9  18]
 [ 14  20   4  13   6  76  77]
 [  3   3   0   1   2  22 179]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.61      0.42      0.50       210
           2       0.72      0.69      0.70       210
           3       0.95      0.79      0.86       210
           4       0.56      0.50      0.53       210
           5       0.49      0.64      0.56       210
           6       0.39      0.36      0.38       210
           7       0.60      0.85      0.71       210

    accuracy                           0.61      1470
   macro avg       0.62      0.61      0.60      1470
weighted avg       0.62      0.61      0.60      1470

Random Forest with 9 max_dept

Accuracy of Random Forest after Normalize Scaling is: 0.6129251700680272
Confusion Matrix of Random Forest is:
 [[ 94  13   0  20  52  23   8]
 [  3 150   4  13  18  15   7]
 [  7   3 168  12   5  14   1]
 [ 18   6   4 114  38  23   7]
 [ 13  12   3  24 128  14  16]
 [ 16  17   3  16  10  77  71]
 [  2   5   0   0   3  30 170]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.61      0.45      0.52       210
           2       0.73      0.71      0.72       210
           3       0.92      0.80      0.86       210
           4       0.57      0.54      0.56       210
           5       0.50      0.61      0.55       210
           6       0.39      0.37      0.38       210
           7       0.61      0.81      0.69       210

    accuracy                           0.61      1470
   macro avg       0.62      0.61      0.61      1470
weighted avg       0.62      0.61      0.61      1470

Random Forest with 17 max_dep

In [15]:
# GPT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//gpt_base_finetuned_vectorized_kabita_dataset.csv")

# Stratified train/test split followed by Normalizer scaling (see normalize_scaling).
x_train,x_test,y_train,y_test = normalize_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=2000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model: one fit/report per neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one fit/report per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier: sweep max_depth 1..20, then an unrestricted tree
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# Random Forest: sweep max_depth 1..20, then an unrestricted forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# scaling using MinMax scaler
# Multinomial Naive Bayes expects non-negative features, so rescale to [0, 10].
# The scaler is fitted on the training split only and then reused for the test
# split; re-fitting it on x_test would scale the two splits with different
# min/max values and let test-set statistics shape the test features.
mms_scale=MinMaxScaler(feature_range=(0,10))
m_train=mms_scale.fit_transform(x_train)
# Test values beyond the training min/max would fall outside the target range
# (possibly below zero), so clip them back into it.
m_test=np.clip(mms_scale.transform(x_test), *mms_scale.feature_range)
# Print numpy arrays with 3 decimal places.
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_train,m_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after Normalize Scaling is: 0.3707482993197279
Confusion Matrix of Logistic Regression is:
 [[ 83   2  20   1  54   8  42]
 [ 21  78  18   1  57  10  25]
 [ 19  34 120   0  15  10  12]
 [ 23  12  10   6  79  10  70]
 [ 31   4   1   1 107   3  63]
 [ 20  10  10   2  46  12 110]
 [  1   7   0   1  56   6 139]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.42      0.40      0.41       210
           2       0.53      0.37      0.44       210
           3       0.67      0.57      0.62       210
           4       0.50      0.03      0.05       210
           5       0.26      0.51      0.34       210
           6       0.20      0.06      0.09       210
           7       0.30      0.66      0.41       210

    accuracy                           0.37      1470
   macro avg       0.41      0.37      0.34      1470
weighted avg       0.41      0.37      0.34      1470

KNN with 3 

Confusion Matrix of Bernoulli Naive Bayes is:
 [[ 85   5  12   9  46  17  36]
 [  1 148   6   6  32   7  10]
 [  5  17 155  16   4   9   4]
 [  7  19  16  75  23  15  55]
 [ 27  20   0   9 108   2  44]
 [ 15  26  12  27  13  32  85]
 [  0  16   0  23  10  30 131]]
Classification Report of Bernoulli Naive Bayes is:
               precision    recall  f1-score   support

           1       0.61      0.40      0.49       210
           2       0.59      0.70      0.64       210
           3       0.77      0.74      0.75       210
           4       0.45      0.36      0.40       210
           5       0.46      0.51      0.48       210
           6       0.29      0.15      0.20       210
           7       0.36      0.62      0.46       210

    accuracy                           0.50      1470
   macro avg       0.50      0.50      0.49      1470
weighted avg       0.50      0.50      0.49      1470

Working on SVM Kernal: linear
Accuracy of SVM after Normalize Scaling is: 0.3013605442

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.32108843537414966
Confusion Matrix of Decision Tree is:
 [[ 12 168   1   0   0   0  29]
 [ 10 179   8   0   0   0  13]
 [  1  68 136   0   0   0   5]
 [  2 148   3   0   0   0  57]
 [  1 163   4   0   0   0  42]
 [  2 113   5   0   0   0  90]
 [  0  65   0   0   0   0 145]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.43      0.06      0.10       210
           2       0.20      0.85      0.32       210
           3       0.87      0.65      0.74       210
           4       0.00      0.00      0.00       210
           5       0.00      0.00      0.00       210
           6       0.00      0.00      0.00       210
           7       0.38      0.69      0.49       210

    accuracy                           0.32      1470
   macro avg       0.27      0.32      0.24      1470
weighted avg       0.27      0.32      0.24      1470

Decision Tree with 3 max_dep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.35918367346938773
Confusion Matrix of Decision Tree is:
 [[176   4   1   0   0  19  10]
 [116  77   4   0   0   9   4]
 [ 65   8 132   0   0   1   4]
 [141  10   2   0   0  31  26]
 [161   4   3   0   0  18  24]
 [106  10   4   0   0  28  62]
 [ 62   3   0   0   0  30 115]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.21      0.84      0.34       210
           2       0.66      0.37      0.47       210
           3       0.90      0.63      0.74       210
           4       0.00      0.00      0.00       210
           5       0.00      0.00      0.00       210
           6       0.21      0.13      0.16       210
           7       0.47      0.55      0.51       210

    accuracy                           0.36      1470
   macro avg       0.35      0.36      0.32      1470
weighted avg       0.35      0.36      0.32      1470

Decision Tree with 4 max_dep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.39931972789115644
Confusion Matrix of Decision Tree is:
 [[ 98   3   1  79   0  18  11]
 [ 12  77   4 104   0   8   5]
 [ 25   8 132  40   0   1   4]
 [  6   8   2 137   0  31  26]
 [ 50   4   2 111   0  17  26]
 [  9   7   4  99   0  24  67]
 [  1   3   0  61   0  26 119]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.49      0.47      0.48       210
           2       0.70      0.37      0.48       210
           3       0.91      0.63      0.74       210
           4       0.22      0.65      0.33       210
           5       0.00      0.00      0.00       210
           6       0.19      0.11      0.14       210
           7       0.46      0.57      0.51       210

    accuracy                           0.40      1470
   macro avg       0.42      0.40      0.38      1470
weighted avg       0.42      0.40      0.38      1470

Decision Tree with 5 max_dep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.4380952380952381
Confusion Matrix of Decision Tree is:
 [[ 93   2   6  48  37  16   8]
 [  5  71  13  53  55   8   5]
 [  3   5 153  34  10   2   3]
 [  4   4   4 114  39  26  19]
 [ 41   3  11  49  68  19  19]
 [  6   5   7  82  20  37  53]
 [  1   3   0  38  28  32 108]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.61      0.44      0.51       210
           2       0.76      0.34      0.47       210
           3       0.79      0.73      0.76       210
           4       0.27      0.54      0.36       210
           5       0.26      0.32      0.29       210
           6       0.26      0.18      0.21       210
           7       0.50      0.51      0.51       210

    accuracy                           0.44      1470
   macro avg       0.49      0.44      0.44      1470
weighted avg       0.49      0.44      0.44      1470

Decision Tree with 6 max_dept

Accuracy of Decision Tree after Normalize Scaling is: 0.4965986394557823
Confusion Matrix of Decision Tree is:
 [[ 97   8   7  16  44  29   9]
 [  6 116   8  20  28  22  10]
 [  0  14 163  10   8  11   4]
 [ 19  12   5  89  28  35  22]
 [ 33  17   5  23  90  20  22]
 [ 17  19  14  20  18  68  54]
 [  9  11   4  19  14  46 107]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.54      0.46      0.50       210
           2       0.59      0.55      0.57       210
           3       0.79      0.78      0.78       210
           4       0.45      0.42      0.44       210
           5       0.39      0.43      0.41       210
           6       0.29      0.32      0.31       210
           7       0.47      0.51      0.49       210

    accuracy                           0.50      1470
   macro avg       0.50      0.50      0.50      1470
weighted avg       0.50      0.50      0.50      1470

Decision Tree with 14 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.3
Confusion Matrix of Random Forest is:
 [[ 25   3  30   1   7   0 144]
 [  2  20  84   4   8   0  92]
 [  0   5 175   3   5   0  22]
 [  2   5  26   5   2   0 170]
 [  5   5  10   0   9   0 181]
 [  2   5  19   2   2   0 180]
 [  1   2   0   0   0   0 207]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.68      0.12      0.20       210
           2       0.44      0.10      0.16       210
           3       0.51      0.83      0.63       210
           4       0.33      0.02      0.04       210
           5       0.27      0.04      0.07       210
           6       0.00      0.00      0.00       210
           7       0.21      0.99      0.34       210

    accuracy                           0.30      1470
   macro avg       0.35      0.30      0.21      1470
weighted avg       0.35      0.30      0.21      1470

Random Forest with 2 max_depth


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Random Forest after Normalize Scaling is: 0.46802721088435373
Confusion Matrix of Random Forest is:
 [[ 60  12   0   2  86   1  49]
 [  2 123  12   2  56   0  15]
 [  4  14 167   2  14   0   9]
 [  3  19  19  21  64   0  84]
 [  9  15   1   1 133   0  51]
 [  2  13  16   8  43   0 128]
 [  0   5   0   6  14   1 184]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.75      0.29      0.41       210
           2       0.61      0.59      0.60       210
           3       0.78      0.80      0.79       210
           4       0.50      0.10      0.17       210
           5       0.32      0.63      0.43       210
           6       0.00      0.00      0.00       210
           7       0.35      0.88      0.50       210

    accuracy                           0.47      1470
   macro avg       0.47      0.47      0.41      1470
weighted avg       0.47      0.47      0.41      1470

Random Forest with 3 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.6496598639455783
Confusion Matrix of Random Forest is:
 [[113   1   1  14  41  20  20]
 [  0 148   5  11  32   9   5]
 [  3   5 176   5   6  15   0]
 [  8   7   7 125  15  33  15]
 [ 20  11   0  15 131   3  30]
 [  4  12   3  16  12  82  81]
 [  0   2   0   2   5  21 180]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.76      0.54      0.63       210
           2       0.80      0.70      0.75       210
           3       0.92      0.84      0.88       210
           4       0.66      0.60      0.63       210
           5       0.54      0.62      0.58       210
           6       0.45      0.39      0.42       210
           7       0.54      0.86      0.67       210

    accuracy                           0.65      1470
   macro avg       0.67      0.65      0.65      1470
weighted avg       0.67      0.65      0.65      1470

Random Forest with 11 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.6632653061224489
Confusion Matrix of Random Forest is:
 [[115   1   0   8  48  23  15]
 [  1 163   3   8  22  10   3]
 [  4   8 180   6   4   8   0]
 [  8   6   7 132  17  30  10]
 [ 27  13   0  19 122  11  18]
 [  8  13   4  17   6  89  73]
 [  0   3   0   7   6  20 174]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.71      0.55      0.62       210
           2       0.79      0.78      0.78       210
           3       0.93      0.86      0.89       210
           4       0.67      0.63      0.65       210
           5       0.54      0.58      0.56       210
           6       0.47      0.42      0.44       210
           7       0.59      0.83      0.69       210

    accuracy                           0.66      1470
   macro avg       0.67      0.66      0.66      1470
weighted avg       0.67      0.66      0.66      1470

Random Forest with 19 max_dep

In [16]:
# Hinglish GPT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//gpt_hinglish_finetuned_vectorized_kabita_dataset.csv")

# Stratified train/test split followed by Normalizer scaling (see normalize_scaling).
x_train,x_test,y_train,y_test = normalize_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=2000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model: one fit/report per neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one fit/report per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier: sweep max_depth 1..20, then an unrestricted tree
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# Random Forest: sweep max_depth 1..20, then an unrestricted forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# scaling using MinMax scaler
# Multinomial Naive Bayes expects non-negative features, so rescale to [0, 10].
# The scaler is fitted on the training split only and then reused for the test
# split; re-fitting it on x_test would scale the two splits with different
# min/max values and let test-set statistics shape the test features.
mms_scale=MinMaxScaler(feature_range=(0,10))
m_train=mms_scale.fit_transform(x_train)
# Test values beyond the training min/max would fall outside the target range
# (possibly below zero), so clip them back into it.
m_test=np.clip(mms_scale.transform(x_test), *mms_scale.feature_range)
# Print numpy arrays with 3 decimal places.
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_train,m_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after Normalize Scaling is: 0.2938775510204082
Confusion Matrix of Logistic Regression is:
 [[ 36   0  63   2  80  25   4]
 [  2  17 103   8  70   8   2]
 [  1   6 188   2  11   2   0]
 [  5   3  98  10  73  11  10]
 [ 21  10  37   4 101  23  14]
 [  0   6  93   5  58  35  13]
 [  1   2  52   5  64  41  45]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.55      0.17      0.26       210
           2       0.39      0.08      0.13       210
           3       0.30      0.90      0.45       210
           4       0.28      0.05      0.08       210
           5       0.22      0.48      0.30       210
           6       0.24      0.17      0.20       210
           7       0.51      0.21      0.30       210

    accuracy                           0.29      1470
   macro avg       0.35      0.29      0.25      1470
weighted avg       0.35      0.29      0.25      1470

KNN with 3 

Accuracy of Bernoulli Naive Bayes after Normalize Scaling is: 0.5149659863945578
Confusion Matrix of Bernoulli Naive Bayes is:
 [[ 82   0  11   5  54  25  33]
 [  1 117   8   9  43  24   8]
 [  9   4 152  11   6  26   2]
 [  6   9  15  76  31  26  47]
 [ 18  14   1   8 119   9  41]
 [  7   7  11  25  20  44  96]
 [  0   2   0  15  12  14 167]]
Classification Report of Bernoulli Naive Bayes is:
               precision    recall  f1-score   support

           1       0.67      0.39      0.49       210
           2       0.76      0.56      0.64       210
           3       0.77      0.72      0.75       210
           4       0.51      0.36      0.42       210
           5       0.42      0.57      0.48       210
           6       0.26      0.21      0.23       210
           7       0.42      0.80      0.55       210

    accuracy                           0.51      1470
   macro avg       0.54      0.51      0.51      1470
weighted avg       0.54      0.51      0.51      1470

Worki

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of SVM after Normalize Scaling is: 0.24693877551020407
Confusion Matrix of SVM is:
 [[ 21  10  86   0  66  27   0]
 [  0  36 124   0  44   6   0]
 [  0   7 196   0   6   1   0]
 [  1  10 119  10  61   9   0]
 [  7  25  70   0  80  28   0]
 [  2  13 130   3  42  20   0]
 [  0  11 101   2  54  42   0]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.68      0.10      0.17       210
           2       0.32      0.17      0.22       210
           3       0.24      0.93      0.38       210
           4       0.67      0.05      0.09       210
           5       0.23      0.38      0.28       210
           6       0.15      0.10      0.12       210
           7       0.00      0.00      0.00       210

    accuracy                           0.25      1470
   macro avg       0.33      0.25      0.18      1470
weighted avg       0.33      0.25      0.18      1470

Working on SVM Kernal: sigmoid


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of SVM after Normalize Scaling is: 0.21768707482993196
Confusion Matrix of SVM is:
 [[ 17   1 130   1  60   1   0]
 [  2  19 147   0  40   2   0]
 [  1   4 201   0   4   0   0]
 [  6   7 138   8  51   0   0]
 [ 12   7 120   0  71   0   0]
 [  3   3 159   4  38   3   0]
 [  0   6 150   4  48   1   1]]
Classification Report of SVM is:
               precision    recall  f1-score   support

           1       0.41      0.08      0.14       210
           2       0.40      0.09      0.15       210
           3       0.19      0.96      0.32       210
           4       0.47      0.04      0.07       210
           5       0.23      0.34      0.27       210
           6       0.43      0.01      0.03       210
           7       1.00      0.00      0.01       210

    accuracy                           0.22      1470
   macro avg       0.45      0.22      0.14      1470
weighted avg       0.45      0.22      0.14      1470

Decision Tree with 1 max_depth
Accuracy of Decision Tree a

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.30612244897959184
Confusion Matrix of Decision Tree is:
 [[ 13   0   2   0 147   0  48]
 [  4   0   4   0 166   0  36]
 [  2   0 121   0  63   0  24]
 [  5   0   8   0 115   0  82]
 [  1   0   1   0 163   0  45]
 [  2   0   2   0  72   0 134]
 [  0   0   1   0  56   0 153]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.48      0.06      0.11       210
           2       0.00      0.00      0.00       210
           3       0.87      0.58      0.69       210
           4       0.00      0.00      0.00       210
           5       0.21      0.78      0.33       210
           6       0.00      0.00      0.00       210
           7       0.29      0.73      0.42       210

    accuracy                           0.31      1470
   macro avg       0.26      0.31      0.22      1470
weighted avg       0.26      0.31      0.22      1470

Decision Tree with 3 max_dep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.36054421768707484
Confusion Matrix of Decision Tree is:
 [[ 12  14   0   3 133  45   3]
 [  0 108   2   6  58  34   2]
 [  1  46 119   3  17  23   1]
 [  1  46   2  10  69  79   3]
 [  1  36   1   0 127  42   3]
 [  0  20   2   2  52 111  23]
 [  0  12   0   1  44 110  43]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.80      0.06      0.11       210
           2       0.38      0.51      0.44       210
           3       0.94      0.57      0.71       210
           4       0.40      0.05      0.09       210
           5       0.25      0.60      0.36       210
           6       0.25      0.53      0.34       210
           7       0.55      0.20      0.30       210

    accuracy                           0.36      1470
   macro avg       0.51      0.36      0.33      1470
weighted avg       0.51      0.36      0.33      1470

Decision Tree with 4 max_dep

Accuracy of Decision Tree after Normalize Scaling is: 0.5027210884353741
Confusion Matrix of Decision Tree is:
 [[ 92   9   5  20  56  21   7]
 [  4 122   8  18  25  25   8]
 [  9   7 162  14   6   9   3]
 [ 24  12  17  81  39  22  15]
 [ 34  21   3  22 101  17  12]
 [ 13  16   9  30  23  72  47]
 [ 10   7   3  13  19  49 109]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.49      0.44      0.46       210
           2       0.63      0.58      0.60       210
           3       0.78      0.77      0.78       210
           4       0.41      0.39      0.40       210
           5       0.38      0.48      0.42       210
           6       0.33      0.34      0.34       210
           7       0.54      0.52      0.53       210

    accuracy                           0.50      1470
   macro avg       0.51      0.50      0.50      1470
weighted avg       0.51      0.50      0.50      1470

Decision Tree with 12 max_dep

Accuracy of Decision Tree after Normalize Scaling is: 0.49795918367346936
Confusion Matrix of Decision Tree is:
 [[ 94   5  10  24  43  24  10]
 [  8 130   8  12  22  22   8]
 [  9   8 168   8   3   9   5]
 [ 21  19  14  81  26  32  17]
 [ 45  26   5  25  75  24  10]
 [ 14  16  11  33  18  71  47]
 [ 13  14   3  15  11  41 113]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.46      0.45      0.45       210
           2       0.60      0.62      0.61       210
           3       0.77      0.80      0.78       210
           4       0.41      0.39      0.40       210
           5       0.38      0.36      0.37       210
           6       0.32      0.34      0.33       210
           7       0.54      0.54      0.54       210

    accuracy                           0.50      1470
   macro avg       0.50      0.50      0.50      1470
weighted avg       0.50      0.50      0.50      1470

Decision Tree with 20 max_de

Accuracy of Random Forest after Normalize Scaling is: 0.6421768707482993
Confusion Matrix of Random Forest is:
 [[106   1   1   8  52  29  13]
 [  0 142   4  12  29  18   5]
 [  1   6 171  13   3  15   1]
 [  1   8   8 126  15  29  23]
 [ 14  19   1  12 125  13  26]
 [  3   6   4  21   9  77  90]
 [  0   0   0   2   4   7 197]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.85      0.50      0.63       210
           2       0.78      0.68      0.72       210
           3       0.90      0.81      0.86       210
           4       0.65      0.60      0.62       210
           5       0.53      0.60      0.56       210
           6       0.41      0.37      0.39       210
           7       0.55      0.94      0.70       210

    accuracy                           0.64      1470
   macro avg       0.67      0.64      0.64      1470
weighted avg       0.67      0.64      0.64      1470

Random Forest with 8 max_dept

Accuracy of Random Forest after Normalize Scaling is: 0.6925170068027211
Confusion Matrix of Random Forest is:
 [[122   0   1  12  41  25   9]
 [  0 157   4   9  23  13   4]
 [  5   3 171  12   2  17   0]
 [  2   6   8 146  13  21  14]
 [ 14  15   3  19 127  12  20]
 [  0   8   3  16  10 106  67]
 [  0   2   0   2   3  14 189]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.85      0.58      0.69       210
           2       0.82      0.75      0.78       210
           3       0.90      0.81      0.85       210
           4       0.68      0.70      0.69       210
           5       0.58      0.60      0.59       210
           6       0.51      0.50      0.51       210
           7       0.62      0.90      0.74       210

    accuracy                           0.69      1470
   macro avg       0.71      0.69      0.69      1470
weighted avg       0.71      0.69      0.69      1470

Random Forest with 16 max_dep

In [17]:
# XLM vectorized data
# Loads the fine-tuned XLM feature vectors and runs the same battery of
# classifiers used for the other vectorizations, printing accuracy, confusion
# matrix and classification report for each model via ml_training().
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//xlm_base_finetuned_vectorized_kabita_dataset.csv")

# normalize_scaling() performs the stratified 70/30 split and applies the
# Normalizer to both splits (see its definition earlier in the notebook).
x_train,x_test,y_train,y_test = normalize_scaling(x_df,labels_df['kabita_labels'])

# Logistic regression
# max_iter is raised to 2000 iterations for the solver.
tv_lr_model = LogisticRegression(max_iter=2000)
ml_training(tv_lr_model,x_train,x_test,y_train,y_test,"Logistic Regression")

# KNN Model
# One model is trained and reported per neighbourhood size.
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_train,x_test,y_train,y_test,"KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_train,x_test,y_train,y_test,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_train,x_test,y_train,y_test,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
# One model is trained and reported per kernel.
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_train,x_test,y_train,y_test,"SVM")

# Decision Tree Classifier
# Sweep max_depth from 1 to 20, then train one tree with no depth limit.
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

tv_dt_model = DecisionTreeClassifier(random_state=3)
ml_training(tv_dt_model,x_train,x_test,y_train,y_test,"Decision Tree")

# Random Forest
# Same sweep as the decision tree: depths 1..20, then an unrestricted forest.
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

tv_rf_model = RandomForestClassifier(random_state=3)
ml_training(tv_rf_model,x_train,x_test,y_train,y_test,"Random Forest")

# scaling using MinMax scaler
# MultinomialNB rejects negative feature values, so the normalized vectors are
# rescaled into [0, 10] before fitting it.
mms_scale=MinMaxScaler(feature_range=(0,10))
m_train=mms_scale.fit_transform(x_train)
# Fit the scaler on the training split only and reuse its min/max for the test
# split. Re-fitting on x_test would map test features with different
# parameters than the model was trained on and leak test statistics.
# Held-out values may fall outside the training range, so clip back into the
# scaler's feature_range to keep the input non-negative for MultinomialNB.
m_test=np.clip(mms_scale.transform(x_test), 0, 10)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
# Note: ml_training() still prints "after Normalize Scaling" for this model,
# although the inputs here are additionally MinMax scaled.
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_train,m_test,y_train,y_test,"Multinomial Naive Bayes")

Accuracy of Logistic Regression after Normalize Scaling is: 0.5197278911564626
Confusion Matrix of Logistic Regression is:
 [[102   4   1  14  47  13  29]
 [  8 120  17  20  29  10   6]
 [  4  14 171   7   3  10   1]
 [  7  15  12  83  33  38  22]
 [ 12  26   6  23 104  10  29]
 [ 15  14  11  24  23  59  64]
 [  8   6   0  14  25  32 125]]
Classification Report of Logistic Regression is:
               precision    recall  f1-score   support

           1       0.65      0.49      0.56       210
           2       0.60      0.57      0.59       210
           3       0.78      0.81      0.80       210
           4       0.45      0.40      0.42       210
           5       0.39      0.50      0.44       210
           6       0.34      0.28      0.31       210
           7       0.45      0.60      0.51       210

    accuracy                           0.52      1470
   macro avg       0.53      0.52      0.52      1470
weighted avg       0.53      0.52      0.52      1470

KNN with 3 

Accuracy of Bernoulli Naive Bayes after Normalize Scaling is: 0.3979591836734694
Confusion Matrix of Bernoulli Naive Bayes is:
 [[ 52  26   1  53  41   7  30]
 [ 11 114   5  37  23   3  17]
 [  7  38 139  21   2   0   3]
 [  6  23   5  77  42  15  42]
 [  8  36   2  49  70   6  39]
 [ 11  32   2  49  23  30  63]
 [  4   9   0  45  32  17 103]]
Classification Report of Bernoulli Naive Bayes is:
               precision    recall  f1-score   support

           1       0.53      0.25      0.34       210
           2       0.41      0.54      0.47       210
           3       0.90      0.66      0.76       210
           4       0.23      0.37      0.28       210
           5       0.30      0.33      0.32       210
           6       0.38      0.14      0.21       210
           7       0.35      0.49      0.41       210

    accuracy                           0.40      1470
   macro avg       0.44      0.40      0.40      1470
weighted avg       0.44      0.40      0.40      1470

Worki

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.24081632653061225
Confusion Matrix of Decision Tree is:
 [[  0   1   4   0 198   7   0]
 [  0  29   5   0 173   3   0]
 [  0   0 116   0  88   6   0]
 [  0   1   0   0 204   5   0]
 [  0   2   0   0 205   3   0]
 [  0   1   1   0 204   4   0]
 [  0   1   0   0 209   0   0]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       210
           2       0.83      0.14      0.24       210
           3       0.92      0.55      0.69       210
           4       0.00      0.00      0.00       210
           5       0.16      0.98      0.27       210
           6       0.14      0.02      0.03       210
           7       0.00      0.00      0.00       210

    accuracy                           0.24      1470
   macro avg       0.29      0.24      0.18      1470
weighted avg       0.29      0.24      0.18      1470

Decision Tree with 3 max_dep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.26326530612244897
Confusion Matrix of Decision Tree is:
 [[ 29   4   1   0   0   1 175]
 [  3  30   3   0   0   3 171]
 [  0   1 115   0   0   6  88]
 [  0   1   0   0   0   5 204]
 [  2   1   0   0   0   1 206]
 [  1   0   1   0   0   3 205]
 [  0   0   0   0   0   0 210]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.83      0.14      0.24       210
           2       0.81      0.14      0.24       210
           3       0.96      0.55      0.70       210
           4       0.00      0.00      0.00       210
           5       0.00      0.00      0.00       210
           6       0.16      0.01      0.03       210
           7       0.17      1.00      0.29       210

    accuracy                           0.26      1470
   macro avg       0.42      0.26      0.21      1470
weighted avg       0.42      0.26      0.21      1470

Decision Tree with 4 max_dep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of Decision Tree after Normalize Scaling is: 0.282312925170068
Confusion Matrix of Decision Tree is:
 [[ 27   2  17   2   1   2 159]
 [  1  29  21   1   0   6 152]
 [  0   1 146   0   0   2  61]
 [  0   0  11   1   0   5 193]
 [  0   1   7   0   2   1 199]
 [  1   0  13   0   1   3 192]
 [  0   0   3   0   0   0 207]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.93      0.13      0.23       210
           2       0.88      0.14      0.24       210
           3       0.67      0.70      0.68       210
           4       0.25      0.00      0.01       210
           5       0.50      0.01      0.02       210
           6       0.16      0.01      0.03       210
           7       0.18      0.99      0.30       210

    accuracy                           0.28      1470
   macro avg       0.51      0.28      0.21      1470
weighted avg       0.51      0.28      0.21      1470

Decision Tree with 5 max_depth

Accuracy of Decision Tree after Normalize Scaling is: 0.3687074829931973
Confusion Matrix of Decision Tree is:
 [[ 75  15   3  46  19  39  13]
 [ 20  98   7  40  15  22   8]
 [  6  11 149  26   9   7   2]
 [ 39  13   4  73  19  42  20]
 [ 38  15   2  45  49  40  21]
 [ 29  29   3  53  27  49  20]
 [ 23  12   3  41  33  49  49]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.33      0.36      0.34       210
           2       0.51      0.47      0.49       210
           3       0.87      0.71      0.78       210
           4       0.23      0.35      0.27       210
           5       0.29      0.23      0.26       210
           6       0.20      0.23      0.21       210
           7       0.37      0.23      0.29       210

    accuracy                           0.37      1470
   macro avg       0.40      0.37      0.38      1470
weighted avg       0.40      0.37      0.38      1470

Decision Tree with 13 max_dep

Accuracy of Decision Tree after Normalize Scaling is: 0.37210884353741497
Confusion Matrix of Decision Tree is:
 [[ 78  17   8  30  22  32  23]
 [ 23  97   6  25  17  28  14]
 [  9  10 156  10   9  10   6]
 [ 36  18   6  58  27  34  31]
 [ 36  29   6  28  51  34  26]
 [ 27  33   5  33  29  45  38]
 [ 27  13   8  33  30  37  62]]
Classification Report of Decision Tree is:
               precision    recall  f1-score   support

           1       0.33      0.37      0.35       210
           2       0.45      0.46      0.45       210
           3       0.80      0.74      0.77       210
           4       0.27      0.28      0.27       210
           5       0.28      0.24      0.26       210
           6       0.20      0.21      0.21       210
           7       0.31      0.30      0.30       210

    accuracy                           0.37      1470
   macro avg       0.38      0.37      0.37      1470
weighted avg       0.38      0.37      0.37      1470

Random Forest with 1 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.44081632653061226
Confusion Matrix of Random Forest is:
 [[ 64  11   0  41  55   7  32]
 [  4 109   2  36  30  14  15]
 [  2  21 154  21   5   5   2]
 [  7  13   5  76  43  20  46]
 [  2  17   2  42  95  12  40]
 [  6  15   3  40  35  38  73]
 [  4   4   0  38  30  22 112]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.72      0.30      0.43       210
           2       0.57      0.52      0.55       210
           3       0.93      0.73      0.82       210
           4       0.26      0.36      0.30       210
           5       0.32      0.45      0.38       210
           6       0.32      0.18      0.23       210
           7       0.35      0.53      0.42       210

    accuracy                           0.44      1470
   macro avg       0.50      0.44      0.45      1470
weighted avg       0.50      0.44      0.45      1470

Random Forest with 9 max_dep

Accuracy of Random Forest after Normalize Scaling is: 0.4496598639455782
Confusion Matrix of Random Forest is:
 [[ 86  10   1  28  48  11  26]
 [  6 101   4  29  36  21  13]
 [  4  14 160  12   9  10   1]
 [ 13  11   3  72  43  39  29]
 [  9  23   2  32  90  18  36]
 [ 13  11   1  47  29  49  60]
 [  3   4   0  40  33  27 103]]
Classification Report of Random Forest is:
               precision    recall  f1-score   support

           1       0.64      0.41      0.50       210
           2       0.58      0.48      0.53       210
           3       0.94      0.76      0.84       210
           4       0.28      0.34      0.31       210
           5       0.31      0.43      0.36       210
           6       0.28      0.23      0.25       210
           7       0.38      0.49      0.43       210

    accuracy                           0.45      1470
   macro avg       0.49      0.45      0.46      1470
weighted avg       0.49      0.45      0.46      1470

Random Forest with 17 max_dep