In [1]:
try:
    import pandas as pd
    import numpy as np
    import os,sys
    import re
    # importing algorithms
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.naive_bayes import BernoulliNB
    from sklearn import svm
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix, classification_report
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.model_selection import ShuffleSplit
    from sklearn.model_selection import cross_val_score
except Exception as e:
    print("Error is due to",e)

In [2]:
# getting path of file
pwd = os.getcwd()
labels_df = pd.read_csv(pwd+"//Datasets//Kabita//Input//kabita_dataset_labels.csv")
#converting datframe to numpy array
labels = labels_df.to_numpy().ravel()

In [3]:
# Function for Modelling and extracting Metrics
def ml_training(ml_model, x_fold, y_fold, model_name):
    rsfold = ShuffleSplit(n_splits=10, random_state=7, test_size=0.3)
    results = cross_val_score(ml_model, x_fold, y_fold, cv=rsfold)
    print("Accuracies for Random Shuffle Split for "+model_name+" :", results)
    print("Mean Accuracy of Random Shuffle Split for "+model_name+" :", results.mean()*100.0)
    print(70*"=")

### Bag of words Models

In [4]:
# TFIDF vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//BagOfWords//tfidf_500_vectors.csv")

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_df, labels,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_df, labels,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_df, labels,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_df, labels,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_df, labels,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_df, labels,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_df, labels,"Multinomial Naive Bayes")

Accuracies for Random Shuffle Split for Logistic Regression : [0.75918367 0.75442177 0.76054422 0.77823129 0.75510204 0.75578231
 0.76802721 0.77278912 0.75170068 0.76462585]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 76.20408163265306
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.53741497 0.5877551  0.54761905 0.56258503 0.57959184 0.46326531
 0.53265306 0.52585034 0.5877551  0.56462585]
Mean Accuracy of Random Shuffle Split for KNN Model : 54.89115646258502
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.54693878 0.57619048 0.56870748 0.56190476 0.58027211 0.5462585
 0.53809524 0.54217687 0.56326531 0.5707483 ]
Mean Accuracy of Random Shuffle Split for KNN Model : 55.945578231292515
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.5585034  0.57346939 0.55238095 0.55918367 0.56462585 0.54693878
 0.52653061 0.52993197 0.55510204 0.5755102 ]
Mean Accuracy of Random Shuffle Split for

Accuracies for Random Shuffle Split for Decision Tree : [0.5585034  0.5292517  0.53945578 0.55306122 0.53333333 0.53537415
 0.5292517  0.57210884 0.52312925 0.54081633]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.142857142857146
Decision Tree with 14 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.56802721 0.53401361 0.55238095 0.54897959 0.54081633 0.53129252
 0.52176871 0.57210884 0.54761905 0.55170068]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.68707482993198
Decision Tree with 15 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.57142857 0.54557823 0.56394558 0.56258503 0.54353741 0.56598639
 0.54081633 0.57687075 0.54965986 0.55918367]
Mean Accuracy of Random Shuffle Split for Decision Tree : 55.79591836734694
Decision Tree with 16 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.58163265 0.5462585  0.56326531 0.57210884 0.54557823 0.57006803
 0.54421769 0.58571429 0.56054422 0.56326531]

Accuracies for Random Shuffle Split for Random Forest : [0.70204082 0.68435374 0.67959184 0.70204082 0.68435374 0.69863946
 0.69387755 0.69795918 0.69387755 0.68911565]
Mean Accuracy of Random Shuffle Split for Random Forest : 69.25850340136054
Random Forest with 18 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.70680272 0.68027211 0.6829932  0.70680272 0.67891156 0.7047619
 0.70204082 0.70068027 0.68707483 0.69591837]
Mean Accuracy of Random Shuffle Split for Random Forest : 69.46258503401361
Random Forest with 19 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.70612245 0.68367347 0.68639456 0.70952381 0.6829932  0.70068027
 0.69727891 0.69659864 0.69319728 0.70408163]
Mean Accuracy of Random Shuffle Split for Random Forest : 69.60544217687075
Random Forest with 20 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.72108844 0.68435374 0.69183673 0.7170068  0.68571429 0.70204082
 0.70136054 0.69863946 0.6952381  0.69183673]
M

In [5]:
# Count Vectorizer vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//BagOfWords//cv_500_vectors.csv")

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_df, labels,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_df, labels,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_df, labels,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_df, labels,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_df, labels,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_df, labels,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_df, labels,"Multinomial Naive Bayes")

Accuracies for Random Shuffle Split for Logistic Regression : [0.75782313 0.75442177 0.76530612 0.77959184 0.75102041 0.75782313
 0.77482993 0.76462585 0.75034014 0.76530612]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 76.2108843537415
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.55102041 0.6047619  0.58095238 0.58639456 0.60408163 0.49251701
 0.55238095 0.53197279 0.59659864 0.59727891]
Mean Accuracy of Random Shuffle Split for KNN Model : 56.9795918367347
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.55986395 0.59115646 0.59319728 0.5707483  0.56870748 0.57482993
 0.56462585 0.55578231 0.54353741 0.58979592]
Mean Accuracy of Random Shuffle Split for KNN Model : 57.12244897959183
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.56802721 0.5877551  0.59659864 0.58435374 0.57619048 0.57891156
 0.55510204 0.58911565 0.54965986 0.59795918]
Mean Accuracy of Random Shuffle Split for K

Accuracies for Random Shuffle Split for Decision Tree : [0.57006803 0.55238095 0.5755102  0.56326531 0.56258503 0.57619048
 0.56938776 0.59319728 0.56530612 0.57006803]
Mean Accuracy of Random Shuffle Split for Decision Tree : 56.97959183673469
Decision Tree with 14 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.57823129 0.55782313 0.58639456 0.57210884 0.57346939 0.58707483
 0.5755102  0.59319728 0.5707483  0.57687075]
Mean Accuracy of Random Shuffle Split for Decision Tree : 57.714285714285715
Decision Tree with 15 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.58503401 0.56734694 0.58503401 0.57959184 0.58571429 0.58571429
 0.58707483 0.59863946 0.57006803 0.58163265]
Mean Accuracy of Random Shuffle Split for Decision Tree : 58.25850340136054
Decision Tree with 16 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.58571429 0.57142857 0.59183673 0.58707483 0.58639456 0.59251701
 0.59455782 0.60204082 0.58367347 0.5829932 ]

Accuracies for Random Shuffle Split for Random Forest : [0.66938776 0.6755102  0.67482993 0.70068027 0.68571429 0.68163265
 0.67823129 0.69387755 0.66734694 0.68231293]
Mean Accuracy of Random Shuffle Split for Random Forest : 68.0952380952381
Random Forest with 18 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.67142857 0.67482993 0.68027211 0.69183673 0.69319728 0.68503401
 0.68435374 0.69455782 0.66938776 0.6755102 ]
Mean Accuracy of Random Shuffle Split for Random Forest : 68.20408163265306
Random Forest with 19 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.66462585 0.6755102  0.68843537 0.70408163 0.69387755 0.69115646
 0.68639456 0.69591837 0.67210884 0.67959184]
Mean Accuracy of Random Shuffle Split for Random Forest : 68.5170068027211
Random Forest with 20 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.66870748 0.66938776 0.68843537 0.70204082 0.6952381  0.69727891
 0.69319728 0.69863946 0.67414966 0.67414966]
Me

In [6]:
# Term Frequency vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//BagOfWords//tf_500_vectors.csv")

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_df, labels,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_df, labels,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_df, labels,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_df, labels,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_df, labels,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_df, labels,"Random Forest")
    
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,x_df, labels,"Multinomial Naive Bayes")

Accuracies for Random Shuffle Split for Logistic Regression : [0.75306122 0.75034014 0.76598639 0.77278912 0.75034014 0.7537415
 0.76734694 0.76190476 0.74557823 0.75102041]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 75.72108843537416
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.5707483  0.6170068  0.59047619 0.60408163 0.61360544 0.51428571
 0.56870748 0.56326531 0.62244898 0.62040816]
Mean Accuracy of Random Shuffle Split for KNN Model : 58.85034013605443
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.60068027 0.61156463 0.61088435 0.5952381  0.59795918 0.5877551
 0.59863946 0.58639456 0.58979592 0.59047619]
Mean Accuracy of Random Shuffle Split for KNN Model : 59.6938775510204
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.6        0.61088435 0.61020408 0.58435374 0.60272109 0.57959184
 0.60204082 0.59047619 0.59387755 0.5877551 ]
Mean Accuracy of Random Shuffle Split for KN

Accuracies for Random Shuffle Split for Decision Tree : [0.56462585 0.53129252 0.5462585  0.54557823 0.53809524 0.55238095
 0.53265306 0.57823129 0.51836735 0.53401361]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.414965986394556
Decision Tree with 14 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.56870748 0.53673469 0.56870748 0.55170068 0.54829932 0.55510204
 0.54353741 0.58571429 0.52585034 0.54761905]
Mean Accuracy of Random Shuffle Split for Decision Tree : 55.319727891156454
Decision Tree with 15 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.57687075 0.54217687 0.58027211 0.56122449 0.55714286 0.56598639
 0.55170068 0.59115646 0.54421769 0.56054422]
Mean Accuracy of Random Shuffle Split for Decision Tree : 56.31292517006802
Decision Tree with 16 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.58163265 0.54557823 0.58367347 0.56734694 0.56462585 0.57278912
 0.55714286 0.59115646 0.54965986 0.56802721

Accuracies for Random Shuffle Split for Random Forest : [0.70408163 0.66666667 0.68503401 0.70612245 0.69591837 0.68843537
 0.69455782 0.69795918 0.6877551  0.68979592]
Mean Accuracy of Random Shuffle Split for Random Forest : 69.16326530612245
Random Forest with 18 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.69931973 0.67210884 0.68231293 0.69931973 0.69115646 0.69591837
 0.69251701 0.69863946 0.69863946 0.68571429]
Mean Accuracy of Random Shuffle Split for Random Forest : 69.15646258503402
Random Forest with 19 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.70136054 0.67414966 0.68503401 0.70340136 0.69251701 0.69727891
 0.69727891 0.70272109 0.69183673 0.69795918]
Mean Accuracy of Random Shuffle Split for Random Forest : 69.43537414965986
Random Forest with 20 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.70272109 0.67823129 0.6829932  0.70340136 0.69251701 0.69863946
 0.69455782 0.70884354 0.69863946 0.69251701]


### Sentence Transformer Models

In [7]:
# BERT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//bert_vectorized_kabita_dataset.csv")

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_df, labels,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_df, labels,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_df, labels,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_df, labels,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_df, labels,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_df, labels,"Random Forest")
    
# scaling using MinMax scaler
mms_scale=MinMaxScaler(feature_range=(0,10))
m_df=mms_scale.fit_transform(x_df)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_df, labels,"Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.77142857 0.78707483 0.78367347 0.77278912 0.77210884 0.78095238
 0.78435374 0.77210884 0.77414966 0.79659864]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 77.95238095238093
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.67687075 0.69319728 0.69727891 0.67823129 0.68163265 0.68367347
 0.68843537 0.68027211 0.67823129 0.67755102]
Mean Accuracy of Random Shuffle Split for KNN Model : 68.35374149659863
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.68571429 0.70408163 0.68911565 0.68367347 0.69727891 0.69455782
 0.69115646 0.68367347 0.67482993 0.69183673]
Mean Accuracy of Random Shuffle Split for KNN Model : 68.95918367346938
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.68911565 0.70136054 0.69319728 0.69251701 0.69455782 0.6952381
 0.6952381  0.69591837 0.68503401 0.68707483]
Mean Accuracy of Random Shuffle Split for 

Accuracies for Random Shuffle Split for Decision Tree : [0.58095238 0.57482993 0.58503401 0.58707483 0.57346939 0.58027211
 0.57278912 0.57959184 0.55510204 0.59319728]
Mean Accuracy of Random Shuffle Split for Decision Tree : 57.82312925170069
Decision Tree with 14 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.58095238 0.57755102 0.57687075 0.57414966 0.56870748 0.5755102
 0.55918367 0.57346939 0.56734694 0.58979592]
Mean Accuracy of Random Shuffle Split for Decision Tree : 57.43537414965988
Decision Tree with 15 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.59251701 0.58027211 0.57755102 0.56258503 0.56190476 0.57414966
 0.56326531 0.57346939 0.57346939 0.58571429]
Mean Accuracy of Random Shuffle Split for Decision Tree : 57.44897959183673
Decision Tree with 16 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.5829932  0.57755102 0.5755102  0.57210884 0.57142857 0.56598639
 0.57006803 0.57210884 0.56190476 0.58503401]
M

Accuracies for Random Shuffle Split for Random Forest : [0.73061224 0.71836735 0.72721088 0.71836735 0.71836735 0.72789116
 0.71972789 0.71904762 0.7122449  0.72517007]
Mean Accuracy of Random Shuffle Split for Random Forest : 72.17006802721089
Random Forest with 18 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.72244898 0.71292517 0.72721088 0.71632653 0.71564626 0.73401361
 0.72380952 0.71904762 0.71088435 0.72517007]
Mean Accuracy of Random Shuffle Split for Random Forest : 72.07482993197279
Random Forest with 19 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.73741497 0.72585034 0.72380952 0.71904762 0.7244898  0.72653061
 0.71836735 0.72176871 0.70408163 0.73129252]
Mean Accuracy of Random Shuffle Split for Random Forest : 72.3265306122449
Random Forest with 20 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.72653061 0.71768707 0.7170068  0.72312925 0.7170068  0.72993197
 0.72380952 0.72176871 0.7122449  0.72789116]
M

In [8]:
# GKB BERT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//bert_vectorized_kabita_dataset_gkb.csv")

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_df, labels,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_df, labels,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_df, labels,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_df, labels,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_df, labels,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_df, labels,"Random Forest")
    
# scaling using MinMax scaler
mms_scale=MinMaxScaler(feature_range=(0,10))
m_df=mms_scale.fit_transform(x_df)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_df, labels,"Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.556 0.534 0.539 0.54  0.521 0.542 0.551 0.542 0.531 0.554]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 54.10204081632652
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.447 0.437 0.431 0.449 0.435 0.434 0.45  0.439 0.422 0.454]
Mean Accuracy of Random Shuffle Split for KNN Model : 43.97959183673469
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.458 0.444 0.45  0.459 0.444 0.459 0.463 0.446 0.429 0.474]
Mean Accuracy of Random Shuffle Split for KNN Model : 45.27210884353742
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.465 0.433 0.453 0.463 0.449 0.461 0.467 0.448 0.433 0.473]
Mean Accuracy of Random Shuffle Split for KNN Model : 45.442176870748305
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.468 0.448 0.454 0.461 0.455 0.46  0.471 0.449 0.441 0.476]
Mean Accuracy of Random Shuffle Split

Accuracies for Random Shuffle Split for Decision Tree : [0.448 0.461 0.454 0.462 0.456 0.451 0.441 0.446 0.444 0.48 ]
Mean Accuracy of Random Shuffle Split for Decision Tree : 45.42176870748299
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.447 0.461 0.455 0.454 0.45  0.442 0.437 0.455 0.454 0.478]
Mean Accuracy of Random Shuffle Split for Decision Tree : 45.34693877551021
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.441 0.459 0.441 0.44  0.449 0.453 0.429 0.442 0.45  0.486]
Mean Accuracy of Random Shuffle Split for Decision Tree : 44.89115646258502
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.462 0.463 0.448 0.446 0.46  0.446 0.44  0.437 0.441 0.484]
Mean Accuracy of Random Shuffle Split for Decision Tree : 45.26530612244898
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.339 0.324 0.33  0.326 0.331 0.338 0.39

In [9]:
# N Distill BERT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//bert_vectorized_kabita_dataset_ndisbert.csv")

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_df, labels,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_df, labels,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_df, labels,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_df, labels,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_df, labels,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_df, labels,"Random Forest")
    
# scaling using MinMax scaler
mms_scale=MinMaxScaler(feature_range=(0,10))
m_df=mms_scale.fit_transform(x_df)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_df, labels,"Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.762 0.756 0.769 0.778 0.763 0.748 0.764 0.786 0.765 0.784]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 76.74829931972789
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.64  0.648 0.673 0.648 0.62  0.631 0.646 0.65  0.634 0.657]
Mean Accuracy of Random Shuffle Split for KNN Model : 64.46938775510205
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.652 0.666 0.688 0.665 0.648 0.649 0.651 0.66  0.652 0.661]
Mean Accuracy of Random Shuffle Split for KNN Model : 65.91156462585033
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.673 0.665 0.674 0.667 0.65  0.648 0.657 0.661 0.652 0.65 ]
Mean Accuracy of Random Shuffle Split for KNN Model : 65.9795918367347
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.674 0.664 0.689 0.682 0.648 0.661 0.661 0.661 0.665 0.657]
Mean Accuracy of Random Shuffle Split f

Accuracies for Random Shuffle Split for Decision Tree : [0.552 0.517 0.557 0.537 0.563 0.531 0.565 0.541 0.551 0.544]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.56462585034013
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.542 0.529 0.562 0.537 0.557 0.541 0.558 0.537 0.555 0.56 ]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.782312925170075
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.544 0.531 0.559 0.535 0.55  0.537 0.564 0.525 0.537 0.55 ]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.319727891156475
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.543 0.531 0.555 0.529 0.554 0.539 0.55  0.542 0.552 0.554]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.48979591836734
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.365 0.273 0.26  0.281 0.263 0.268 0.

In [10]:
# V BERT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//bert_vectorized_kabita_dataset_vbert.csv")

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_df, labels,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_df, labels,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_df, labels,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_df, labels,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_df, labels,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_df, labels,"Random Forest")
    
# scaling using MinMax scaler
mms_scale=MinMaxScaler(feature_range=(0,10))
m_df=mms_scale.fit_transform(x_df)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_df, labels,"Multinomial Naive Bayes")

Accuracies for Random Shuffle Split for Logistic Regression : [0.785 0.784 0.793 0.79  0.79  0.79  0.801 0.793 0.786 0.797]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 79.10204081632652
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.676 0.677 0.669 0.661 0.658 0.663 0.663 0.661 0.665 0.68 ]
Mean Accuracy of Random Shuffle Split for KNN Model : 66.71428571428572
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.697 0.677 0.688 0.676 0.676 0.675 0.682 0.669 0.67  0.695]
Mean Accuracy of Random Shuffle Split for KNN Model : 68.0408163265306
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.705 0.677 0.684 0.679 0.676 0.677 0.694 0.675 0.683 0.697]
Mean Accuracy of Random Shuffle Split for KNN Model : 68.45578231292518
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.702 0.676 0.684 0.695 0.684 0.679 0.692 0.682 0.679 0.689]
Mean Accuracy of Random Shuffle Split f

Accuracies for Random Shuffle Split for Decision Tree : [0.551 0.536 0.52  0.531 0.526 0.529 0.54  0.539 0.521 0.535]
Mean Accuracy of Random Shuffle Split for Decision Tree : 53.27210884353741
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.543 0.529 0.52  0.528 0.526 0.528 0.54  0.533 0.524 0.527]
Mean Accuracy of Random Shuffle Split for Decision Tree : 52.97278911564626
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.546 0.531 0.515 0.53  0.526 0.527 0.529 0.532 0.533 0.524]
Mean Accuracy of Random Shuffle Split for Decision Tree : 52.925170068027214
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.539 0.531 0.511 0.541 0.526 0.524 0.529 0.532 0.531 0.527]
Mean Accuracy of Random Shuffle Split for Decision Tree : 52.9047619047619
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.41  0.292 0.373 0.397 0.338 0.285 0.43

In [11]:
# GPT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//gpt_vectorized_kabita_dataset.csv")

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_df, labels,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_df, labels,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_df, labels,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_df, labels,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_df, labels,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_df, labels,"Random Forest")
    
# scaling using MinMax scaler
mms_scale=MinMaxScaler(feature_range=(0,10))
m_df=mms_scale.fit_transform(x_df)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_df, labels,"Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.77  0.77  0.754 0.753 0.754 0.759 0.774 0.769 0.752 0.78 ]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 76.36054421768706
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.654 0.654 0.646 0.653 0.648 0.654 0.636 0.641 0.645 0.654]
Mean Accuracy of Random Shuffle Split for KNN Model : 64.87074829931971
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.677 0.657 0.646 0.659 0.671 0.668 0.657 0.656 0.657 0.659]
Mean Accuracy of Random Shuffle Split for KNN Model : 66.06802721088435
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.682 0.664 0.671 0.67  0.68  0.667 0.675 0.661 0.652 0.671]
Mean Accuracy of Random Shuffle Split for KNN Model : 66.93877551020408
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.686 0.658 0.67  0.669 0.673 0.672 0.664 0.65  0.667 0.67 ]
Mean Accuracy of Random Shuffle Split 

Accuracies for Random Shuffle Split for Decision Tree : [0.515 0.518 0.52  0.516 0.489 0.524 0.497 0.519 0.497 0.518]
Mean Accuracy of Random Shuffle Split for Decision Tree : 51.12925170068028
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.52  0.518 0.515 0.507 0.48  0.522 0.497 0.514 0.514 0.528]
Mean Accuracy of Random Shuffle Split for Decision Tree : 51.156462585034014
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.515 0.518 0.527 0.522 0.488 0.519 0.505 0.51  0.514 0.509]
Mean Accuracy of Random Shuffle Split for Decision Tree : 51.27891156462587
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.517 0.518 0.52  0.518 0.489 0.514 0.5   0.51  0.504 0.509]
Mean Accuracy of Random Shuffle Split for Decision Tree : 50.98639455782313
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.337 0.411 0.374 0.418 0.276 0.296 0.4

In [12]:
# XLM vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//SentenceTransformers//xlm_vectorized_kabita_dataset.csv")

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_df, labels,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_df, labels,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_df, labels,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_df, labels,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_df, labels,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_df, labels,"Random Forest")
    
# scaling using MinMax scaler
mms_scale=MinMaxScaler(feature_range=(0,10))
m_df=mms_scale.fit_transform(x_df)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_df, labels,"Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.761 0.765 0.784 0.768 0.766 0.774 0.785 0.782 0.771 0.776]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 77.3265306122449
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.659 0.652 0.68  0.658 0.661 0.666 0.667 0.674 0.663 0.669]
Mean Accuracy of Random Shuffle Split for KNN Model : 66.48979591836735
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.667 0.672 0.68  0.665 0.671 0.671 0.683 0.663 0.692 0.678]
Mean Accuracy of Random Shuffle Split for KNN Model : 67.43537414965985
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.686 0.678 0.691 0.678 0.682 0.674 0.684 0.68  0.699 0.677]
Mean Accuracy of Random Shuffle Split for KNN Model : 68.29251700680273
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.688 0.686 0.694 0.666 0.688 0.662 0.682 0.678 0.692 0.671]
Mean Accuracy of Random Shuffle Split f

Accuracies for Random Shuffle Split for Decision Tree : [0.586 0.572 0.573 0.578 0.59  0.562 0.584 0.59  0.578 0.597]
Mean Accuracy of Random Shuffle Split for Decision Tree : 58.10204081632653
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.596 0.573 0.573 0.574 0.584 0.559 0.593 0.573 0.581 0.594]
Mean Accuracy of Random Shuffle Split for Decision Tree : 58.00680272108843
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.598 0.569 0.573 0.582 0.581 0.556 0.582 0.588 0.581 0.589]
Mean Accuracy of Random Shuffle Split for Decision Tree : 57.99319727891156
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.588 0.573 0.573 0.585 0.578 0.566 0.583 0.593 0.571 0.585]
Mean Accuracy of Random Shuffle Split for Decision Tree : 57.965986394557824
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.458 0.468 0.447 0.486 0.464 0.463 0.4

### Fine Tuned Transformers Models

In [13]:
# BERT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//bert_base_finetuned_vectorized_kabita_dataset.csv")

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_df, labels,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_df, labels,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_df, labels,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_df, labels,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_df, labels,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_df, labels,"Random Forest")
    
# scaling using MinMax scaler
mms_scale=MinMaxScaler(feature_range=(0,10))
m_df=mms_scale.fit_transform(x_df)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_df, labels,"Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.693 0.68  0.71  0.687 0.688 0.679 0.718 0.688 0.701 0.673]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 69.16326530612244
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.546 0.544 0.543 0.549 0.534 0.537 0.53  0.545 0.545 0.551]
Mean Accuracy of Random Shuffle Split for KNN Model : 54.22448979591837
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.569 0.56  0.556 0.556 0.564 0.541 0.557 0.552 0.548 0.559]
Mean Accuracy of Random Shuffle Split for KNN Model : 55.625850340136054
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.579 0.578 0.566 0.565 0.561 0.549 0.569 0.561 0.546 0.576]
Mean Accuracy of Random Shuffle Split for KNN Model : 56.482993197278915
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.58  0.571 0.576 0.558 0.57  0.546 0.561 0.562 0.557 0.57 ]
Mean Accuracy of Random Shuffle Spli

Accuracies for Random Shuffle Split for Decision Tree : [0.429 0.425 0.422 0.422 0.432 0.405 0.411 0.442 0.441 0.435]
Mean Accuracy of Random Shuffle Split for Decision Tree : 42.63945578231293
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.444 0.424 0.424 0.425 0.429 0.42  0.414 0.446 0.439 0.427]
Mean Accuracy of Random Shuffle Split for Decision Tree : 42.91156462585034
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.437 0.423 0.429 0.412 0.429 0.416 0.426 0.445 0.437 0.427]
Mean Accuracy of Random Shuffle Split for Decision Tree : 42.816326530612244
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.437 0.436 0.42  0.431 0.431 0.421 0.427 0.452 0.439 0.429]
Mean Accuracy of Random Shuffle Split for Decision Tree : 43.238095238095234
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.218 0.249 0.237 0.225 0.228 0.245 0.

In [14]:
# Hinglish BERT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//vbert_hinglish_finetuned_vectorized_kabita_dataset.csv")

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_df, labels,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_df, labels,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_df, labels,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_df, labels,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_df, labels,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_df, labels,"Random Forest")
    
# scaling using MinMax scaler
mms_scale=MinMaxScaler(feature_range=(0,10))
m_df=mms_scale.fit_transform(x_df)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_df, labels,"Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.69  0.685 0.669 0.667 0.666 0.667 0.659 0.689 0.678 0.659]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 67.29251700680273
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.622 0.613 0.608 0.601 0.588 0.593 0.611 0.619 0.622 0.607]
Mean Accuracy of Random Shuffle Split for KNN Model : 60.85034013605443
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.638 0.618 0.618 0.625 0.599 0.605 0.631 0.627 0.624 0.622]
Mean Accuracy of Random Shuffle Split for KNN Model : 62.07482993197277
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.636 0.631 0.609 0.635 0.603 0.612 0.631 0.627 0.637 0.627]
Mean Accuracy of Random Shuffle Split for KNN Model : 62.47619047619046
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.64  0.627 0.618 0.618 0.608 0.618 0.637 0.627 0.628 0.632]
Mean Accuracy of Random Shuffle Split 

Accuracies for Random Shuffle Split for Decision Tree : [0.488 0.502 0.478 0.503 0.49  0.48  0.498 0.484 0.493 0.489]
Mean Accuracy of Random Shuffle Split for Decision Tree : 49.05442176870748
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.495 0.501 0.476 0.494 0.482 0.486 0.5   0.495 0.495 0.489]
Mean Accuracy of Random Shuffle Split for Decision Tree : 49.129251700680264
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.495 0.499 0.482 0.497 0.499 0.484 0.503 0.494 0.499 0.489]
Mean Accuracy of Random Shuffle Split for Decision Tree : 49.414965986394556
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.492 0.503 0.481 0.497 0.486 0.492 0.493 0.491 0.497 0.488]
Mean Accuracy of Random Shuffle Split for Decision Tree : 49.19727891156464
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.401 0.38  0.388 0.407 0.389 0.386 0.

In [15]:
# GPT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//gpt_base_finetuned_vectorized_kabita_dataset.csv")

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_df, labels,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_df, labels,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_df, labels,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_df, labels,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_df, labels,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_df, labels,"Random Forest")
    
# scaling using MinMax scaler
mms_scale=MinMaxScaler(feature_range=(0,10))
m_df=mms_scale.fit_transform(x_df)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_df, labels,"Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.771 0.767 0.776 0.77  0.761 0.754 0.763 0.779 0.761 0.771]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 76.74149659863946
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.473 0.493 0.472 0.45  0.469 0.455 0.462 0.473 0.475 0.495]
Mean Accuracy of Random Shuffle Split for KNN Model : 47.17006802721088
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.478 0.486 0.488 0.462 0.49  0.476 0.475 0.501 0.49  0.495]
Mean Accuracy of Random Shuffle Split for KNN Model : 48.414965986394556
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.481 0.49  0.489 0.466 0.49  0.484 0.481 0.507 0.486 0.503]
Mean Accuracy of Random Shuffle Split for KNN Model : 48.77551020408163
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.472 0.488 0.488 0.469 0.486 0.476 0.483 0.503 0.484 0.5  ]
Mean Accuracy of Random Shuffle Split

Accuracies for Random Shuffle Split for Decision Tree : [0.51  0.52  0.492 0.505 0.482 0.482 0.517 0.522 0.503 0.514]
Mean Accuracy of Random Shuffle Split for Decision Tree : 50.482993197278915
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.501 0.516 0.48  0.497 0.491 0.484 0.52  0.528 0.501 0.507]
Mean Accuracy of Random Shuffle Split for Decision Tree : 50.2517006802721
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.503 0.517 0.479 0.499 0.494 0.493 0.518 0.524 0.497 0.505]
Mean Accuracy of Random Shuffle Split for Decision Tree : 50.27891156462585
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.51  0.517 0.479 0.494 0.494 0.493 0.52  0.518 0.497 0.505]
Mean Accuracy of Random Shuffle Split for Decision Tree : 50.26530612244897
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.376 0.32  0.382 0.399 0.35  0.363 0.42

In [16]:
# Hinglish GPT vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//gpt_hinglish_finetuned_vectorized_kabita_dataset.csv")

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_df, labels,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_df, labels,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_df, labels,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_df, labels,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_df, labels,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_df, labels,"Random Forest")
    
# scaling using MinMax scaler
mms_scale=MinMaxScaler(feature_range=(0,10))
m_df=mms_scale.fit_transform(x_df)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_df, labels,"Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.788 0.786 0.782 0.786 0.791 0.77  0.799 0.795 0.776 0.801]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 78.73469387755104
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.468 0.47  0.465 0.471 0.464 0.468 0.467 0.456 0.471 0.455]
Mean Accuracy of Random Shuffle Split for KNN Model : 46.55102040816327
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.486 0.47  0.486 0.473 0.471 0.482 0.461 0.471 0.487 0.468]
Mean Accuracy of Random Shuffle Split for KNN Model : 47.5578231292517
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.49  0.477 0.499 0.478 0.474 0.488 0.482 0.49  0.499 0.469]
Mean Accuracy of Random Shuffle Split for KNN Model : 48.46938775510205
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.499 0.488 0.488 0.475 0.473 0.493 0.476 0.482 0.495 0.475]
Mean Accuracy of Random Shuffle Split f

Accuracies for Random Shuffle Split for Decision Tree : [0.499 0.5   0.497 0.49  0.517 0.49  0.517 0.516 0.509 0.495]
Mean Accuracy of Random Shuffle Split for Decision Tree : 50.28571428571429
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.503 0.497 0.506 0.493 0.506 0.476 0.512 0.518 0.5   0.501]
Mean Accuracy of Random Shuffle Split for Decision Tree : 50.11564625850341
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.501 0.502 0.497 0.489 0.503 0.476 0.52  0.51  0.506 0.495]
Mean Accuracy of Random Shuffle Split for Decision Tree : 50.0
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.502 0.502 0.497 0.491 0.503 0.476 0.522 0.517 0.511 0.484]
Mean Accuracy of Random Shuffle Split for Decision Tree : 50.034013605442176
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.261 0.292 0.284 0.298 0.257 0.279 0.326 0.278 0.32

In [17]:
# XLM vectorized data
x_df = pd.read_csv(pwd+"//Datasets//Kabita//FineTunedTransformers//xlm_base_finetuned_vectorized_kabita_dataset.csv")

# Logistic regression
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    print("KNN with",x,"Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=x)
    ml_training(tv_knn_model,x_df, labels,"KNN Model")
    
# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model,x_df, labels,"Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model,x_df, labels,"Bernoulli Naive Bayes")

# Support Vector Machine Classifier
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model,x_df, labels,"SVM")

# Decision Tree Classifier
for x in range(1,21):
    print("Decision Tree with",x,"max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=x)
    ml_training(tv_dt_model,x_df, labels,"Decision Tree")
    
# Random Forest
for x in range(1,21):
    print("Random Forest with",x,"max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=x, random_state=3)
    ml_training(tv_rf_model,x_df, labels,"Random Forest")
    
# scaling using MinMax scaler
mms_scale=MinMaxScaler(feature_range=(0,10))
m_df=mms_scale.fit_transform(x_df)
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model,m_df, labels,"Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.501 0.512 0.509 0.504 0.512 0.495 0.502 0.487 0.511 0.521]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 50.544217687074834
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.451 0.456 0.449 0.443 0.452 0.462 0.461 0.444 0.441 0.451]
Mean Accuracy of Random Shuffle Split for KNN Model : 45.0952380952381
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.48  0.479 0.471 0.459 0.471 0.47  0.461 0.45  0.455 0.46 ]
Mean Accuracy of Random Shuffle Split for KNN Model : 46.55102040816327
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.474 0.482 0.473 0.463 0.484 0.466 0.476 0.457 0.478 0.47 ]
Mean Accuracy of Random Shuffle Split for KNN Model : 47.23129251700681
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.482 0.472 0.456 0.465 0.48  0.454 0.472 0.451 0.467 0.476]
Mean Accuracy of Random Shuffle Split 

Accuracies for Random Shuffle Split for Decision Tree : [0.391 0.361 0.361 0.364 0.365 0.359 0.38  0.359 0.362 0.368]
Mean Accuracy of Random Shuffle Split for Decision Tree : 36.70068027210884
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.391 0.367 0.367 0.357 0.373 0.353 0.374 0.35  0.366 0.371]
Mean Accuracy of Random Shuffle Split for Decision Tree : 36.70068027210884
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.393 0.366 0.37  0.367 0.365 0.362 0.362 0.361 0.367 0.366]
Mean Accuracy of Random Shuffle Split for Decision Tree : 36.78231292517007
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.401 0.363 0.368 0.362 0.376 0.374 0.365 0.357 0.357 0.368]
Mean Accuracy of Random Shuffle Split for Decision Tree : 36.904761904761905
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.226 0.231 0.233 0.236 0.23  0.222 0.2