In [1]:
# Single import cell for the whole notebook: data handling first, then the
# scikit-learn estimators and model-selection helpers.
# NOTE(review): the except branch below only prints the error and lets execution
# continue, so a failed import leaves the name unbound and shows up later as a
# NameError in whichever cell first uses it.
try:
    import pandas as pd
    import numpy as np
    import os,sys
    import re
    # importing algorithms
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.naive_bayes import BernoulliNB
    from sklearn import svm
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix, classification_report
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.model_selection import ShuffleSplit
    from sklearn.model_selection import cross_val_score
except Exception as e:
    # Report the failure without re-raising (see the note above the try).
    print("Error is due to",e)

In [2]:
# Dataset paths are built relative to the directory the notebook runs from.
pwd = os.getcwd()
# Read the label CSV into a dataframe.
labels_df = pd.read_csv(pwd+"//Datasets//Nisha//Input//Nisha_dataset_labels.csv")
# Flatten the dataframe's values into a 1-D numpy array of labels.
labels = np.ravel(labels_df.to_numpy())

In [3]:
# Function for Modelling and extracting Metrics
def ml_training(ml_model, x_fold, y_fold, model_name,
                n_splits=10, test_size=0.3, random_state=7):
    """Cross-validate ``ml_model`` on random shuffle splits and print the scores.

    Parameters
    ----------
    ml_model
        Estimator handed to ``cross_val_score``.
    x_fold
        Feature matrix passed to ``cross_val_score`` as ``X``.
    y_fold
        Targets passed to ``cross_val_score`` as ``y``.
    model_name
        Label inserted into the printed report lines.
    n_splits, test_size, random_state
        Forwarded to ``ShuffleSplit``. The defaults (10 splits, 30% held out,
        seed 7) reproduce the values that used to be hard-coded here, so
        existing four-argument calls behave exactly as before.

    Returns
    -------
    None
        The function only prints: the per-split scores, their mean multiplied
        by 100, and a 70-character ``=`` separator. ``cross_val_score`` is
        called without ``scoring``, so each value is whatever the estimator's
        own ``score`` reports; the printed labels call these accuracies.
    """
    rsfold = ShuffleSplit(n_splits=n_splits, test_size=test_size,
                          random_state=random_state)
    results = cross_val_score(ml_model, x_fold, y_fold, cv=rsfold)
    print(f"Accuracies for Random Shuffle Split for {model_name} :", results)
    print(f"Mean Accuracy of Random Shuffle Split for {model_name} :", results.mean()*100.0)
    print(70*"=")

### Bag of words Models

In [4]:
# TFIDF vectorized data: every model below is scored on this feature matrix.
x_df = pd.read_csv(pwd+"//Datasets//Nisha//BagOfWords//tfidf_500_vectors.csv")

# --- Logistic regression ---
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# --- KNN: one run per neighbour count from 3 to 8 ---
neighbors_list = list(range(3, 9))
for n_neighbors in neighbors_list:
    tv_knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    print("KNN with", n_neighbors, "Neighbors")
    ml_training(tv_knn_model, x_df, labels, "KNN Model")

# --- Gaussian Naive Bayes ---
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model, x_df, labels, "Gaussian Naive Bayes")

# --- Bernoulli Naive Bayes ---
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model, x_df, labels, "Bernoulli Naive Bayes")

# --- Support vector classifier: one run per kernel ---
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for kernel_name in svm_kernels:
    tv_svm_model = svm.SVC(kernel=kernel_name)
    print("Working on SVM Kernal:", kernel_name)
    ml_training(tv_svm_model, x_df, labels, "SVM")

# --- Decision tree: max_depth swept from 1 to 20 ---
for depth in range(1, 21):
    tv_dt_model = DecisionTreeClassifier(max_depth=depth, random_state=3)
    print("Decision Tree with", depth, "max_depth")
    ml_training(tv_dt_model, x_df, labels, "Decision Tree")

# --- Random forest: max_depth swept from 1 to 20 ---
for depth in range(1, 21):
    tv_rf_model = RandomForestClassifier(random_state=3, max_depth=depth)
    print("Random Forest with", depth, "max_depth")
    ml_training(tv_rf_model, x_df, labels, "Random Forest")

# --- Multinomial Naive Bayes ---
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model, x_df, labels, "Multinomial Naive Bayes")

Accuracies for Random Shuffle Split for Logistic Regression : [0.70952381 0.72244898 0.71972789 0.71020408 0.70340136 0.7170068
 0.68911565 0.72993197 0.71904762 0.73061224]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 71.51020408163265
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.51564626 0.52517007 0.55510204 0.50340136 0.53809524 0.53469388
 0.54421769 0.50748299 0.53333333 0.52040816]
Mean Accuracy of Random Shuffle Split for KNN Model : 52.775510204081634
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.54489796 0.49863946 0.51972789 0.52789116 0.5122449  0.5
 0.53333333 0.51496599 0.52108844 0.53061224]
Mean Accuracy of Random Shuffle Split for KNN Model : 52.034013605442176
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.53877551 0.5170068  0.51564626 0.51292517 0.51020408 0.50340136
 0.52517007 0.50952381 0.51632653 0.53197279]
Mean Accuracy of Random Shuffle Split for KNN M

Accuracies for Random Shuffle Split for Decision Tree : [0.52993197 0.51088435 0.52380952 0.53401361 0.50340136 0.53129252
 0.53537415 0.52517007 0.52244898 0.51904762]
Mean Accuracy of Random Shuffle Split for Decision Tree : 52.35374149659864
Decision Tree with 14 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.53605442 0.51632653 0.53877551 0.53401361 0.50884354 0.53129252
 0.54081633 0.53469388 0.53537415 0.51972789]
Mean Accuracy of Random Shuffle Split for Decision Tree : 52.959183673469404
Decision Tree with 15 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.54285714 0.5170068  0.54421769 0.53945578 0.50952381 0.53537415
 0.54217687 0.5414966  0.54557823 0.52176871]
Mean Accuracy of Random Shuffle Split for Decision Tree : 53.394557823129254
Decision Tree with 16 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.54693878 0.5170068  0.5537415  0.55238095 0.51632653 0.54897959
 0.55034014 0.54897959 0.55034014 0.54557823

Accuracies for Random Shuffle Split for Random Forest : [0.66326531 0.65646259 0.64285714 0.66258503 0.63741497 0.65442177
 0.66870748 0.66666667 0.66870748 0.6585034 ]
Mean Accuracy of Random Shuffle Split for Random Forest : 65.79591836734696
Random Forest with 18 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.6585034  0.6537415  0.64693878 0.66190476 0.64013605 0.66530612
 0.66462585 0.66394558 0.66122449 0.66598639]
Mean Accuracy of Random Shuffle Split for Random Forest : 65.82312925170068
Random Forest with 19 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.65578231 0.65646259 0.65646259 0.66530612 0.63469388 0.66394558
 0.66258503 0.66122449 0.67006803 0.66462585]
Mean Accuracy of Random Shuffle Split for Random Forest : 65.91156462585033
Random Forest with 20 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.66258503 0.65510204 0.66666667 0.65714286 0.64217687 0.66394558
 0.66530612 0.65578231 0.67823129 0.66938776]


In [5]:
# Count Vectorizer vectorized data: every model below is scored on this feature matrix.
x_df = pd.read_csv(pwd+"//Datasets//Nisha//BagOfWords//cv_500_vectors.csv")

# --- Logistic regression ---
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# --- KNN: one run per neighbour count from 3 to 8 ---
neighbors_list = list(range(3, 9))
for n_neighbors in neighbors_list:
    tv_knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    print("KNN with", n_neighbors, "Neighbors")
    ml_training(tv_knn_model, x_df, labels, "KNN Model")

# --- Gaussian Naive Bayes ---
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model, x_df, labels, "Gaussian Naive Bayes")

# --- Bernoulli Naive Bayes ---
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model, x_df, labels, "Bernoulli Naive Bayes")

# --- Support vector classifier: one run per kernel ---
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for kernel_name in svm_kernels:
    tv_svm_model = svm.SVC(kernel=kernel_name)
    print("Working on SVM Kernal:", kernel_name)
    ml_training(tv_svm_model, x_df, labels, "SVM")

# --- Decision tree: max_depth swept from 1 to 20 ---
for depth in range(1, 21):
    tv_dt_model = DecisionTreeClassifier(max_depth=depth, random_state=3)
    print("Decision Tree with", depth, "max_depth")
    ml_training(tv_dt_model, x_df, labels, "Decision Tree")

# --- Random forest: max_depth swept from 1 to 20 ---
for depth in range(1, 21):
    tv_rf_model = RandomForestClassifier(random_state=3, max_depth=depth)
    print("Random Forest with", depth, "max_depth")
    ml_training(tv_rf_model, x_df, labels, "Random Forest")

# --- Multinomial Naive Bayes ---
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model, x_df, labels, "Multinomial Naive Bayes")

Accuracies for Random Shuffle Split for Logistic Regression : [0.72040816 0.72040816 0.71904762 0.72653061 0.70340136 0.72721088
 0.71632653 0.73265306 0.72653061 0.73741497]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 72.29931972789115
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.50136054 0.54693878 0.53333333 0.5047619  0.53061224 0.5414966
 0.54353741 0.49659864 0.54897959 0.52108844]
Mean Accuracy of Random Shuffle Split for KNN Model : 52.68707482993198
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.51428571 0.51972789 0.51428571 0.53741497 0.49795918 0.54217687
 0.52244898 0.51156463 0.51496599 0.53061224]
Mean Accuracy of Random Shuffle Split for KNN Model : 52.05442176870749
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.5244898  0.51292517 0.52312925 0.53129252 0.48707483 0.51768707
 0.53537415 0.52585034 0.52721088 0.53605442]
Mean Accuracy of Random Shuffle Split for 

Accuracies for Random Shuffle Split for Decision Tree : [0.54285714 0.52857143 0.55510204 0.55170068 0.5122449  0.54693878
 0.53741497 0.54353741 0.55578231 0.53809524]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.12244897959183
Decision Tree with 14 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.54693878 0.53469388 0.55918367 0.55918367 0.52040816 0.54353741
 0.54965986 0.5537415  0.56190476 0.5414966 ]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.707482993197274
Decision Tree with 15 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.54897959 0.53741497 0.56326531 0.56666667 0.52380952 0.5537415
 0.55034014 0.56802721 0.56190476 0.54489796]
Mean Accuracy of Random Shuffle Split for Decision Tree : 55.19047619047619
Decision Tree with 16 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.54761905 0.54421769 0.57210884 0.57414966 0.53333333 0.56190476
 0.54829932 0.56666667 0.57278912 0.55238095]


Accuracies for Random Shuffle Split for Random Forest : [0.65782313 0.64353741 0.6462585  0.64285714 0.62653061 0.64829932
 0.65170068 0.64693878 0.63809524 0.65306122]
Mean Accuracy of Random Shuffle Split for Random Forest : 64.55102040816325
Random Forest with 18 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.66530612 0.62653061 0.65306122 0.64081633 0.62721088 0.65102041
 0.65714286 0.65646259 0.65306122 0.65918367]
Mean Accuracy of Random Shuffle Split for Random Forest : 64.89795918367346
Random Forest with 19 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.65442177 0.63401361 0.64557823 0.64353741 0.62585034 0.65578231
 0.65646259 0.64897959 0.65442177 0.6585034 ]
Mean Accuracy of Random Shuffle Split for Random Forest : 64.77551020408163
Random Forest with 20 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.66122449 0.63197279 0.65102041 0.64761905 0.62585034 0.65306122
 0.65578231 0.65238095 0.65238095 0.65918367]


In [6]:
# Term Frequency vectorized data: every model below is scored on this feature matrix.
x_df = pd.read_csv(pwd+"//Datasets//Nisha//BagOfWords//tf_500_vectors.csv")

# --- Logistic regression ---
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# --- KNN: one run per neighbour count from 3 to 8 ---
neighbors_list = list(range(3, 9))
for n_neighbors in neighbors_list:
    tv_knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    print("KNN with", n_neighbors, "Neighbors")
    ml_training(tv_knn_model, x_df, labels, "KNN Model")

# --- Gaussian Naive Bayes ---
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model, x_df, labels, "Gaussian Naive Bayes")

# --- Bernoulli Naive Bayes ---
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model, x_df, labels, "Bernoulli Naive Bayes")

# --- Support vector classifier: one run per kernel ---
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for kernel_name in svm_kernels:
    tv_svm_model = svm.SVC(kernel=kernel_name)
    print("Working on SVM Kernal:", kernel_name)
    ml_training(tv_svm_model, x_df, labels, "SVM")

# --- Decision tree: max_depth swept from 1 to 20 ---
for depth in range(1, 21):
    tv_dt_model = DecisionTreeClassifier(max_depth=depth, random_state=3)
    print("Decision Tree with", depth, "max_depth")
    ml_training(tv_dt_model, x_df, labels, "Decision Tree")

# --- Random forest: max_depth swept from 1 to 20 ---
for depth in range(1, 21):
    tv_rf_model = RandomForestClassifier(random_state=3, max_depth=depth)
    print("Random Forest with", depth, "max_depth")
    ml_training(tv_rf_model, x_df, labels, "Random Forest")

# --- Multinomial Naive Bayes ---
tv_mnb_model = MultinomialNB()
ml_training(tv_mnb_model, x_df, labels, "Multinomial Naive Bayes")

Accuracies for Random Shuffle Split for Logistic Regression : [0.69319728 0.70544218 0.70816327 0.70680272 0.68639456 0.69183673
 0.68843537 0.72244898 0.71904762 0.71020408]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 70.31972789115646
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.52585034 0.54897959 0.57142857 0.52857143 0.55034014 0.55782313
 0.56938776 0.51632653 0.56530612 0.55034014]
Mean Accuracy of Random Shuffle Split for KNN Model : 54.84353741496599
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.56190476 0.54761905 0.57006803 0.52653061 0.51088435 0.51904762
 0.5462585  0.5585034  0.5244898  0.55714286]
Mean Accuracy of Random Shuffle Split for KNN Model : 54.22448979591837
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.55646259 0.54897959 0.55578231 0.53673469 0.50748299 0.51768707
 0.54421769 0.54693878 0.54761905 0.54897959]
Mean Accuracy of Random Shuffle Split for

Accuracies for Random Shuffle Split for Decision Tree : [0.51972789 0.50748299 0.5170068  0.52993197 0.49795918 0.5292517
 0.51768707 0.51768707 0.52176871 0.51496599]
Mean Accuracy of Random Shuffle Split for Decision Tree : 51.73469387755103
Decision Tree with 14 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.53401361 0.52312925 0.52312925 0.53809524 0.50204082 0.53197279
 0.52176871 0.53197279 0.53537415 0.5244898 ]
Mean Accuracy of Random Shuffle Split for Decision Tree : 52.65986394557823
Decision Tree with 15 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.53469388 0.52380952 0.53469388 0.54557823 0.50612245 0.53809524
 0.53129252 0.54489796 0.54557823 0.53265306]
Mean Accuracy of Random Shuffle Split for Decision Tree : 53.37414965986393
Decision Tree with 16 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.53265306 0.52721088 0.54217687 0.53809524 0.51360544 0.53877551
 0.53197279 0.54829932 0.54761905 0.53401361]
M

Accuracies for Random Shuffle Split for Random Forest : [0.65510204 0.64557823 0.65102041 0.64421769 0.63809524 0.65714286
 0.66054422 0.66598639 0.65238095 0.67006803]
Mean Accuracy of Random Shuffle Split for Random Forest : 65.4013605442177
Random Forest with 18 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.65782313 0.65578231 0.65646259 0.64761905 0.64421769 0.66734694
 0.66122449 0.66938776 0.66190476 0.6707483 ]
Mean Accuracy of Random Shuffle Split for Random Forest : 65.9251700680272
Random Forest with 19 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.6537415  0.64693878 0.65102041 0.65238095 0.63809524 0.66258503
 0.67006803 0.66802721 0.66666667 0.67210884]
Mean Accuracy of Random Shuffle Split for Random Forest : 65.81632653061224
Random Forest with 20 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.65646259 0.64421769 0.66122449 0.65918367 0.6462585  0.66598639
 0.66054422 0.66326531 0.66462585 0.6707483 ]
Me

### Sentence Transformer Models

In [7]:
# BERT vectorized data: every model below is scored on this feature matrix.
x_df = pd.read_csv(pwd+"//Datasets//Nisha//SentenceTransformers//bert_vectorized_Nisha_dataset.csv")

# --- Logistic regression ---
# NOTE(review): the output recorded for this cell contains repeated
# "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings that point at the
# logistic-regression docs, so this fit presumably stops at max_iter=1000
# without converging - confirm, then consider scaling or a higher max_iter.
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# --- KNN: one run per neighbour count from 3 to 8 ---
neighbors_list = list(range(3, 9))
for n_neighbors in neighbors_list:
    tv_knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    print("KNN with", n_neighbors, "Neighbors")
    ml_training(tv_knn_model, x_df, labels, "KNN Model")

# --- Gaussian Naive Bayes ---
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model, x_df, labels, "Gaussian Naive Bayes")

# --- Bernoulli Naive Bayes ---
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model, x_df, labels, "Bernoulli Naive Bayes")

# --- Support vector classifier: one run per kernel ---
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for kernel_name in svm_kernels:
    tv_svm_model = svm.SVC(kernel=kernel_name)
    print("Working on SVM Kernal:", kernel_name)
    ml_training(tv_svm_model, x_df, labels, "SVM")

# --- Decision tree: max_depth swept from 1 to 20 ---
for depth in range(1, 21):
    tv_dt_model = DecisionTreeClassifier(max_depth=depth, random_state=3)
    print("Decision Tree with", depth, "max_depth")
    ml_training(tv_dt_model, x_df, labels, "Decision Tree")

# --- Random forest: max_depth swept from 1 to 20 ---
for depth in range(1, 21):
    tv_rf_model = RandomForestClassifier(random_state=3, max_depth=depth)
    print("Random Forest with", depth, "max_depth")
    ml_training(tv_rf_model, x_df, labels, "Random Forest")
    
# scaling using MinMax scaler
# Imported locally so this cell does not rely on the import cell providing it.
from sklearn.pipeline import make_pipeline

# The scaler maps every feature into [0, 10] before Multinomial Naive Bayes
# (presumably because the embedding values can be negative, which MultinomialNB
# does not accept when fitting - confirm).
mms_scale = MinMaxScaler(feature_range=(0, 10))
# Kernel-wide setting: every numpy array printed from here on uses 3 decimals.
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
# Scaler and classifier are chained in one pipeline so cross_val_score re-fits the
# scaler on the training rows of each split. Fitting it once on the full matrix
# beforehand would let the held-out rows influence the scaling (data leakage).
tv_mnb_model = make_pipeline(mms_scale, MultinomialNB())
ml_training(tv_mnb_model, x_df, labels, "Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.73401361 0.74217687 0.74489796 0.73673469 0.73605442 0.73129252
 0.73945578 0.72993197 0.7244898  0.74081633]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 73.59863945578232
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.63537415 0.63197279 0.62653061 0.65170068 0.63265306 0.63469388
 0.63333333 0.63061224 0.63401361 0.63877551]
Mean Accuracy of Random Shuffle Split for KNN Model : 63.49659863945577
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.63945578 0.64965986 0.64081633 0.64897959 0.63265306 0.65170068
 0.63741497 0.6537415  0.63741497 0.65782313]
Mean Accuracy of Random Shuffle Split for KNN Model : 64.49659863945578
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.66122449 0.64693878 0.65034014 0.66258503 0.63605442 0.64965986
 0.64353741 0.64829932 0.65238095 0.65918367]
Mean Accuracy of Random Shuffle Split for

Accuracies for Random Shuffle Split for Decision Tree : [0.5462585  0.57482993 0.53401361 0.53945578 0.53673469 0.53333333
 0.55238095 0.56190476 0.53401361 0.54013605]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.530612244897966
Decision Tree with 14 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.54081633 0.5829932  0.52653061 0.54421769 0.53809524 0.53741497
 0.55646259 0.55306122 0.54353741 0.53673469]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.59863945578232
Decision Tree with 15 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.55510204 0.57414966 0.5292517  0.54421769 0.53197279 0.52585034
 0.54965986 0.5462585  0.53469388 0.53945578]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.306122448979586
Decision Tree with 16 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.5585034  0.57346939 0.53605442 0.53537415 0.52653061 0.52312925
 0.55714286 0.55102041 0.53537415 0.53945578

Accuracies for Random Shuffle Split for Random Forest : [0.68639456 0.69591837 0.68367347 0.69455782 0.66870748 0.66258503
 0.67959184 0.69931973 0.67278912 0.69863946]
Mean Accuracy of Random Shuffle Split for Random Forest : 68.421768707483
Random Forest with 18 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.69863946 0.69727891 0.67823129 0.70408163 0.66870748 0.67006803
 0.69183673 0.70340136 0.67959184 0.68911565]
Mean Accuracy of Random Shuffle Split for Random Forest : 68.80952380952381
Random Forest with 19 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.69183673 0.69795918 0.67687075 0.70680272 0.66802721 0.66734694
 0.69659864 0.70272109 0.68231293 0.68979592]
Mean Accuracy of Random Shuffle Split for Random Forest : 68.80272108843538
Random Forest with 20 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.69047619 0.69387755 0.67823129 0.71156463 0.67142857 0.66734694
 0.68503401 0.70816327 0.67346939 0.69659864]
Me

In [8]:
# GKB BERT vectorized data: every model below is scored on this feature matrix.
x_df = pd.read_csv(pwd+"//Datasets//Nisha//SentenceTransformers//bert_vectorized_Nisha_dataset_gkb.csv")

# --- Logistic regression ---
# NOTE(review): the output recorded for this cell contains repeated
# "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings that point at the
# logistic-regression docs, so this fit presumably stops at max_iter=1000
# without converging - confirm, then consider scaling or a higher max_iter.
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# --- KNN: one run per neighbour count from 3 to 8 ---
neighbors_list = list(range(3, 9))
for n_neighbors in neighbors_list:
    tv_knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    print("KNN with", n_neighbors, "Neighbors")
    ml_training(tv_knn_model, x_df, labels, "KNN Model")

# --- Gaussian Naive Bayes ---
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model, x_df, labels, "Gaussian Naive Bayes")

# --- Bernoulli Naive Bayes ---
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model, x_df, labels, "Bernoulli Naive Bayes")

# --- Support vector classifier: one run per kernel ---
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for kernel_name in svm_kernels:
    tv_svm_model = svm.SVC(kernel=kernel_name)
    print("Working on SVM Kernal:", kernel_name)
    ml_training(tv_svm_model, x_df, labels, "SVM")

# --- Decision tree: max_depth swept from 1 to 20 ---
for depth in range(1, 21):
    tv_dt_model = DecisionTreeClassifier(max_depth=depth, random_state=3)
    print("Decision Tree with", depth, "max_depth")
    ml_training(tv_dt_model, x_df, labels, "Decision Tree")

# --- Random forest: max_depth swept from 1 to 20 ---
for depth in range(1, 21):
    tv_rf_model = RandomForestClassifier(random_state=3, max_depth=depth)
    print("Random Forest with", depth, "max_depth")
    ml_training(tv_rf_model, x_df, labels, "Random Forest")
    
# scaling using MinMax scaler
# Imported locally so this cell does not rely on the import cell providing it.
from sklearn.pipeline import make_pipeline

# The scaler maps every feature into [0, 10] before Multinomial Naive Bayes
# (presumably because the embedding values can be negative, which MultinomialNB
# does not accept when fitting - confirm).
mms_scale = MinMaxScaler(feature_range=(0, 10))
# Kernel-wide setting: every numpy array printed from here on uses 3 decimals.
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
# Scaler and classifier are chained in one pipeline so cross_val_score re-fits the
# scaler on the training rows of each split. Fitting it once on the full matrix
# beforehand would let the held-out rows influence the scaling (data leakage).
tv_mnb_model = make_pipeline(mms_scale, MultinomialNB())
ml_training(tv_mnb_model, x_df, labels, "Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.552 0.54  0.531 0.548 0.524 0.548 0.553 0.548 0.556 0.537]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 54.38775510204083
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.461 0.442 0.448 0.445 0.439 0.435 0.456 0.437 0.441 0.461]
Mean Accuracy of Random Shuffle Split for KNN Model : 44.65986394557822
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.468 0.448 0.453 0.452 0.45  0.46  0.467 0.461 0.448 0.464]
Mean Accuracy of Random Shuffle Split for KNN Model : 45.714285714285715
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.459 0.454 0.446 0.456 0.44  0.469 0.468 0.464 0.453 0.461]
Mean Accuracy of Random Shuffle Split for KNN Model : 45.70068027210884
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.463 0.448 0.451 0.451 0.429 0.463 0.481 0.459 0.456 0.457]
Mean Accuracy of Random Shuffle Split

Accuracies for Random Shuffle Split for Decision Tree : [0.439 0.434 0.431 0.448 0.422 0.445 0.463 0.432 0.422 0.443]
Mean Accuracy of Random Shuffle Split for Decision Tree : 43.76870748299319
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.44  0.435 0.44  0.442 0.427 0.452 0.467 0.427 0.423 0.448]
Mean Accuracy of Random Shuffle Split for Decision Tree : 44.02721088435374
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.442 0.439 0.427 0.437 0.418 0.444 0.451 0.432 0.43  0.453]
Mean Accuracy of Random Shuffle Split for Decision Tree : 43.72108843537415
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.431 0.427 0.428 0.443 0.416 0.448 0.461 0.433 0.423 0.442]
Mean Accuracy of Random Shuffle Split for Decision Tree : 43.51700680272109
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.333 0.36  0.337 0.317 0.321 0.359 0.37

In [9]:
# N Distill BERT vectorized data: every model below is scored on this feature matrix.
x_df = pd.read_csv(pwd+"//Datasets//Nisha//SentenceTransformers//bert_vectorized_Nisha_dataset_ndisbert.csv")

# --- Logistic regression ---
# NOTE(review): the output recorded for this cell contains repeated
# "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings that point at the
# logistic-regression docs, so this fit presumably stops at max_iter=1000
# without converging - confirm, then consider scaling or a higher max_iter.
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# --- KNN: one run per neighbour count from 3 to 8 ---
neighbors_list = list(range(3, 9))
for n_neighbors in neighbors_list:
    tv_knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    print("KNN with", n_neighbors, "Neighbors")
    ml_training(tv_knn_model, x_df, labels, "KNN Model")

# --- Gaussian Naive Bayes ---
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model, x_df, labels, "Gaussian Naive Bayes")

# --- Bernoulli Naive Bayes ---
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model, x_df, labels, "Bernoulli Naive Bayes")

# --- Support vector classifier: one run per kernel ---
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for kernel_name in svm_kernels:
    tv_svm_model = svm.SVC(kernel=kernel_name)
    print("Working on SVM Kernal:", kernel_name)
    ml_training(tv_svm_model, x_df, labels, "SVM")

# --- Decision tree: max_depth swept from 1 to 20 ---
for depth in range(1, 21):
    tv_dt_model = DecisionTreeClassifier(max_depth=depth, random_state=3)
    print("Decision Tree with", depth, "max_depth")
    ml_training(tv_dt_model, x_df, labels, "Decision Tree")

# --- Random forest: max_depth swept from 1 to 20 ---
for depth in range(1, 21):
    tv_rf_model = RandomForestClassifier(random_state=3, max_depth=depth)
    print("Random Forest with", depth, "max_depth")
    ml_training(tv_rf_model, x_df, labels, "Random Forest")
    
# scaling using MinMax scaler
# Imported locally so this cell does not rely on the import cell providing it.
from sklearn.pipeline import make_pipeline

# The scaler maps every feature into [0, 10] before Multinomial Naive Bayes
# (presumably because the embedding values can be negative, which MultinomialNB
# does not accept when fitting - confirm).
mms_scale = MinMaxScaler(feature_range=(0, 10))
# Kernel-wide setting: every numpy array printed from here on uses 3 decimals.
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
# Scaler and classifier are chained in one pipeline so cross_val_score re-fits the
# scaler on the training rows of each split. Fitting it once on the full matrix
# beforehand would let the held-out rows influence the scaling (data leakage).
tv_mnb_model = make_pipeline(mms_scale, MultinomialNB())
ml_training(tv_mnb_model, x_df, labels, "Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.713 0.735 0.74  0.737 0.727 0.712 0.73  0.724 0.75  0.725]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 72.93197278911563
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.617 0.615 0.604 0.627 0.599 0.607 0.625 0.601 0.61  0.607]
Mean Accuracy of Random Shuffle Split for KNN Model : 61.11564625850339
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.634 0.629 0.631 0.65  0.62  0.635 0.636 0.612 0.624 0.611]
Mean Accuracy of Random Shuffle Split for KNN Model : 62.83673469387756
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.635 0.63  0.637 0.652 0.616 0.629 0.634 0.614 0.636 0.619]
Mean Accuracy of Random Shuffle Split for KNN Model : 63.02721088435375
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.634 0.62  0.639 0.646 0.616 0.62  0.639 0.62  0.635 0.618]
Mean Accuracy of Random Shuffle Split 

Accuracies for Random Shuffle Split for Decision Tree : [0.511 0.527 0.525 0.538 0.514 0.529 0.532 0.523 0.534 0.501]
Mean Accuracy of Random Shuffle Split for Decision Tree : 52.33333333333333
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.508 0.53  0.516 0.527 0.501 0.527 0.546 0.522 0.529 0.5  ]
Mean Accuracy of Random Shuffle Split for Decision Tree : 52.06122448979593
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.515 0.532 0.514 0.535 0.502 0.532 0.535 0.522 0.537 0.509]
Mean Accuracy of Random Shuffle Split for Decision Tree : 52.34013605442177
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.514 0.537 0.528 0.527 0.506 0.531 0.529 0.518 0.539 0.496]
Mean Accuracy of Random Shuffle Split for Decision Tree : 52.244897959183675
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.304 0.288 0.271 0.278 0.265 0.295 0.2

In [10]:
# V BERT vectorized data: every model below is scored on this feature matrix.
x_df = pd.read_csv(pwd+"//Datasets//Nisha//SentenceTransformers//bert_vectorized_Nisha_dataset_vbert.csv")

# --- Logistic regression ---
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# --- KNN: one run per neighbour count from 3 to 8 ---
neighbors_list = list(range(3, 9))
for n_neighbors in neighbors_list:
    tv_knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    print("KNN with", n_neighbors, "Neighbors")
    ml_training(tv_knn_model, x_df, labels, "KNN Model")

# --- Gaussian Naive Bayes ---
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model, x_df, labels, "Gaussian Naive Bayes")

# --- Bernoulli Naive Bayes ---
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model, x_df, labels, "Bernoulli Naive Bayes")

# --- Support vector classifier: one run per kernel ---
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for kernel_name in svm_kernels:
    tv_svm_model = svm.SVC(kernel=kernel_name)
    print("Working on SVM Kernal:", kernel_name)
    ml_training(tv_svm_model, x_df, labels, "SVM")

# --- Decision tree: max_depth swept from 1 to 20 ---
for depth in range(1, 21):
    tv_dt_model = DecisionTreeClassifier(max_depth=depth, random_state=3)
    print("Decision Tree with", depth, "max_depth")
    ml_training(tv_dt_model, x_df, labels, "Decision Tree")

# --- Random forest: max_depth swept from 1 to 20 ---
for depth in range(1, 21):
    tv_rf_model = RandomForestClassifier(random_state=3, max_depth=depth)
    print("Random Forest with", depth, "max_depth")
    ml_training(tv_rf_model, x_df, labels, "Random Forest")
    
# scaling using MinMax scaler
# Imported locally so this cell does not rely on the import cell providing it.
from sklearn.pipeline import make_pipeline

# The scaler maps every feature into [0, 10] before Multinomial Naive Bayes
# (presumably because the embedding values can be negative, which MultinomialNB
# does not accept when fitting - confirm).
mms_scale = MinMaxScaler(feature_range=(0, 10))
# Kernel-wide setting: every numpy array printed from here on uses 3 decimals.
np.set_printoptions(precision=3)
# Multinomial Naive Bayes
# Scaler and classifier are chained in one pipeline so cross_val_score re-fits the
# scaler on the training rows of each split. Fitting it once on the full matrix
# beforehand would let the held-out rows influence the scaling (data leakage).
tv_mnb_model = make_pipeline(mms_scale, MultinomialNB())
ml_training(tv_mnb_model, x_df, labels, "Multinomial Naive Bayes")

Accuracies for Random Shuffle Split for Logistic Regression : [0.758 0.761 0.754 0.769 0.752 0.745 0.747 0.778 0.765 0.77 ]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 75.98639455782312
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.645 0.642 0.617 0.654 0.631 0.633 0.648 0.636 0.64  0.66 ]
Mean Accuracy of Random Shuffle Split for KNN Model : 64.05442176870748
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.659 0.645 0.626 0.667 0.646 0.65  0.656 0.639 0.643 0.657]
Mean Accuracy of Random Shuffle Split for KNN Model : 64.87755102040816
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.65  0.652 0.634 0.667 0.65  0.64  0.654 0.653 0.644 0.664]
Mean Accuracy of Random Shuffle Split for KNN Model : 65.07482993197277
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.663 0.646 0.637 0.665 0.653 0.648 0.663 0.662 0.652 0.671]
Mean Accuracy of Random Shuffle Split 

Accuracies for Random Shuffle Split for Decision Tree : [0.53  0.522 0.505 0.514 0.493 0.535 0.52  0.516 0.519 0.516]
Mean Accuracy of Random Shuffle Split for Decision Tree : 51.687074829931966
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.53  0.516 0.51  0.505 0.493 0.528 0.514 0.514 0.527 0.526]
Mean Accuracy of Random Shuffle Split for Decision Tree : 51.61904761904762
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.52  0.512 0.516 0.509 0.501 0.531 0.524 0.519 0.532 0.52 ]
Mean Accuracy of Random Shuffle Split for Decision Tree : 51.84353741496598
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.519 0.515 0.512 0.515 0.488 0.536 0.522 0.513 0.528 0.52 ]
Mean Accuracy of Random Shuffle Split for Decision Tree : 51.680272108843525
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.439 0.45  0.355 0.363 0.331 0.421 0.

In [11]:
# GPT vectorized data
# Benchmarks each classifier family on the GPT sentence-transformer features.
# ml_training (defined earlier in this notebook) prints the per-split and mean
# accuracy over its random shuffle splits for every model passed to it.
x_df = pd.read_csv(os.path.join(
    pwd, "Datasets", "Nisha", "SentenceTransformers",
    "gpt_vectorized_Nisha_dataset.csv"))

# Logistic regression
# NOTE(review): the recorded output of this cell shows the solver stopping at
# the iteration limit; scaling the features or raising max_iter may be needed.
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model: one evaluation per neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for n_neighbors in neighbors_list:
    print("KNN with", n_neighbors, "Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    ml_training(tv_knn_model, x_df, labels, "KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model, x_df, labels, "Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model, x_df, labels, "Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one evaluation per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model, x_df, labels, "SVM")

# Decision Tree Classifier: sweep max_depth from 1 to 20
for depth in range(1, 21):
    print("Decision Tree with", depth, "max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=depth)
    ml_training(tv_dt_model, x_df, labels, "Decision Tree")

# Random Forest: sweep max_depth from 1 to 20
for depth in range(1, 21):
    print("Random Forest with", depth, "max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=depth, random_state=3)
    ml_training(tv_rf_model, x_df, labels, "Random Forest")

# Multinomial Naive Bayes on features rescaled to [0, 10]
# (presumably because MultinomialNB rejects negative inputs -- confirm).
# The scaler lives inside a pipeline so that cross-validation refits it on each
# training split only; fitting it on the full frame up front lets held-out rows
# influence the scaling (preprocessing leakage).
# NOTE(review): held-out values may land slightly outside [0, 10] after this
# change -- confirm MultinomialNB handles that at predict time in your version.
from sklearn.pipeline import make_pipeline  # local: not part of the notebook's import cell
np.set_printoptions(precision=3)
mms_scale = MinMaxScaler(feature_range=(0, 10))
tv_mnb_model = make_pipeline(mms_scale, MultinomialNB())
ml_training(tv_mnb_model, x_df, labels, "Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.727 0.735 0.727 0.72  0.73  0.725 0.722 0.74  0.722 0.732]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 72.80952380952381
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.614 0.589 0.61  0.613 0.607 0.614 0.618 0.617 0.602 0.627]
Mean Accuracy of Random Shuffle Split for KNN Model : 61.108843537414955
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.607 0.607 0.624 0.605 0.599 0.615 0.628 0.62  0.608 0.635]
Mean Accuracy of Random Shuffle Split for KNN Model : 61.482993197278915
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.62  0.605 0.622 0.624 0.601 0.623 0.627 0.621 0.619 0.637]
Mean Accuracy of Random Shuffle Split for KNN Model : 61.999999999999986
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.62  0.601 0.624 0.622 0.61  0.622 0.627 0.617 0.624 0.639]
Mean Accuracy of Random Shuffle Spl

Accuracies for Random Shuffle Split for Decision Tree : [0.488 0.484 0.471 0.49  0.486 0.495 0.494 0.491 0.484 0.485]
Mean Accuracy of Random Shuffle Split for Decision Tree : 48.68707482993197
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.489 0.484 0.467 0.493 0.48  0.51  0.485 0.499 0.489 0.485]
Mean Accuracy of Random Shuffle Split for Decision Tree : 48.80272108843536
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.493 0.483 0.48  0.495 0.473 0.495 0.484 0.499 0.473 0.484]
Mean Accuracy of Random Shuffle Split for Decision Tree : 48.59183673469388
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.493 0.482 0.475 0.496 0.483 0.5   0.495 0.492 0.487 0.484]
Mean Accuracy of Random Shuffle Split for Decision Tree : 48.870748299319736
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.406 0.448 0.373 0.39  0.382 0.405 0.4

In [12]:
# XLM vectorized data
# Benchmarks each classifier family on the XLM sentence-transformer features.
# ml_training (defined earlier in this notebook) prints the per-split and mean
# accuracy over its random shuffle splits for every model passed to it.
x_df = pd.read_csv(os.path.join(
    pwd, "Datasets", "Nisha", "SentenceTransformers",
    "xlm_vectorized_Nisha_dataset.csv"))

# Logistic regression
# NOTE(review): the recorded output of this cell shows the solver stopping at
# the iteration limit; scaling the features or raising max_iter may be needed.
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model: one evaluation per neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for n_neighbors in neighbors_list:
    print("KNN with", n_neighbors, "Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    ml_training(tv_knn_model, x_df, labels, "KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model, x_df, labels, "Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model, x_df, labels, "Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one evaluation per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model, x_df, labels, "SVM")

# Decision Tree Classifier: sweep max_depth from 1 to 20
for depth in range(1, 21):
    print("Decision Tree with", depth, "max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=depth)
    ml_training(tv_dt_model, x_df, labels, "Decision Tree")

# Random Forest: sweep max_depth from 1 to 20
for depth in range(1, 21):
    print("Random Forest with", depth, "max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=depth, random_state=3)
    ml_training(tv_rf_model, x_df, labels, "Random Forest")

# Multinomial Naive Bayes on features rescaled to [0, 10]
# (presumably because MultinomialNB rejects negative inputs -- confirm).
# The scaler lives inside a pipeline so that cross-validation refits it on each
# training split only; fitting it on the full frame up front lets held-out rows
# influence the scaling (preprocessing leakage).
# NOTE(review): held-out values may land slightly outside [0, 10] after this
# change -- confirm MultinomialNB handles that at predict time in your version.
from sklearn.pipeline import make_pipeline  # local: not part of the notebook's import cell
np.set_printoptions(precision=3)
mms_scale = MinMaxScaler(feature_range=(0, 10))
tv_mnb_model = make_pipeline(mms_scale, MultinomialNB())
ml_training(tv_mnb_model, x_df, labels, "Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracies for Random Shuffle Split for Logistic Regression : [0.742 0.745 0.725 0.751 0.737 0.741 0.735 0.748 0.74  0.742]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 74.07482993197279
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.622 0.635 0.639 0.652 0.626 0.634 0.635 0.643 0.641 0.644]
Mean Accuracy of Random Shuffle Split for KNN Model : 63.71428571428572
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.643 0.629 0.637 0.66  0.628 0.641 0.641 0.662 0.648 0.659]
Mean Accuracy of Random Shuffle Split for KNN Model : 64.46258503401361
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.656 0.641 0.639 0.661 0.637 0.639 0.652 0.665 0.649 0.661]
Mean Accuracy of Random Shuffle Split for KNN Model : 65.00680272108845
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.649 0.631 0.63  0.669 0.636 0.641 0.644 0.666 0.644 0.66 ]
Mean Accuracy of Random Shuffle Split 

Accuracies for Random Shuffle Split for Decision Tree : [0.533 0.55  0.554 0.544 0.533 0.545 0.552 0.558 0.55  0.528]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.46938775510205
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.544 0.546 0.548 0.55  0.521 0.546 0.558 0.558 0.546 0.537]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.530612244897966
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.541 0.55  0.554 0.546 0.526 0.536 0.548 0.558 0.546 0.537]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.42857142857142
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.544 0.55  0.554 0.549 0.53  0.537 0.544 0.558 0.544 0.537]
Mean Accuracy of Random Shuffle Split for Decision Tree : 54.47619047619048
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.457 0.487 0.435 0.476 0.442 0.461 0.4

### Fine Tuned Transformers Models

In [13]:
# BERT vectorized data
# Benchmarks each classifier family on the fine-tuned BERT base features.
# ml_training (defined earlier in this notebook) prints the per-split and mean
# accuracy over its random shuffle splits for every model passed to it.
x_df = pd.read_csv(os.path.join(
    pwd, "Datasets", "Nisha", "FineTunedTransformers",
    "bert_base_finetuned_vectorized_Nisha_dataset.csv"))

# Logistic regression
# NOTE(review): the recorded output of this cell shows the solver stopping at
# the iteration limit; scaling the features or raising max_iter may be needed.
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model: one evaluation per neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for n_neighbors in neighbors_list:
    print("KNN with", n_neighbors, "Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    ml_training(tv_knn_model, x_df, labels, "KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model, x_df, labels, "Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model, x_df, labels, "Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one evaluation per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model, x_df, labels, "SVM")

# Decision Tree Classifier: sweep max_depth from 1 to 20
for depth in range(1, 21):
    print("Decision Tree with", depth, "max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=depth)
    ml_training(tv_dt_model, x_df, labels, "Decision Tree")

# Random Forest: sweep max_depth from 1 to 20
for depth in range(1, 21):
    print("Random Forest with", depth, "max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=depth, random_state=3)
    ml_training(tv_rf_model, x_df, labels, "Random Forest")

# Multinomial Naive Bayes on features rescaled to [0, 10]
# (presumably because MultinomialNB rejects negative inputs -- confirm).
# The scaler lives inside a pipeline so that cross-validation refits it on each
# training split only; fitting it on the full frame up front lets held-out rows
# influence the scaling (preprocessing leakage).
# NOTE(review): held-out values may land slightly outside [0, 10] after this
# change -- confirm MultinomialNB handles that at predict time in your version.
from sklearn.pipeline import make_pipeline  # local: not part of the notebook's import cell
np.set_printoptions(precision=3)
mms_scale = MinMaxScaler(feature_range=(0, 10))
tv_mnb_model = make_pipeline(mms_scale, MultinomialNB())
ml_training(tv_mnb_model, x_df, labels, "Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.661 0.658 0.656 0.663 0.65  0.669 0.647 0.665 0.665 0.674]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 66.07482993197279
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.522 0.512 0.521 0.514 0.499 0.519 0.525 0.518 0.495 0.52 ]
Mean Accuracy of Random Shuffle Split for KNN Model : 51.46258503401361
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.551 0.515 0.533 0.529 0.515 0.529 0.528 0.531 0.512 0.535]
Mean Accuracy of Random Shuffle Split for KNN Model : 52.782312925170075
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.556 0.531 0.527 0.53  0.51  0.533 0.536 0.531 0.526 0.542]
Mean Accuracy of Random Shuffle Split for KNN Model : 53.21768707482993
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.552 0.531 0.52  0.548 0.512 0.538 0.539 0.531 0.528 0.545]
Mean Accuracy of Random Shuffle Split

Accuracies for Random Shuffle Split for Decision Tree : [0.394 0.43  0.403 0.399 0.405 0.414 0.406 0.387 0.39  0.392]
Mean Accuracy of Random Shuffle Split for Decision Tree : 40.2108843537415
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.389 0.419 0.397 0.393 0.39  0.41  0.42  0.388 0.402 0.393]
Mean Accuracy of Random Shuffle Split for Decision Tree : 39.993197278911566
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.392 0.418 0.408 0.395 0.396 0.412 0.412 0.382 0.39  0.393]
Mean Accuracy of Random Shuffle Split for Decision Tree : 39.986394557823125
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.388 0.426 0.395 0.407 0.392 0.405 0.415 0.391 0.39  0.389]
Mean Accuracy of Random Shuffle Split for Decision Tree : 39.993197278911566
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.268 0.363 0.267 0.287 0.256 0.237 0.

In [14]:
# Hinglish BERT vectorized data
# Benchmarks each classifier family on the fine-tuned Hinglish V BERT features.
# ml_training (defined earlier in this notebook) prints the per-split and mean
# accuracy over its random shuffle splits for every model passed to it.
x_df = pd.read_csv(os.path.join(
    pwd, "Datasets", "Nisha", "FineTunedTransformers",
    "vbert_hinglish_finetuned_vectorized_Nisha_dataset.csv"))

# Logistic regression
# NOTE(review): the recorded output of this cell shows the solver stopping at
# the iteration limit; scaling the features or raising max_iter may be needed.
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model: one evaluation per neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for n_neighbors in neighbors_list:
    print("KNN with", n_neighbors, "Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    ml_training(tv_knn_model, x_df, labels, "KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model, x_df, labels, "Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model, x_df, labels, "Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one evaluation per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model, x_df, labels, "SVM")

# Decision Tree Classifier: sweep max_depth from 1 to 20
for depth in range(1, 21):
    print("Decision Tree with", depth, "max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=depth)
    ml_training(tv_dt_model, x_df, labels, "Decision Tree")

# Random Forest: sweep max_depth from 1 to 20
for depth in range(1, 21):
    print("Random Forest with", depth, "max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=depth, random_state=3)
    ml_training(tv_rf_model, x_df, labels, "Random Forest")

# Multinomial Naive Bayes on features rescaled to [0, 10]
# (presumably because MultinomialNB rejects negative inputs -- confirm).
# The scaler lives inside a pipeline so that cross-validation refits it on each
# training split only; fitting it on the full frame up front lets held-out rows
# influence the scaling (preprocessing leakage).
# NOTE(review): held-out values may land slightly outside [0, 10] after this
# change -- confirm MultinomialNB handles that at predict time in your version.
from sklearn.pipeline import make_pipeline  # local: not part of the notebook's import cell
np.set_printoptions(precision=3)
mms_scale = MinMaxScaler(feature_range=(0, 10))
tv_mnb_model = make_pipeline(mms_scale, MultinomialNB())
ml_training(tv_mnb_model, x_df, labels, "Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.65  0.644 0.631 0.638 0.646 0.634 0.66  0.657 0.659 0.652]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 64.70068027210884
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.562 0.546 0.546 0.55  0.544 0.558 0.571 0.546 0.55  0.554]
Mean Accuracy of Random Shuffle Split for KNN Model : 55.2721088435374
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.573 0.563 0.555 0.563 0.569 0.567 0.576 0.561 0.565 0.576]
Mean Accuracy of Random Shuffle Split for KNN Model : 56.680272108843546
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.584 0.576 0.56  0.571 0.566 0.573 0.583 0.56  0.571 0.57 ]
Mean Accuracy of Random Shuffle Split for KNN Model : 57.129251700680264
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.58  0.572 0.555 0.566 0.558 0.574 0.586 0.554 0.57  0.573]
Mean Accuracy of Random Shuffle Split

Accuracies for Random Shuffle Split for Decision Tree : [0.471 0.467 0.443 0.467 0.44  0.483 0.469 0.437 0.445 0.475]
Mean Accuracy of Random Shuffle Split for Decision Tree : 45.972789115646265
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.462 0.471 0.442 0.453 0.435 0.476 0.459 0.443 0.458 0.473]
Mean Accuracy of Random Shuffle Split for Decision Tree : 45.72108843537415
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.454 0.459 0.443 0.473 0.437 0.484 0.459 0.455 0.456 0.463]
Mean Accuracy of Random Shuffle Split for Decision Tree : 45.823129251700685
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.454 0.461 0.441 0.476 0.43  0.485 0.467 0.446 0.452 0.464]
Mean Accuracy of Random Shuffle Split for Decision Tree : 45.75510204081633
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.377 0.38  0.336 0.368 0.354 0.395 0.

In [15]:
# GPT vectorized data
# Benchmarks each classifier family on the fine-tuned GPT base features.
# ml_training (defined earlier in this notebook) prints the per-split and mean
# accuracy over its random shuffle splits for every model passed to it.
x_df = pd.read_csv(os.path.join(
    pwd, "Datasets", "Nisha", "FineTunedTransformers",
    "gpt_base_finetuned_vectorized_Nisha_dataset.csv"))

# Logistic regression
# NOTE(review): the recorded output of this cell shows the solver stopping at
# the iteration limit; scaling the features or raising max_iter may be needed.
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model: one evaluation per neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for n_neighbors in neighbors_list:
    print("KNN with", n_neighbors, "Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    ml_training(tv_knn_model, x_df, labels, "KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model, x_df, labels, "Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model, x_df, labels, "Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one evaluation per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model, x_df, labels, "SVM")

# Decision Tree Classifier: sweep max_depth from 1 to 20
for depth in range(1, 21):
    print("Decision Tree with", depth, "max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=depth)
    ml_training(tv_dt_model, x_df, labels, "Decision Tree")

# Random Forest: sweep max_depth from 1 to 20
for depth in range(1, 21):
    print("Random Forest with", depth, "max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=depth, random_state=3)
    ml_training(tv_rf_model, x_df, labels, "Random Forest")

# Multinomial Naive Bayes on features rescaled to [0, 10]
# (presumably because MultinomialNB rejects negative inputs -- confirm).
# The scaler lives inside a pipeline so that cross-validation refits it on each
# training split only; fitting it on the full frame up front lets held-out rows
# influence the scaling (preprocessing leakage).
# NOTE(review): held-out values may land slightly outside [0, 10] after this
# change -- confirm MultinomialNB handles that at predict time in your version.
from sklearn.pipeline import make_pipeline  # local: not part of the notebook's import cell
np.set_printoptions(precision=3)
mms_scale = MinMaxScaler(feature_range=(0, 10))
tv_mnb_model = make_pipeline(mms_scale, MultinomialNB())
ml_training(tv_mnb_model, x_df, labels, "Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.712 0.719 0.731 0.73  0.711 0.735 0.722 0.733 0.726 0.739]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 72.57142857142857
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.439 0.426 0.427 0.46  0.431 0.431 0.435 0.417 0.431 0.441]
Mean Accuracy of Random Shuffle Split for KNN Model : 43.374149659863946
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.441 0.446 0.448 0.474 0.433 0.454 0.462 0.437 0.452 0.473]
Mean Accuracy of Random Shuffle Split for KNN Model : 45.20408163265306
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.439 0.455 0.452 0.478 0.437 0.454 0.449 0.431 0.451 0.469]
Mean Accuracy of Random Shuffle Split for KNN Model : 45.156462585034014
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.447 0.454 0.454 0.469 0.437 0.469 0.456 0.448 0.433 0.468]
Mean Accuracy of Random Shuffle Spli

Accuracies for Random Shuffle Split for Decision Tree : [0.472 0.443 0.461 0.477 0.437 0.473 0.479 0.467 0.445 0.471]
Mean Accuracy of Random Shuffle Split for Decision Tree : 46.23809523809525
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.469 0.448 0.451 0.483 0.44  0.482 0.476 0.466 0.452 0.468]
Mean Accuracy of Random Shuffle Split for Decision Tree : 46.34013605442176
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.469 0.448 0.463 0.48  0.444 0.467 0.476 0.457 0.454 0.465]
Mean Accuracy of Random Shuffle Split for Decision Tree : 46.224489795918366
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.472 0.448 0.467 0.49  0.44  0.468 0.474 0.462 0.454 0.47 ]
Mean Accuracy of Random Shuffle Split for Decision Tree : 46.45578231292517
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.388 0.373 0.329 0.384 0.342 0.401 0.3

In [16]:
# Hinglish GPT vectorized data
# Benchmarks each classifier family on the fine-tuned Hinglish GPT features.
# ml_training (defined earlier in this notebook) prints the per-split and mean
# accuracy over its random shuffle splits for every model passed to it.
x_df = pd.read_csv(os.path.join(
    pwd, "Datasets", "Nisha", "FineTunedTransformers",
    "gpt_hinglish_finetuned_vectorized_Nisha_dataset.csv"))

# Logistic regression
# NOTE(review): the recorded output of this cell shows the solver stopping at
# the iteration limit; scaling the features or raising max_iter may be needed.
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model: one evaluation per neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for n_neighbors in neighbors_list:
    print("KNN with", n_neighbors, "Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    ml_training(tv_knn_model, x_df, labels, "KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model, x_df, labels, "Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model, x_df, labels, "Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one evaluation per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model, x_df, labels, "SVM")

# Decision Tree Classifier: sweep max_depth from 1 to 20
for depth in range(1, 21):
    print("Decision Tree with", depth, "max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=depth)
    ml_training(tv_dt_model, x_df, labels, "Decision Tree")

# Random Forest: sweep max_depth from 1 to 20
for depth in range(1, 21):
    print("Random Forest with", depth, "max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=depth, random_state=3)
    ml_training(tv_rf_model, x_df, labels, "Random Forest")

# Multinomial Naive Bayes on features rescaled to [0, 10]
# (presumably because MultinomialNB rejects negative inputs -- confirm).
# The scaler lives inside a pipeline so that cross-validation refits it on each
# training split only; fitting it on the full frame up front lets held-out rows
# influence the scaling (preprocessing leakage).
# NOTE(review): held-out values may land slightly outside [0, 10] after this
# change -- confirm MultinomialNB handles that at predict time in your version.
from sklearn.pipeline import make_pipeline  # local: not part of the notebook's import cell
np.set_printoptions(precision=3)
mms_scale = MinMaxScaler(feature_range=(0, 10))
tv_mnb_model = make_pipeline(mms_scale, MultinomialNB())
ml_training(tv_mnb_model, x_df, labels, "Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.754 0.75  0.739 0.748 0.735 0.737 0.741 0.75  0.753 0.749]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 74.54421768707483
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.433 0.444 0.444 0.435 0.414 0.428 0.446 0.438 0.43  0.448]
Mean Accuracy of Random Shuffle Split for KNN Model : 43.605442176870746
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.429 0.452 0.456 0.434 0.429 0.442 0.455 0.447 0.435 0.437]
Mean Accuracy of Random Shuffle Split for KNN Model : 44.15646258503402
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.43  0.464 0.446 0.437 0.431 0.448 0.441 0.458 0.428 0.438]
Mean Accuracy of Random Shuffle Split for KNN Model : 44.204081632653065
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.448 0.456 0.437 0.438 0.429 0.458 0.436 0.456 0.431 0.441]
Mean Accuracy of Random Shuffle Spli

Accuracies for Random Shuffle Split for Decision Tree : [0.47  0.459 0.478 0.483 0.446 0.485 0.456 0.456 0.463 0.453]
Mean Accuracy of Random Shuffle Split for Decision Tree : 46.48979591836735
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.478 0.454 0.476 0.485 0.459 0.486 0.463 0.452 0.471 0.457]
Mean Accuracy of Random Shuffle Split for Decision Tree : 46.79591836734694
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.476 0.465 0.472 0.468 0.457 0.483 0.459 0.459 0.473 0.452]
Mean Accuracy of Random Shuffle Split for Decision Tree : 46.632653061224495
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.471 0.448 0.472 0.481 0.459 0.482 0.459 0.453 0.469 0.452]
Mean Accuracy of Random Shuffle Split for Decision Tree : 46.45578231292517
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.337 0.41  0.353 0.348 0.276 0.329 0.3

In [17]:
# XLM vectorized data
# Benchmarks each classifier family on the fine-tuned XLM base features.
# ml_training (defined earlier in this notebook) prints the per-split and mean
# accuracy over its random shuffle splits for every model passed to it.
x_df = pd.read_csv(os.path.join(
    pwd, "Datasets", "Nisha", "FineTunedTransformers",
    "xlm_base_finetuned_vectorized_Nisha_dataset.csv"))

# Logistic regression
# NOTE(review): the recorded output of this cell shows the solver stopping at
# the iteration limit; scaling the features or raising max_iter may be needed.
tv_lr_model = LogisticRegression(max_iter=1000)
ml_training(tv_lr_model, x_df, labels, "Logistic Regression")

# KNN Model: one evaluation per neighbourhood size
neighbors_list = [3, 4, 5, 6, 7, 8]
for n_neighbors in neighbors_list:
    print("KNN with", n_neighbors, "Neighbors")
    tv_knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    ml_training(tv_knn_model, x_df, labels, "KNN Model")

# Gaussian Naive Bayes
tv_gnb_model = GaussianNB()
ml_training(tv_gnb_model, x_df, labels, "Gaussian Naive Bayes")

# Bernoulli Naive Bayes
tv_bnb_model = BernoulliNB()
ml_training(tv_bnb_model, x_df, labels, "Bernoulli Naive Bayes")

# Support Vector Machine Classifier: one evaluation per kernel
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for a_kernel in svm_kernels:
    print("Working on SVM Kernal:", a_kernel)
    tv_svm_model = svm.SVC(kernel=a_kernel)
    ml_training(tv_svm_model, x_df, labels, "SVM")

# Decision Tree Classifier: sweep max_depth from 1 to 20
for depth in range(1, 21):
    print("Decision Tree with", depth, "max_depth")
    tv_dt_model = DecisionTreeClassifier(random_state=3, max_depth=depth)
    ml_training(tv_dt_model, x_df, labels, "Decision Tree")

# Random Forest: sweep max_depth from 1 to 20
for depth in range(1, 21):
    print("Random Forest with", depth, "max_depth")
    tv_rf_model = RandomForestClassifier(max_depth=depth, random_state=3)
    ml_training(tv_rf_model, x_df, labels, "Random Forest")

# Multinomial Naive Bayes on features rescaled to [0, 10]
# (presumably because MultinomialNB rejects negative inputs -- confirm).
# The scaler lives inside a pipeline so that cross-validation refits it on each
# training split only; fitting it on the full frame up front lets held-out rows
# influence the scaling (preprocessing leakage).
# NOTE(review): held-out values may land slightly outside [0, 10] after this
# change -- confirm MultinomialNB handles that at predict time in your version.
from sklearn.pipeline import make_pipeline  # local: not part of the notebook's import cell
np.set_printoptions(precision=3)
mms_scale = MinMaxScaler(feature_range=(0, 10))
tv_mnb_model = make_pipeline(mms_scale, MultinomialNB())
ml_training(tv_mnb_model, x_df, labels, "Multinomial Naive Bayes")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies for Random Shuffle Split for Logistic Regression : [0.494 0.508 0.5   0.499 0.489 0.491 0.502 0.5   0.495 0.499]
Mean Accuracy of Random Shuffle Split for Logistic Regression : 49.77551020408163
KNN with 3 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.413 0.406 0.409 0.398 0.386 0.405 0.408 0.428 0.395 0.42 ]
Mean Accuracy of Random Shuffle Split for KNN Model : 40.68707482993197
KNN with 4 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.419 0.418 0.413 0.421 0.391 0.418 0.422 0.439 0.414 0.423]
Mean Accuracy of Random Shuffle Split for KNN Model : 41.78231292517006
KNN with 5 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.434 0.422 0.418 0.434 0.393 0.433 0.425 0.434 0.425 0.422]
Mean Accuracy of Random Shuffle Split for KNN Model : 42.421768707483
KNN with 6 Neighbors
Accuracies for Random Shuffle Split for KNN Model : [0.443 0.437 0.426 0.431 0.41  0.427 0.429 0.438 0.423 0.429]
Mean Accuracy of Random Shuffle Split fo

Accuracies for Random Shuffle Split for Decision Tree : [0.342 0.334 0.351 0.328 0.329 0.361 0.352 0.349 0.339 0.344]
Mean Accuracy of Random Shuffle Split for Decision Tree : 34.292517006802726
Decision Tree with 18 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.335 0.339 0.36  0.337 0.329 0.36  0.342 0.335 0.356 0.364]
Mean Accuracy of Random Shuffle Split for Decision Tree : 34.5578231292517
Decision Tree with 19 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.342 0.337 0.365 0.348 0.318 0.363 0.351 0.345 0.345 0.355]
Mean Accuracy of Random Shuffle Split for Decision Tree : 34.6938775510204
Decision Tree with 20 max_depth
Accuracies for Random Shuffle Split for Decision Tree : [0.35  0.327 0.365 0.334 0.328 0.348 0.343 0.349 0.341 0.354]
Mean Accuracy of Random Shuffle Split for Decision Tree : 34.39455782312925
Random Forest with 1 max_depth
Accuracies for Random Shuffle Split for Random Forest : [0.238 0.271 0.252 0.267 0.244 0.235 0.258