In [215]:
import pandas as pd 
from sklearn import metrics
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier

In [216]:
# Shared vectorizer instance. calc_similarity() calls fit_transform on it for
# every pair, so it is refit from scratch on each call.
TFIDF = TfidfVectorizer()

def calc_similarity(row, verbose=False):
    """Return the TF-IDF cosine similarity between the first two entries of ``row``.

    Parameters
    ----------
    row : sequence or pandas.Series
        Only ``row[0:2]`` is used. Presumably these are the two text strings
        to compare -- confirm against the caller.
    verbose : bool, default False
        When True, print the score before returning it. The print used to be
        unconditional, which floods the notebook output when the function is
        applied to many rows.

    Returns
    -------
    The off-diagonal entry ``[0][1]`` of the 2x2 pairwise cosine-similarity
    matrix, i.e. the similarity between the first and the second entry.
    """
    pair = row[0:2]
    # The vectorizer is fitted on this pair alone, so the vocabulary and IDF
    # weights come only from the two entries being compared.
    tfidf_matrix = TFIDF.fit_transform(pair)
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    score = cosine_sim[0][1]
    if verbose:
        print(score)
    return score

In [217]:
# Features: the 'sim' column of similarities.csv, copied into a NumPy array and
# reshaped to a single-column 2-D array (n_samples, 1).
x = np.reshape(np.array(pd.read_csv('similarities.csv')['sim']), (-1, 1))

# Targets: the 'label' column of labels.csv, kept as a pandas Series.
y = pd.read_csv('labels.csv')['label']

In [218]:
# Hold out 30% of the samples for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified; with the class imbalance visible in the
# reports below, consider stratify=y -- left unchanged here so recorded results still apply.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=1)

# Balance the classes by oversampling with SMOTE. Only the training split is
# resampled; test_x / test_y are left untouched for evaluation.
sm = SMOTE(random_state=1)
# Series.ravel() is deprecated in pandas; to_numpy() gives the same 1-D array.
train_x_r, train_y_r = sm.fit_resample(train_x, train_y.to_numpy())

In [223]:
# Logistic regression trained on the SMOTE-resampled training split.
# The explicit multi_class='multinomial' argument was dropped: it is deprecated in
# recent scikit-learn releases, and with the lbfgs solver and more than two classes
# the default already fits a multinomial model.
clf = LogisticRegression(solver='lbfgs')
clf.fit(train_x_r, train_y_r)
# Evaluate on the held-out test split, which was not resampled.
clf_y_pred = clf.predict(test_x)
# print results
print("Accuracy_LogisticRegression:")
# accuracy_score returns a fraction; scale it to a percentage.
print(metrics.accuracy_score(test_y, clf_y_pred)*100)
print("\nConfusion Matrix:")
print(metrics.confusion_matrix(test_y, clf_y_pred))
print("\nClassification Report:")
print(metrics.classification_report(test_y, clf_y_pred))

Accuracy_LogisticRegression:
65.52324750107236

Confusion Matrix:
[[38613  9003  5020]
 [  783   573   538]
 [ 5433  5747 11223]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.73      0.79     52636
           1       0.04      0.30      0.07      1894
           2       0.67      0.50      0.57     22403

    accuracy                           0.66     76933
   macro avg       0.52      0.51      0.48     76933
weighted avg       0.78      0.66      0.71     76933



In [224]:
# Linear SVM fitted on the SMOTE-resampled training split.
clf = LinearSVC().fit(train_x_r, train_y_r)
clf_y_pred = clf.predict(test_x)

# Report accuracy (as a percentage), the confusion matrix and the per-class
# metrics on the held-out test split. Each heading and its value are printed
# on separate lines.
print("Accuracy_LinearSVC:", metrics.accuracy_score(test_y, clf_y_pred) * 100, sep="\n")
print("\nConfusion Matrix:", metrics.confusion_matrix(test_y, clf_y_pred), sep="\n")
print("\nClassification Report:", metrics.classification_report(test_y, clf_y_pred), sep="\n")

Accuracy_LinearSVC:
73.63550102036838

Confusion Matrix:
[[42608  1616  8412]
 [ 1008   108   778]
 [ 7441  1028 13934]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.81      0.82     52636
           1       0.04      0.06      0.05      1894
           2       0.60      0.62      0.61     22403

    accuracy                           0.74     76933
   macro avg       0.49      0.50      0.49     76933
weighted avg       0.75      0.74      0.74     76933



In [225]:
# Logistic-loss linear model trained with SGD on the SMOTE-resampled training split.
# random_state is fixed (same seed as the split and SMOTE above) because SGD shuffles
# the training data; without a seed the reported numbers change from run to run.
sgd = SGDClassifier(loss="log_loss", penalty="l2", random_state=1)
sgd.fit(train_x_r, train_y_r)
# Evaluate on the held-out test split, which was not resampled.
sgd_pred = sgd.predict(test_x)
# print results
print("Accuracy_SGDC:")
# accuracy_score returns a fraction; scale it to a percentage.
print(metrics.accuracy_score(test_y, sgd_pred)*100)
print("\nConfusion Matrix:")
print(metrics.confusion_matrix(test_y, sgd_pred))
print("\nClassification Report:")
print(metrics.classification_report(test_y, sgd_pred))

Accuracy_SGDC:
71.49727685128619

Confusion Matrix:
[[40863  3361  8412]
 [  908   208   778]
 [ 6477  1992 13934]]

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.78      0.81     52636
           1       0.04      0.11      0.06      1894
           2       0.60      0.62      0.61     22403

    accuracy                           0.71     76933
   macro avg       0.50      0.50      0.49     76933
weighted avg       0.76      0.71      0.73     76933



In [227]:
# Baseline: LinearSVC fitted on the original training split, without SMOTE resampling.
clf = LinearSVC()
clf.fit(train_x, train_y)
clf_y_pred = clf.predict(test_x)
# print results
print("Accuracy_LinearSVC_withoutSMOTE:")
# accuracy_score returns a fraction; scale it to a percentage.
print(metrics.accuracy_score(test_y, clf_y_pred)*100)
print("\nConfusion Matrix:")
print(metrics.confusion_matrix(test_y, clf_y_pred))
print("\nClassification Report:")
# A class that is never predicted has undefined precision. zero_division=0 reports
# it as 0.0 (the same value the default shows) without the undefined-metric warnings.
print(metrics.classification_report(test_y, clf_y_pred, zero_division=0))

Accuracy_LinearSVC_withoutSMOTE:
76.27806013024319

Confusion Matrix:
[[49262     0  3374]
 [ 1473     0   421]
 [12982     0  9421]]

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.94      0.85     52636
           1       0.00      0.00      0.00      1894
           2       0.71      0.42      0.53     22403

    accuracy                           0.76     76933
   macro avg       0.50      0.45      0.46     76933
weighted avg       0.74      0.76      0.73     76933



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
