In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE #-> oversampling technique

df = pd.read_csv("../data/processed/data.csv")

X = df.drop(['literature_review'], axis=1)
y = df['literature_review']

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.25, random_state=42)

# oversampling with SMOTE
oversample = SMOTE(k_neighbors=2)
over_X, over_y = oversample.fit_resample(X_train,y_train)


models = {}
models['Logistic Regression'] = LogisticRegression(class_weight='balanced')
models['Support Vector Machines'] = SVC(class_weight='balanced')
models['Naive Bayes'] = BernoulliNB()
models['Decision Trees'] = DecisionTreeClassifier(class_weight='balanced')
models['Random Forest'] = RandomForestClassifier(class_weight='balanced')
models['K-Nearest Neighbor'] = KNeighborsClassifier()
models['BRF'] = BalancedRandomForestClassifier()

accuracy = {}
precision = {}
recall = {}
conf_mat = {}
f1 = {}

for key in models.keys():
    accuracy[key] = np.mean(cross_val_score(models[key], X_train, y_train, scoring="accuracy"))
    precision[key] = np.mean(cross_val_score(models[key], X_train, y_train, scoring="precision"))
    recall[key] = np.mean(cross_val_score(models[key], X_train, y_train, scoring="recall"))
    f1[key] = np.mean(cross_val_score(models[key], X_train, y_train, scoring="f1"))
    y_pred = cross_val_predict(models[key], X_train, y_train)
    conf_mat[key] = confusion_matrix(y_train, y_pred)
    print(key)
    print(f"f1: {f1[key]}, precision: {precision[key]}, recall: {recall[key]}, accuracy: {accuracy[key]}")

models['SMOTE'] = RandomForestClassifier()
accuracy['SMOTE'] = np.mean(cross_val_score(models['SMOTE'], over_X, over_y, scoring="accuracy"))
precision['SMOTE'] = np.mean(cross_val_score(models['SMOTE'], over_X, over_y, scoring="precision"))
recall['SMOTE'] = np.mean(cross_val_score(models['SMOTE'], over_X, over_y, scoring="recall"))
f1['SMOTE'] = np.mean(cross_val_score(models['SMOTE'], over_X, over_y, scoring="f1"))
y_pred = cross_val_predict(models['SMOTE'], over_X, over_y)
conf_mat['SMOTE'] = confusion_matrix(over_y, y_pred)
print("SMOTE")
print(f"f1: {f1['SMOTE']}, precision: {precision['SMOTE']}, recall: {recall['SMOTE']}, accuracy: {accuracy['SMOTE']}")


Logistic Regression
f1: 0.5848151848151849, precision: 0.9333333333333333, recall: 0.42777777777777776, accuracy: 0.8991704374057315
Support Vector Machines
f1: 0.5675213675213675, precision: 0.792063492063492, recall: 0.47222222222222215, accuracy: 0.8877073906485672


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Naive Bayes
f1: 0.0, precision: 0.0, recall: 0.0, accuracy: 0.8294871794871794
Decision Trees
f1: 0.4580116959064327, precision: 0.5, recall: 0.4722222222222222, accuracy: 0.8101809954751131


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest
f1: 0.15272727272727274, precision: 0.6, recall: 0.13611111111111113, accuracy: 0.8450226244343891


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


K-Nearest Neighbor
f1: 0.11272727272727275, precision: 0.4, recall: 0.06666666666666667, accuracy: 0.8410256410256409


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


BRF
f1: 0.4590659340659341, precision: 0.6849999999999999, recall: 0.49722222222222223, accuracy: 0.8566365007541478
SMOTE
f1: 0.7093624540886205, precision: 0.5546713347535265, recall: 0.9860465116279069, accuracy: 0.600437756497948
