In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE # -> model for highly imbalanced classes, test in the future
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

df = pd.read_csv("../data/processed/data_ref.csv")

X = df.drop(['literature_review'], axis=1)
y = df['literature_review']

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.25, random_state=0)

# oversampling with SMOTE
oversample = SMOTE(k_neighbors=2)
over_X, over_y = oversample.fit_resample(X_train,y_train)

models = {}
models['Logistic Regression'] = LogisticRegression()
models['Support Vector Machines'] = SVC()
models['Naive Bayes'] = BernoulliNB()
models['Decision Trees'] = DecisionTreeClassifier()
models['Random Forest'] = RandomForestClassifier()
models['K-Nearest Neighbor'] = KNeighborsClassifier()

accuracy = {}
precision = {}
recall = {}
conf_mat = {}
f1 = {}

for key in models.keys():
    accuracy[key] = np.mean(cross_val_score(models[key], X_train, y_train, scoring="accuracy"))
    precision[key] = np.mean(cross_val_score(models[key], X_train, y_train, scoring="precision"))
    recall[key] = np.mean(cross_val_score(models[key], X_train, y_train, scoring="recall"))
    f1[key] = np.mean(cross_val_score(models[key], X_train, y_train, scoring="f1"))
    y_pred = cross_val_predict(models[key], X_train, y_train)
    conf_mat[key] = confusion_matrix(y_train, y_pred)


models['SMOTE'] = RandomForestClassifier()
accuracy['SMOTE'] = np.mean(cross_val_score(models['SMOTE'], over_X, over_y, scoring="accuracy"))
precision['SMOTE'] = np.mean(cross_val_score(models['SMOTE'], over_X, over_y, scoring="precision"))
recall['SMOTE'] = np.mean(cross_val_score(models['SMOTE'], over_X, over_y, scoring="recall"))
f1['SMOTE'] = np.mean(cross_val_score(models['SMOTE'], over_X, over_y, scoring="f1"))
y_pred = cross_val_predict(models['SMOTE'], over_X, over_y)
conf_mat['SMOTE'] = confusion_matrix(over_y, y_pred)
print("SMOTE")
print(f"f1: {f1['SMOTE']}, precision: {precision['SMOTE']}, recall: {recall['SMOTE']}, accuracy: {accuracy['SMOTE']}")

models["SMOTE"].fit(over_X, over_y)
y_pred = models["SMOTE"].predict(X_test)
print(f1_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

SMOTE
f1: 0.5849990510515478, precision: 0.827724241728899, recall: 0.4404558404558404, accuracy: 0.6712250712250712
0.14035087719298245
0.3333333333333333
0.08888888888888889
