In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE #-> oversampling technique

np.random.seed(42)

df = pd.read_csv("../data/processed/data.csv")
df.drop(['crossmark-restriction'], axis=1, inplace=True)

X = df.drop(['literature_review'], axis=1)
y = df['literature_review']

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.25, random_state=42)

# oversampling with SMOTE
oversample = SMOTE(k_neighbors=2)
over_X, over_y = oversample.fit_resample(X_train,y_train)


models = {}
models['Logistic Regression'] = LogisticRegression(class_weight='balanced')
models['Support Vector Machines'] = SVC(class_weight='balanced')
models['Naive Bayes'] = BernoulliNB()
models['Decision Trees'] = DecisionTreeClassifier(class_weight='balanced')
models['Random Forest'] = RandomForestClassifier(class_weight='balanced')
models['K-Nearest Neighbor'] = KNeighborsClassifier()
models['BRF'] = BalancedRandomForestClassifier()

accuracy = {}
precision = {}
recall = {}
conf_mat = {}
f1 = {}

for key in models.keys():
    accuracy[key] = np.mean(cross_val_score(models[key], X_train, y_train, scoring="accuracy"))
    precision[key] = np.mean(cross_val_score(models[key], X_train, y_train, scoring="precision"))
    recall[key] = np.mean(cross_val_score(models[key], X_train, y_train, scoring="recall"))
    f1[key] = np.mean(cross_val_score(models[key], X_train, y_train, scoring="f1"))
    y_pred = cross_val_predict(models[key], X_train, y_train)
    conf_mat[key] = confusion_matrix(y_train, y_pred)
    print(key)
    print(f"f1: {f1[key]}, precision: {precision[key]}, recall: {recall[key]}, accuracy: {accuracy[key]}")

models['SMOTE'] = RandomForestClassifier()
accuracy['SMOTE'] = np.mean(cross_val_score(models['SMOTE'], over_X, over_y, scoring="accuracy"))
precision['SMOTE'] = np.mean(cross_val_score(models['SMOTE'], over_X, over_y, scoring="precision"))
recall['SMOTE'] = np.mean(cross_val_score(models['SMOTE'], over_X, over_y, scoring="recall"))
f1['SMOTE'] = np.mean(cross_val_score(models['SMOTE'], over_X, over_y, scoring="f1"))
y_pred = cross_val_predict(models['SMOTE'], over_X, over_y)
conf_mat['SMOTE'] = confusion_matrix(over_y, y_pred)
print("SMOTE")
print(f"f1: {f1['SMOTE']}, precision: {precision['SMOTE']}, recall: {recall['SMOTE']}, accuracy: {accuracy['SMOTE']}")


ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values