## Train-test-split, parameter tuning, cross validation, final testing with SMOTE oversampling

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from imblearn.over_sampling import SMOTE

warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

np.random.seed(42)

### Train test split

In [32]:
def split(source_file):
    df = pd.read_csv(source_file)
    X = df.drop(['literature_review'], axis=1)
    y = df['literature_review']
    X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.25, random_state=42)
    return X_train, X_test, y_train, y_test

### Parameter tuning with cross validation, smote and grid search

In [33]:
def tune_params(X_train,y_train,X_test,y_test,pipeline,params):
    grid_search = GridSearchCV(estimator=pipeline, param_grid=params, scoring=['f1','recall','precision'],cv=5, refit='f1')
    over = SMOTE()
    over_X, over_y = over.fit_resample(X_train, y_train)
    grid_search.fit(over_X, over_y)
    y_pred = grid_search.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print(f"     f1: {f1}\n     recall: {recall}\n      precision: {precision}")

def train_and_test(source_path):
    X_train, X_test, y_train, y_test = split(source_path)


    # Logistic Regression
    lr_pipeline = imbpipeline([('sampling', SMOTE()),('scaler', StandardScaler()),('lr', LogisticRegression(max_iter=200))])
    lr_params = {'lr__penalty':['l1','l2'], 
                'lr__C':[1, 10, 100, 1000],
                'lr__class_weight': [None, 'balanced']}
    print('Logistic Regression:')
    tune_params(X_train, y_train, X_test, y_test, lr_pipeline, lr_params)
    
    # Support Vector Machines
    svm_pipeline = imbpipeline([('sampling', SMOTE()),('scaler', StandardScaler()),('svm', SVC())])
    svm_params = {'svm__C': [0.1, 1, 10],  
                'svm__gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
                'svm__kernel': ['rbf'],
                'svm__class_weight': [None, 'balanced']}
    print('Support Vector Machines:')
    tune_params(X_train, y_train, X_test, y_test, svm_pipeline, svm_params)

    # Naive Bayes without parameter optimization
    nb = GaussianNB()
    oversample = SMOTE()
    over_X, over_y = oversample.fit_resample(X_train, y_train)
    nb.fit(over_X, over_y)
    y_pred = nb.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print('Naive Bayes:')
    print(f"     f1: {f1}\n     recall: {recall}\n      precision: {precision}")

    # Decision Trees
    dt_pipeline = imbpipeline([('sampling', SMOTE()),('dt', DecisionTreeClassifier())])
    dt_params = {'dt__criterion': ['gini', 'entropy'], 
                'dt__max_depth':range(1,10),
                'dt__class_weight': [None, 'balanced']}
    print('Decision Trees:')
    tune_params(X_train, y_train, X_test, y_test, dt_pipeline, dt_params)

    # Random Forest
    rf_pipeline = imbpipeline([('sampling', SMOTE()),('rf', RandomForestClassifier())])
    rf_params = {'rf__bootstrap': [True, False],
                 'rf__max_depth': [3, 6, 9, None],
                 'rf__max_features': ['auto', 'sqrt'],
                 'rf__n_estimators': [25, 50, 100, 150],
                 'rf__class_weight': [None, 'balanced']}
    print(f'Random Forest:')
    tune_params(X_train, y_train, X_test, y_test, rf_pipeline, rf_params)

    # k-nearest neighbor
    knn_pipeline = imbpipeline([('sampling', SMOTE()),('scaler', StandardScaler()),('knn', KNeighborsClassifier())])
    knn_params = {'knn__n_neighbors': range(1,10),  
                'knn__weights': ['uniform', 'distance']}
    print(f'K-nearest neighbor:')
    tune_params(X_train, y_train, X_test, y_test, knn_pipeline, knn_params)
    
    # Balanced Random Forest
    brf_pipeline = imbpipeline([('sampling', SMOTE()),('brf', BalancedRandomForestClassifier())])
    brf_params = {'brf__bootstrap': [True, False],
                 'brf__max_depth': [3, 6, 9, None],
                 'brf__max_features': ['auto', 'sqrt'],
                 'brf__n_estimators': [25, 50, 100, 150],
                 'brf__class_weight': [None, 'balanced']}
    print(f'Balanced Random Forest:')
    tune_params(X_train, y_train, X_test, y_test, brf_pipeline, brf_params)


In [34]:
print("- - - original dataset, keywords feature - - -")
train_and_test("../../data/processed/original_dataset/data_key.csv")

print("- - - original dataset, references feature - - -")
train_and_test("../../data/processed/original_dataset/data_ref.csv")

print("- - - original dataset, text mining feature - - -")
train_and_test("../../data/processed/original_dataset/data_tm.csv")

print("- - - extended dataset, keywords feature - - -")
train_and_test("../../data/processed/extended_dataset/data_key.csv")

print("- - - extended dataset, references feature - - -")
train_and_test("../../data/processed/extended_dataset/data_ref.csv")

print("- - - extended dataset, text mining feature - - -")
train_and_test("../../data/processed/extended_dataset/data_tm.csv")

- - - original dataset, keywords feature - - -
Logistic Regression:


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/sk

     f1: 0.3181818181818182
     recall: 0.4375
      precision: 0.25
Support Vector Machines:
     f1: 0.07407407407407407
     recall: 0.8125
      precision: 0.03880597014925373
Naive Bayes:
     f1: 0.05871559633027523
     recall: 1.0
      precision: 0.030245746691871456
Decision Trees:
     f1: 0.11049723756906077
     recall: 0.625
      precision: 0.06060606060606061
Random Forest:


320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages

     f1: 0.34146341463414637
     recall: 0.4375
      precision: 0.28
K-nearest neighbor:
     f1: 0.25
     recall: 0.1875
      precision: 0.375
Balanced Random Forest:


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


     f1: 0.3181818181818182
     recall: 0.4375
      precision: 0.25
- - - original dataset, references feature - - -
Logistic Regression:


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/sk

     f1: 0.3181818181818182
     recall: 0.4375
      precision: 0.25
Support Vector Machines:
     f1: 0.07449856733524356
     recall: 0.8125
      precision: 0.03903903903903904
Naive Bayes:
     f1: 0.05871559633027523
     recall: 1.0
      precision: 0.030245746691871456
Decision Trees:
     f1: 0.32558139534883723
     recall: 0.4375
      precision: 0.25925925925925924
Random Forest:


320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages

     f1: 0.3333333333333333
     recall: 0.4375
      precision: 0.2692307692307692
K-nearest neighbor:
     f1: 0.25
     recall: 0.1875
      precision: 0.375
Balanced Random Forest:


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


     f1: 0.3181818181818182
     recall: 0.4375
      precision: 0.25
- - - original dataset, text mining feature - - -
Logistic Regression:


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.

     f1: 0.13333333333333333
     recall: 0.3125
      precision: 0.0847457627118644
Support Vector Machines:


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


     f1: 0.125
     recall: 0.1875
      precision: 0.09375
Naive Bayes:
     f1: 0.08333333333333333
     recall: 0.3125
      precision: 0.04807692307692308
Decision Trees:


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret 

     f1: 0.1509433962264151
     recall: 0.5
      precision: 0.08888888888888889
Random Forest:


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret 

KeyboardInterrupt: 

### Training and testing with optimal parameters

In [None]:
'''
def train_test(estimator, y_test, X_test, y_train, X_train):
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print(f"     f1: {f1}\n     recall: {recall}\n      precision: {precision}")

print("original dataset, keywords feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=1, class_weight=None, penalty='l2'), y_test1, X_test1, y_train1, X_train1)
print(" SVM:")
train_test(SVC(C=0.1, class_weight=None, gamma=0.01, kernel='rbf'), y_test1, X_test1, y_train1, X_train1)
print(" NB:")
train_test(GaussianNB(), y_test1, X_test1, y_train1, X_train1)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1), y_test1, X_test1, y_train1, X_train1)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=True, class_weight=None, max_depth=3, max_features='sqrt', n_estimators=25), y_test1, X_test1, y_train1, X_train1)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=9, weights='distance'), y_test1, X_test1, y_train1, X_train1)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=3, max_features='sqrt', n_estimators=100), y_test1, X_test1, y_train1, X_train1)


print("original dataset, references feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=1, class_weight='balanced', penalty='l2'), y_test2, X_test2, y_train2, X_train2)
print(" SVM:")
train_test(SVC(C=0.1, class_weight=None, gamma=0.01, kernel='rbf'), y_test2, X_test2, y_train2, X_train2)
print(" NB:")
train_test(GaussianNB(), y_test2, X_test2, y_train2, X_train2)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1), y_test2, X_test2, y_train2, X_train2)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=True, class_weight=None, max_depth=9, max_features='sqrt', n_estimators=25), y_test2, X_test2, y_train2, X_train2)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=9, weights='distance'), y_test2, X_test2, y_train2, X_train2)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=True, class_weight=None, max_depth=None, max_features='sqrt', n_estimators=25), y_test2, X_test2, y_train2, X_train2)


print("original dataset, text mining feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=1, class_weight=None, penalty='l2'), y_test3, X_test3, y_train3, X_train3)
print(" SVM:")
train_test(SVC(C=10, class_weight=None, gamma=0.0001, kernel='rbf'), y_test3, X_test3, y_train3, X_train3)
print(" NB:")
train_test(GaussianNB(), y_test3, X_test3, y_train3, X_train3)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=5), y_test3, X_test3, y_train3, X_train3)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=True, class_weight='balanced', max_depth=None, max_features='sqrt', n_estimators=100), y_test3, X_test3, y_train3, X_train3)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=3, weights='distance'), y_test3, X_test3, y_train3, X_train3)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=False, class_weight=None, max_depth=None, max_features='sqrt', n_estimators=25), y_test3, X_test3, y_train3, X_train3)


print("extended dataset, keywords feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=1000, class_weight='balanced', penalty='l2'), y_test4, X_test4, y_train4, X_train4)
print(" SVM:")
train_test(SVC(C=0.1, class_weight=None, gamma=0.01, kernel='rbf'), y_test4, X_test4, y_train4, X_train4)
print(" NB:")
train_test(GaussianNB(), y_test4, X_test4, y_train4, X_train4)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight='balanced', criterion='entropy', max_depth=8), y_test4, X_test4, y_train4, X_train4)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=False, class_weight=None, max_depth=6, max_features='sqrt', n_estimators=50), y_test4, X_test4, y_train4, X_train4)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=9, weights='distance'), y_test4, X_test4, y_train4, X_train4)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=6, max_features='sqrt', n_estimators=50), y_test4, X_test4, y_train4, X_train4)


print("extended dataset, references feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=100, class_weight='balanced', penalty='l2'), y_test5, X_test5, y_train5, X_train5)
print(" SVM:")
train_test(SVC(C=1, class_weight='balanced', gamma=0.001, kernel='rbf'), y_test5, X_test5, y_train5, X_train5)
print(" NB:")
train_test(GaussianNB(), y_test5, X_test5, y_train5, X_train5)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3), y_test5, X_test5, y_train5, X_train5)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=False, class_weight=None, max_depth=6, max_features='sqrt', n_estimators=25), y_test5, X_test5, y_train5, X_train5)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=9, weights='distance'), y_test5, X_test5, y_train5, X_train5)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=6, max_features='sqrt', n_estimators=100), y_test5, X_test5, y_train5, X_train5)


print("extended dataset, text mining feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=1, class_weight=None, penalty='l2'), y_test6, X_test6, y_train6, X_train6)
print(" SVM:")
train_test(SVC(C=0.1, class_weight=None, gamma=0.001, kernel='rbf'), y_test6, X_test6, y_train6, X_train6)
print(" NB:")
train_test(GaussianNB(), y_test6, X_test6, y_train6, X_train6)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=4), y_test6, X_test6, y_train6, X_train6)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=False, class_weight=None, max_depth=None, max_features='sqrt', n_estimators=50), y_test6, X_test6, y_train6, X_train6)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=5, weights='distance'), y_test6, X_test6, y_train6, X_train6)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=None, max_features='sqrt', n_estimators=150), y_test6, X_test6, y_train6, X_train6)

'''

original dataset, keywords feature
 LR:


NameError: name 'y_test1' is not defined