## Train-test-split, parameter tuning, cross validation, final testing with SMOTE oversampling

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from imblearn.over_sampling import SMOTE

warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

np.random.seed(42)

### Train test split

In [3]:
def split(source_file):
    df = pd.read_csv(source_file)
    X = df.drop(['literature_review'], axis=1)
    y = df['literature_review']
    X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.25, random_state=42)
    return X_train, X_test, y_train, y_test

### Parameter tuning with cross validation, smote and grid search

In [4]:
def tune_params(X_train,y_train,pipeline,params):
    grid_search = GridSearchCV(estimator=pipeline, param_grid=params, scoring=['f1','recall','precision'],cv=5, refit='f1')
    grid_search.fit(X_train,y_train)
    return grid_search.best_params_, grid_search.score(X_train, y_train)

def cv(source_path):
    X_train, X_test, y_train, y_test = split(source_path)


    # Logistic Regression
    lr_pipeline = imbpipeline([('sampling', SMOTE()),('scaler', StandardScaler()),('lr', LogisticRegression(max_iter=200))])
    lr_params = {'lr__penalty':['l1','l2'], 
                'lr__C':[1, 10, 100, 1000],
                'lr__class_weight': [None, 'balanced']}
    lr_best_params, lr_score = tune_params(X_train, y_train, lr_pipeline, lr_params)
    print(f'Logistic Regression:\n best params: {lr_best_params}\n scores: {lr_score}')
    
    # Support Vector Machines
    svm_pipeline = imbpipeline([('sampling', SMOTE()),('scaler', StandardScaler()),('svm', SVC())])
    svm_params = {'svm__C': [0.1, 1, 10],  
                'svm__gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
                'svm__kernel': ['rbf'],
                'svm__class_weight': [None, 'balanced']}
    svm_best_params, svm_score = tune_params(X_train, y_train, svm_pipeline, svm_params)
    print(f'Support Vector Machines:\n best params: {svm_best_params}\n scores: {svm_score}')

    # Naive Bayes without parameter optimization
    pipeline = imbpipeline(steps = [('sampling', SMOTE()), ('nb', GaussianNB())])
    stratified_kfold = StratifiedKFold(n_splits=5)
    f1_nb = np.mean(cross_val_score(pipeline, X_train, y_train, scoring = 'f1', cv=stratified_kfold))
    print(f"Naive Bayes: \n f1: {f1_nb}")

    # Decision Trees
    dt_pipeline = imbpipeline([('sampling', SMOTE()),('dt', DecisionTreeClassifier())])
    dt_params = {'dt__criterion': ['gini', 'entropy'], 
                'dt__max_depth':range(1,10),
                'dt__class_weight': [None, 'balanced']}
    dt_best_params, dt_score = tune_params(X_train, y_train, dt_pipeline, dt_params)
    print(f'Decision Trees:\n best params: {dt_best_params}\n scores: {dt_score}')

    # Random Forest
    rf_pipeline = imbpipeline([('sampling', SMOTE()),('rf', RandomForestClassifier())])
    rf_params = {'rf__bootstrap': [True, False],
                 'rf__max_depth': [3, 6, 9, None],
                 'rf__max_features': ['auto', 'sqrt'],
                 'rf__n_estimators': [25, 50, 100, 150],
                 'rf__class_weight': [None, 'balanced']}
    rf_best_params, rf_score = tune_params(X_train, y_train, rf_pipeline, rf_params)
    print(f'Random Forest:\n best params: {rf_best_params}\n scores: {rf_score}')

    # k-nearest neighbor
    knn_pipeline = imbpipeline([('sampling', SMOTE()),('scaler', StandardScaler()),('knn', KNeighborsClassifier())])
    knn_params = {'knn__n_neighbors': range(1,10),  
                'knn__weights': ['uniform', 'distance']}
    knn_best_params, knn_score = tune_params(X_train, y_train, knn_pipeline, knn_params)
    print(f'K-nearest neighbor:\n best params: {knn_best_params}\n scores: {knn_score}')
    
    # Balanced Random Forest
    brf_pipeline = imbpipeline([('sampling', SMOTE()),('brf', BalancedRandomForestClassifier())])
    brf_params = {'brf__bootstrap': [True, False],
                 'brf__max_depth': [3, 6, 9, None],
                 'brf__max_features': ['auto', 'sqrt'],
                 'brf__n_estimators': [25, 50, 100, 150],
                 'brf__class_weight': [None, 'balanced']}
    brf_best_params, brf_score = tune_params(X_train, y_train, brf_pipeline, brf_params)
    print(f'Balanced Random Forest: \n best params: {brf_best_params}\n scores: {brf_score}')

    return y_test, X_test, y_train, X_train


In [5]:
print("- - - original dataset, keywords feature - - -")
y_test1, X_test1, y_train1, X_train1 = cv("../../data/processed/original_dataset/data_key.csv")

print("- - - original dataset, references feature - - -")
y_test2, X_test2, y_train2, X_train2 = cv("../../data/processed/original_dataset/data_ref.csv")

print("- - - original dataset, text mining feature - - -")
y_test3, X_test3, y_train3, X_train3 = cv("../../data/processed/original_dataset/data_tm.csv")

print("- - - extended dataset, keywords feature - - -")
y_test4, X_test4, y_train4, X_train4 = cv("../../data/processed/extended_dataset/data_key.csv")

print("- - - extended dataset, references feature - - -")
y_test5, X_test5, y_train5, X_train5 = cv("../../data/processed/extended_dataset/data_ref.csv")

print("- - - extended dataset, text mining feature - - -")
y_test6, X_test6, y_train6, X_train6 = cv("../../data/processed/extended_dataset/data_tm.csv")

- - - original dataset, keywords feature - - -


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/sk

Logistic Regression:
 best params: {'lr__C': 1, 'lr__class_weight': None, 'lr__penalty': 'l2'}
 scores: 0.3252032520325203
Support Vector Machines:
 best params: {'svm__C': 0.1, 'svm__class_weight': None, 'svm__gamma': 0.01, 'svm__kernel': 'rbf'}
 scores: 0.3409090909090909
Naive Bayes: 
 f1: 0.06245788623763514
Decision Trees:
 best params: {'dt__class_weight': None, 'dt__criterion': 'gini', 'dt__max_depth': 1}
 scores: 0.32941176470588235


320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages

Random Forest:
 best params: {'rf__bootstrap': True, 'rf__class_weight': 'balanced', 'rf__max_depth': 3, 'rf__max_features': 'sqrt', 'rf__n_estimators': 150}
 scores: 0.3252032520325203
K-nearest neighbor:
 best params: {'knn__n_neighbors': 9, 'knn__weights': 'distance'}
 scores: 0.3764705882352941


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Balanced Random Forest: 
 best params: {'brf__bootstrap': True, 'brf__class_weight': 'balanced', 'brf__max_depth': 6, 'brf__max_features': 'sqrt', 'brf__n_estimators': 100}
 scores: 0.3418803418803419
- - - original dataset, references feature - - -


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/sk

Logistic Regression:
 best params: {'lr__C': 1, 'lr__class_weight': 'balanced', 'lr__penalty': 'l2'}
 scores: 0.3252032520325203
Support Vector Machines:
 best params: {'svm__C': 0.1, 'svm__class_weight': None, 'svm__gamma': 0.01, 'svm__kernel': 'rbf'}
 scores: 0.3448275862068966
Naive Bayes: 
 f1: 0.0612789233565446
Decision Trees:
 best params: {'dt__class_weight': None, 'dt__criterion': 'gini', 'dt__max_depth': 1}
 scores: 0.32941176470588235


320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages

Random Forest:
 best params: {'rf__bootstrap': True, 'rf__class_weight': None, 'rf__max_depth': 6, 'rf__max_features': 'sqrt', 'rf__n_estimators': 150}
 scores: 0.358974358974359
K-nearest neighbor:
 best params: {'knn__n_neighbors': 9, 'knn__weights': 'distance'}
 scores: 0.4


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Balanced Random Forest: 
 best params: {'brf__bootstrap': True, 'brf__class_weight': None, 'brf__max_depth': 3, 'brf__max_features': 'sqrt', 'brf__n_estimators': 100}
 scores: 0.33613445378151263
- - - original dataset, text mining feature - - -


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/sk

Logistic Regression:
 best params: {'lr__C': 1, 'lr__class_weight': 'balanced', 'lr__penalty': 'l2'}
 scores: 0.42105263157894735
Support Vector Machines:
 best params: {'svm__C': 10, 'svm__class_weight': 'balanced', 'svm__gamma': 0.0001, 'svm__kernel': 'rbf'}
 scores: 0.26625386996904027
Naive Bayes: 
 f1: 0.10905654731741687
Decision Trees:
 best params: {'dt__class_weight': None, 'dt__criterion': 'gini', 'dt__max_depth': 3}
 scores: 0.07759303246239113


320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages

Random Forest:
 best params: {'rf__bootstrap': False, 'rf__class_weight': 'balanced', 'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
 scores: 0.5483870967741935
K-nearest neighbor:
 best params: {'knn__n_neighbors': 3, 'knn__weights': 'uniform'}
 scores: 0.2706270627062706


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Balanced Random Forest: 
 best params: {'brf__bootstrap': True, 'brf__class_weight': None, 'brf__max_depth': None, 'brf__max_features': 'sqrt', 'brf__n_estimators': 100}
 scores: 0.5531914893617021
- - - extended dataset, keywords feature - - -


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/sk

Logistic Regression:
 best params: {'lr__C': 1000, 'lr__class_weight': 'balanced', 'lr__penalty': 'l2'}
 scores: 0.31896551724137934
Support Vector Machines:
 best params: {'svm__C': 0.1, 'svm__class_weight': None, 'svm__gamma': 0.01, 'svm__kernel': 'rbf'}
 scores: 0.3
Naive Bayes: 
 f1: 0.05724534553293087
Decision Trees:
 best params: {'dt__class_weight': 'balanced', 'dt__criterion': 'entropy', 'dt__max_depth': 9}
 scores: 0.31718061674008813


320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages

Random Forest:
 best params: {'rf__bootstrap': True, 'rf__class_weight': 'balanced', 'rf__max_depth': 6, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
 scores: 0.31759656652360513
K-nearest neighbor:
 best params: {'knn__n_neighbors': 9, 'knn__weights': 'distance'}
 scores: 0.20754716981132076


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Balanced Random Forest: 
 best params: {'brf__bootstrap': False, 'brf__class_weight': None, 'brf__max_depth': 6, 'brf__max_features': 'sqrt', 'brf__n_estimators': 50}
 scores: 0.3217391304347826
- - - extended dataset, references feature - - -


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/sk

Logistic Regression:
 best params: {'lr__C': 1000, 'lr__class_weight': None, 'lr__penalty': 'l2'}
 scores: 0.3261802575107296
Support Vector Machines:
 best params: {'svm__C': 1, 'svm__class_weight': None, 'svm__gamma': 0.001, 'svm__kernel': 'rbf'}
 scores: 0.31868131868131866
Naive Bayes: 
 f1: 0.05479773350736222
Decision Trees:
 best params: {'dt__class_weight': 'balanced', 'dt__criterion': 'gini', 'dt__max_depth': 3}
 scores: 0.32044198895027626


320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages

Random Forest:
 best params: {'rf__bootstrap': False, 'rf__class_weight': 'balanced', 'rf__max_depth': 6, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
 scores: 0.32599118942731276
K-nearest neighbor:
 best params: {'knn__n_neighbors': 7, 'knn__weights': 'distance'}
 scores: 0.26785714285714285


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Balanced Random Forest: 
 best params: {'brf__bootstrap': True, 'brf__class_weight': 'balanced', 'brf__max_depth': 6, 'brf__max_features': 'sqrt', 'brf__n_estimators': 150}
 scores: 0.3318777292576419
- - - extended dataset, text mining feature - - -


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/sk

Logistic Regression:
 best params: {'lr__C': 1, 'lr__class_weight': None, 'lr__penalty': 'l2'}
 scores: 0.37900874635568516
Support Vector Machines:
 best params: {'svm__C': 10, 'svm__class_weight': None, 'svm__gamma': 0.0001, 'svm__kernel': 'rbf'}
 scores: 0.26552462526766596
Naive Bayes: 
 f1: 0.062450701535413856
Decision Trees:
 best params: {'dt__class_weight': 'balanced', 'dt__criterion': 'gini', 'dt__max_depth': 6}
 scores: 0.2616033755274262


320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages

Random Forest:
 best params: {'rf__bootstrap': False, 'rf__class_weight': None, 'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__n_estimators': 25}
 scores: 0.5391849529780565
K-nearest neighbor:
 best params: {'knn__n_neighbors': 5, 'knn__weights': 'uniform'}
 scores: 0.18072289156626506


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Balanced Random Forest: 
 best params: {'brf__bootstrap': False, 'brf__class_weight': 'balanced', 'brf__max_depth': None, 'brf__max_features': 'sqrt', 'brf__n_estimators': 25}
 scores: 0.5391849529780565


### Training and testing with optimal parameters

In [3]:
def train_test(estimator, y_test, X_test, y_train, X_train):
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print(f"     f1: {f1}\n     recall: {recall}\n      precision: {precision}")

print("original dataset, keywords feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=1, class_weight=None, penalty='l2'), y_test1, X_test1, y_train1, X_train1)
print(" SVM:")
train_test(SVC(C=0.1, class_weight=None, gamma=0.01, kernel='rbf'), y_test1, X_test1, y_train1, X_train1)
print(" NB:")
train_test(GaussianNB(), y_test1, X_test1, y_train1, X_train1)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1), y_test1, X_test1, y_train1, X_train1)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=True, class_weight=None, max_depth=3, max_features='sqrt', n_estimators=25), y_test1, X_test1, y_train1, X_train1)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=9, weights='distance'), y_test1, X_test1, y_train1, X_train1)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=3, max_features='sqrt', n_estimators=100), y_test1, X_test1, y_train1, X_train1)


print("original dataset, references feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=1, class_weight='balanced', penalty='l2'), y_test2, X_test2, y_train2, X_train2)
print(" SVM:")
train_test(SVC(C=0.1, class_weight=None, gamma=0.01, kernel='rbf'), y_test2, X_test2, y_train2, X_train2)
print(" NB:")
train_test(GaussianNB(), y_test2, X_test2, y_train2, X_train2)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1), y_test2, X_test2, y_train2, X_train2)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=True, class_weight=None, max_depth=9, max_features='sqrt', n_estimators=25), y_test2, X_test2, y_train2, X_train2)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=9, weights='distance'), y_test2, X_test2, y_train2, X_train2)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=True, class_weight=None, max_depth=None, max_features='sqrt', n_estimators=25), y_test2, X_test2, y_train2, X_train2)


print("original dataset, text mining feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=1, class_weight=None, penalty='l2'), y_test3, X_test3, y_train3, X_train3)
print(" SVM:")
train_test(SVC(C=10, class_weight=None, gamma=0.0001, kernel='rbf'), y_test3, X_test3, y_train3, X_train3)
print(" NB:")
train_test(GaussianNB(), y_test3, X_test3, y_train3, X_train3)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=5), y_test3, X_test3, y_train3, X_train3)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=True, class_weight='balanced', max_depth=None, max_features='sqrt', n_estimators=100), y_test3, X_test3, y_train3, X_train3)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=3, weights='distance'), y_test3, X_test3, y_train3, X_train3)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=False, class_weight=None, max_depth=None, max_features='sqrt', n_estimators=25), y_test3, X_test3, y_train3, X_train3)


print("extended dataset, keywords feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=1000, class_weight='balanced', penalty='l2'), y_test4, X_test4, y_train4, X_train4)
print(" SVM:")
train_test(SVC(C=0.1, class_weight=None, gamma=0.01, kernel='rbf'), y_test4, X_test4, y_train4, X_train4)
print(" NB:")
train_test(GaussianNB(), y_test4, X_test4, y_train4, X_train4)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight='balanced', criterion='entropy', max_depth=8), y_test4, X_test4, y_train4, X_train4)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=False, class_weight=None, max_depth=6, max_features='sqrt', n_estimators=50), y_test4, X_test4, y_train4, X_train4)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=9, weights='distance'), y_test4, X_test4, y_train4, X_train4)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=6, max_features='sqrt', n_estimators=50), y_test4, X_test4, y_train4, X_train4)


print("extended dataset, references feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=100, class_weight='balanced', penalty='l2'), y_test5, X_test5, y_train5, X_train5)
print(" SVM:")
train_test(SVC(C=1, class_weight='balanced', gamma=0.001, kernel='rbf'), y_test5, X_test5, y_train5, X_train5)
print(" NB:")
train_test(GaussianNB(), y_test5, X_test5, y_train5, X_train5)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3), y_test5, X_test5, y_train5, X_train5)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=False, class_weight=None, max_depth=6, max_features='sqrt', n_estimators=25), y_test5, X_test5, y_train5, X_train5)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=9, weights='distance'), y_test5, X_test5, y_train5, X_train5)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=6, max_features='sqrt', n_estimators=100), y_test5, X_test5, y_train5, X_train5)


print("extended dataset, text mining feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=1, class_weight=None, penalty='l2'), y_test6, X_test6, y_train6, X_train6)
print(" SVM:")
train_test(SVC(C=0.1, class_weight=None, gamma=0.001, kernel='rbf'), y_test6, X_test6, y_train6, X_train6)
print(" NB:")
train_test(GaussianNB(), y_test6, X_test6, y_train6, X_train6)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=4), y_test6, X_test6, y_train6, X_train6)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=False, class_weight=None, max_depth=None, max_features='sqrt', n_estimators=50), y_test6, X_test6, y_train6, X_train6)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=5, weights='distance'), y_test6, X_test6, y_train6, X_train6)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=None, max_features='sqrt', n_estimators=150), y_test6, X_test6, y_train6, X_train6)



original dataset, keywords feature
 LR:


NameError: name 'y_test1' is not defined