## Train-test-split, parameter tuning, cross validation, final testing

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

np.random.seed(42)

### Train-test-split

In [2]:
def split(source_file):
    df = pd.read_csv(source_file)
    X = df.drop(['literature_review'], axis=1)
    y = df['literature_review']
    X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.25, random_state=42)
    return X_train, X_test, y_train, y_test

### Parameter tuning with cross-validation and grid search

In [3]:
def tune_params(X_train,y_train,pipeline,params):
    grid_search = GridSearchCV(estimator=pipeline, param_grid=params, scoring=['f1','recall','precision'],cv=5, refit='f1')
    grid_search.fit(X_train,y_train)
    return grid_search.best_params_, grid_search.score(X_train, y_train)

def cv(source_path):
    X_train, X_test, y_train, y_test = split(source_path)


    # Logistic Regression
    lr_pipeline = Pipeline([('scaler', StandardScaler()),('lr', LogisticRegression(max_iter=200))])
    lr_params = {'lr__penalty':['l1','l2'], 
                'lr__C':[1, 10, 100, 1000],
                'lr__class_weight': [None, 'balanced']}
    lr_best_params, lr_score = tune_params(X_train, y_train, lr_pipeline, lr_params)
    print(f'Logistic Regression:\n best params: {lr_best_params}\n scores: {lr_score}')

    # Support Vector Machines
    svm_pipeline = Pipeline([('scaler', StandardScaler()),('svm', SVC())])
    svm_params = {'svm__C': [0.1, 1, 10],  
                'svm__gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
                'svm__kernel': ['rbf'],
                'svm__class_weight': [None, 'balanced']}
    svm_best_params, svm_score = tune_params(X_train, y_train, svm_pipeline, svm_params)
    print(f'Support Vector Machines:\n best params: {svm_best_params}\n scores: {svm_score}')

    # Naive Bayes
    f1_nb = np.mean(cross_val_score(GaussianNB(), X_train, y_train, scoring="f1"))
    print(f"Naive Bayes: \n f1: {f1_nb} -> no parameter optimization!")

    # Decision Trees
    dt_pipeline = Pipeline([('dt', DecisionTreeClassifier())])
    dt_params = {'dt__criterion': ['gini', 'entropy'], 
                'dt__max_depth':range(1,10),
                'dt__class_weight': [None, 'balanced']}
    dt_best_params, dt_score = tune_params(X_train, y_train, dt_pipeline, dt_params)
    print(f'Decision Trees:\n best params: {dt_best_params}\n scores: {dt_score}')

    # Random Forest
    rf_pipeline = Pipeline([('rf', RandomForestClassifier())])
    rf_params = {'rf__bootstrap': [True, False],
                 'rf__max_depth': [3, 6, 9, None],
                 'rf__max_features': ['auto', 'sqrt'],
                 'rf__n_estimators': [25, 50, 100, 150],
                 'rf__class_weight': [None, 'balanced']}
    rf_best_params, rf_score = tune_params(X_train, y_train, rf_pipeline, rf_params)
    print(f'Random Forest:\n best params: {rf_best_params}\n scores: {rf_score}')

    # k-nearest neighbor
    knn_pipeline = Pipeline([('scaler', StandardScaler()),('knn', KNeighborsClassifier())])
    knn_params = {'knn__n_neighbors': range(1,10),  
                'knn__weights': ['uniform', 'distance']}
    knn_best_params, knn_score = tune_params(X_train, y_train, knn_pipeline, knn_params)
    print(f'K-nearest neighbor:\n best params: {knn_best_params}\n scores: {knn_score}')
    
    # Balanced Random Forest
    brf_pipeline = imbpipeline([('brf', BalancedRandomForestClassifier())])
    brf_params = {'brf__bootstrap': [True, False],
                 'brf__max_depth': [3, 6, 9, None],
                 'brf__max_features': ['auto', 'sqrt'],
                 'brf__n_estimators': [25, 50, 100, 150],
                 'brf__class_weight': [None, 'balanced']}
    brf_best_params, brf_score = tune_params(X_train, y_train, brf_pipeline, brf_params)
    print(f'Balanced Random Forest: \n best params: {brf_best_params}\n scores: {brf_score}')

    return y_test, X_test, y_train, X_train




In [4]:
print("- - - original dataset, keywords feature - - -")
y_test1, X_test1, y_train1, X_train1 = cv("../../data/processed/original_dataset/data_key.csv")

print("- - - original dataset, references feature - - -")
y_test2, X_test2, y_train2, X_train2 = cv("../../data/processed/original_dataset/data_ref.csv")

print("- - - original dataset, text mining feature - - -")
y_test3, X_test3, y_train3, X_train3 = cv("../../data/processed/original_dataset/data_tm.csv")

print("- - - extended dataset, keywords feature - - -")
y_test4, X_test4, y_train4, X_train4 = cv("../../data/processed/extended_dataset/data_key.csv")

print("- - - extended dataset, references feature - - -")
y_test5, X_test5, y_train5, X_train5 = cv("../../data/processed/extended_dataset/data_ref.csv")

print("- - - extended dataset, text mining feature - - -")
y_test6, X_test6, y_train6, X_train6 = cv("../../data/processed/extended_dataset/data_tm.csv")


- - - original dataset, keywords feature - - -


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/skle

Logistic Regression:
 best params: {'lr__C': 100, 'lr__class_weight': 'balanced', 'lr__penalty': 'l2'}
 scores: 0.32786885245901637
Support Vector Machines:
 best params: {'svm__C': 0.1, 'svm__class_weight': 'balanced', 'svm__gamma': 0.001, 'svm__kernel': 'rbf'}
 scores: 0.35555555555555557
Naive Bayes: 
 f1: 0.06365446743421634 -> no parameter optimization!
Decision Trees:
 best params: {'dt__class_weight': 'balanced', 'dt__criterion': 'gini', 'dt__max_depth': 2}
 scores: 0.3595505617977528


320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/s

Random Forest:
 best params: {'rf__bootstrap': True, 'rf__class_weight': 'balanced', 'rf__max_depth': 3, 'rf__max_features': 'sqrt', 'rf__n_estimators': 25}
 scores: 0.3305785123966942
K-nearest neighbor:
 best params: {'knn__n_neighbors': 9, 'knn__weights': 'distance'}
 scores: 0.3116883116883117


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Balanced Random Forest: 
 best params: {'brf__bootstrap': True, 'brf__class_weight': None, 'brf__max_depth': 3, 'brf__max_features': 'sqrt', 'brf__n_estimators': 150}
 scores: 0.3076923076923077
- - - original dataset, references feature - - -


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/skle

Logistic Regression:
 best params: {'lr__C': 100, 'lr__class_weight': 'balanced', 'lr__penalty': 'l2'}
 scores: 0.34146341463414637
Support Vector Machines:
 best params: {'svm__C': 0.1, 'svm__class_weight': 'balanced', 'svm__gamma': 0.001, 'svm__kernel': 'rbf'}
 scores: 0.37362637362637363
Naive Bayes: 
 f1: 0.0624755045531258 -> no parameter optimization!
Decision Trees:
 best params: {'dt__class_weight': 'balanced', 'dt__criterion': 'gini', 'dt__max_depth': 4}
 scores: 0.3953488372093023


320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/s

Random Forest:
 best params: {'rf__bootstrap': True, 'rf__class_weight': 'balanced', 'rf__max_depth': 6, 'rf__max_features': 'sqrt', 'rf__n_estimators': 25}
 scores: 0.358974358974359
K-nearest neighbor:
 best params: {'knn__n_neighbors': 9, 'knn__weights': 'distance'}
 scores: 0.33766233766233766


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Balanced Random Forest: 
 best params: {'brf__bootstrap': True, 'brf__class_weight': None, 'brf__max_depth': 3, 'brf__max_features': 'sqrt', 'brf__n_estimators': 50}
 scores: 0.3230769230769231
- - - original dataset, text mining feature - - -


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/skle

Logistic Regression:
 best params: {'lr__C': 1, 'lr__class_weight': None, 'lr__penalty': 'l2'}
 scores: 0.8958333333333334
Support Vector Machines:
 best params: {'svm__C': 0.1, 'svm__class_weight': 'balanced', 'svm__gamma': 0.001, 'svm__kernel': 'rbf'}
 scores: 0.7311827956989247
Naive Bayes: 
 f1: 0.10905654731741687 -> no parameter optimization!
Decision Trees:
 best params: {'dt__class_weight': None, 'dt__criterion': 'gini', 'dt__max_depth': 1}
 scores: 0.37333333333333335


320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/s

Random Forest:
 best params: {'rf__bootstrap': True, 'rf__class_weight': 'balanced', 'rf__max_depth': 3, 'rf__max_features': 'sqrt', 'rf__n_estimators': 25}
 scores: 0.31527093596059114
K-nearest neighbor:
 best params: {'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
 scores: 0.9523809523809523


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Balanced Random Forest: 
 best params: {'brf__bootstrap': False, 'brf__class_weight': None, 'brf__max_depth': 3, 'brf__max_features': 'sqrt', 'brf__n_estimators': 50}
 scores: 0.19626168224299065
- - - extended dataset, keywords feature - - -


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/skle

Logistic Regression:
 best params: {'lr__C': 1000, 'lr__class_weight': 'balanced', 'lr__penalty': 'l2'}
 scores: 0.29770992366412213
Support Vector Machines:
 best params: {'svm__C': 1, 'svm__class_weight': 'balanced', 'svm__gamma': 0.001, 'svm__kernel': 'rbf'}
 scores: 0.31521739130434784
Naive Bayes: 
 f1: 0.05789688329401904 -> no parameter optimization!
Decision Trees:
 best params: {'dt__class_weight': 'balanced', 'dt__criterion': 'gini', 'dt__max_depth': 2}
 scores: 0.31521739130434784


320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/s

Random Forest:
 best params: {'rf__bootstrap': True, 'rf__class_weight': 'balanced', 'rf__max_depth': 6, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
 scores: 0.3261802575107296
K-nearest neighbor:
 best params: {'knn__n_neighbors': 6, 'knn__weights': 'distance'}
 scores: 0.18181818181818182


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Balanced Random Forest: 
 best params: {'brf__bootstrap': False, 'brf__class_weight': None, 'brf__max_depth': 3, 'brf__max_features': 'sqrt', 'brf__n_estimators': 25}
 scores: 0.30039525691699603
- - - extended dataset, references feature - - -


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/skle

Logistic Regression:
 best params: {'lr__C': 1000, 'lr__class_weight': 'balanced', 'lr__penalty': 'l2'}
 scores: 0.3048327137546468
Support Vector Machines:
 best params: {'svm__C': 1, 'svm__class_weight': 'balanced', 'svm__gamma': 0.0001, 'svm__kernel': 'rbf'}
 scores: 0.31521739130434784
Naive Bayes: 
 f1: 0.05789688329401904 -> no parameter optimization!
Decision Trees:
 best params: {'dt__class_weight': 'balanced', 'dt__criterion': 'gini', 'dt__max_depth': 2}
 scores: 0.31868131868131866


320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/s

Random Forest:
 best params: {'rf__bootstrap': False, 'rf__class_weight': 'balanced', 'rf__max_depth': 3, 'rf__max_features': 'sqrt', 'rf__n_estimators': 25}
 scores: 0.3035019455252918
K-nearest neighbor:
 best params: {'knn__n_neighbors': 6, 'knn__weights': 'distance'}
 scores: 0.2376237623762376


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Balanced Random Forest: 
 best params: {'brf__bootstrap': False, 'brf__class_weight': None, 'brf__max_depth': 3, 'brf__max_features': 'sqrt', 'brf__n_estimators': 25}
 scores: 0.29213483146067415
- - - extended dataset, text mining feature - - -


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/skle

Logistic Regression:
 best params: {'lr__C': 1, 'lr__class_weight': None, 'lr__penalty': 'l2'}
 scores: 0.8129032258064516
Support Vector Machines:
 best params: {'svm__C': 1, 'svm__class_weight': 'balanced', 'svm__gamma': 0.0001, 'svm__kernel': 'rbf'}
 scores: 0.7085714285714285
Naive Bayes: 
 f1: 0.062348842610025525 -> no parameter optimization!
Decision Trees:
 best params: {'dt__class_weight': None, 'dt__criterion': 'entropy', 'dt__max_depth': 6}
 scores: 0.5846153846153846


320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/luca/.local/lib/python3.11/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/luca/.local/lib/python3.11/site-packages/s

Random Forest:
 best params: {'rf__bootstrap': True, 'rf__class_weight': 'balanced', 'rf__max_depth': 3, 'rf__max_features': 'sqrt', 'rf__n_estimators': 25}
 scores: 0.3164179104477612
K-nearest neighbor:
 best params: {'knn__n_neighbors': 2, 'knn__weights': 'distance'}
 scores: 0.9529411764705882


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Balanced Random Forest: 
 best params: {'brf__bootstrap': True, 'brf__class_weight': None, 'brf__max_depth': 9, 'brf__max_features': 'sqrt', 'brf__n_estimators': 150}
 scores: 0.2573913043478261


### Training and testing with optimal parameters

In [7]:
def train_test(estimator, y_test, X_test, y_train, X_train):
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print(f"     f1: {f1}\n     recall: {recall}\n      precision: {precision}")

print("original dataset, keywords feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=100, class_weight='balanced', penalty='l2'), y_test1, X_test1, y_train1, X_train1)
print(" SVM:")
train_test(SVC(C=0.1, class_weight='balanced', gamma=0.001, kernel='rbf'), y_test1, X_test1, y_train1, X_train1)
print(" NB:")
train_test(GaussianNB(), y_test1, X_test1, y_train1, X_train1)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=2), y_test1, X_test1, y_train1, X_train1)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=True, class_weight='balanced', max_depth=3, max_features='sqrt', n_estimators=150), y_test1, X_test1, y_train1, X_train1)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=9, weights='distance'), y_test1, X_test1, y_train1, X_train1)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=True, class_weight=None, max_depth=3, max_features='sqrt', n_estimators=100), y_test1, X_test1, y_train1, X_train1)



print("original dataset, references feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=100, class_weight='balanced', penalty='l2'), y_test2, X_test2, y_train2, X_train2)
print(" SVM:")
train_test(SVC(C=0.1, class_weight='balanced', gamma=0.001, kernel='rbf'), y_test2, X_test2, y_train2, X_train2)
print(" NB:")
train_test(GaussianNB(), y_test2, X_test2, y_train2, X_train2)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=4), y_test2, X_test2, y_train2, X_train2)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=True, class_weight='balanced', max_depth=3, max_features='sqrt', n_estimators=150), y_test2, X_test2, y_train2, X_train2)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=9, weights='distance'), y_test2, X_test2, y_train2, X_train2)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=True, class_weight=None, max_depth=3, max_features='sqrt', n_estimators=150), y_test2, X_test2, y_train2, X_train2)



print("original dataset, text mining feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=1, class_weight=None, penalty='l2'), y_test3, X_test3, y_train3, X_train3)
print(" SVM:")
train_test(SVC(C=0.1, class_weight='balanced', gamma=0.001, kernel='rbf'), y_test3, X_test3, y_train3, X_train3)
print(" NB:")
train_test(GaussianNB(), y_test3, X_test3, y_train3, X_train3)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1), y_test3, X_test3, y_train3, X_train3)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=True, class_weight='balanced', max_depth=3, max_features='sqrt', n_estimators=25), y_test3, X_test3, y_train3, X_train3)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=1, weights='uniform'), y_test3, X_test3, y_train3, X_train3)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=False, class_weight=None, max_depth=3, max_features='sqrt', n_estimators=150), y_test3, X_test3, y_train3, X_train3)



print("extended dataset, keywords feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=1000, class_weight='balanced', penalty='l2'), y_test4, X_test4, y_train4, X_train4)
print(" SVM:")
train_test(SVC(C=1, class_weight='balanced', gamma=0.001, kernel='rbf'), y_test4, X_test4, y_train4, X_train4)
print(" NB:")
train_test(GaussianNB(), y_test4, X_test4, y_train4, X_train4)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=12), y_test4, X_test4, y_train4, X_train4)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=True, class_weight='balanced', max_depth=6, max_features='sqrt', n_estimators=25), y_test4, X_test4, y_train4, X_train4)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=6, weights='distance'), y_test4, X_test4, y_train4, X_train4)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=False, class_weight=None, max_depth=3, max_features='sqrt', n_estimators=25), y_test4, X_test4, y_train4, X_train4)

print("extended dataset, references feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=1, class_weight='balanced', penalty='l2'), y_test5, X_test5, y_train5, X_train5)
print(" SVM:")
train_test(SVC(C=1, class_weight='balanced', gamma=0.0001, kernel='rbf'), y_test5, X_test5, y_train5, X_train5)
print(" NB:")
train_test(GaussianNB(), y_test5, X_test5, y_train5, X_train5)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=2), y_test5, X_test5, y_train5, X_train5)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=3, max_features='sqrt', n_estimators=25), y_test5, X_test5, y_train5, X_train5)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=6, weights='distance'), y_test5, X_test5, y_train5, X_train5)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=True, class_weight=None, max_depth=3, max_features='sqrt', n_estimators=150), y_test5, X_test5, y_train5, X_train5)



print("extended dataset, text mining feature")
print(" LR:")
train_test(LogisticRegression(max_iter=200, C=1, class_weight=None, penalty='l2'), y_test6, X_test6, y_train6, X_train6)
print(" SVM:")
train_test(SVC(C=1, class_weight='balanced', gamma=0.0001, kernel='rbf'), y_test6, X_test6, y_train6, X_train6)
print(" NB:")
train_test(GaussianNB(), y_test6, X_test6, y_train6, X_train6)
print(" DT:")
train_test(DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=6), y_test6, X_test6, y_train6, X_train6)
print(" RF:")
train_test(RandomForestClassifier(bootstrap=True, class_weight='balanced', max_depth=3, max_features='sqrt', n_estimators=25), y_test6, X_test6, y_train6, X_train6)
print(" kNN:")
train_test(KNeighborsClassifier(n_neighbors=2, weights='distance'), y_test6, X_test6, y_train6, X_train6)
print(" BRF:")
train_test(BalancedRandomForestClassifier(bootstrap=False, class_weight=None, max_depth=3, max_features='sqrt', n_estimators=100), y_test6, X_test6, y_train6, X_train6)


original dataset, keywords feature
 LR:
     f1: 0.3181818181818182
     recall: 0.4375
      precision: 0.25
 SVM:
     f1: 0.05714285714285714
     recall: 1.0
      precision: 0.029411764705882353
 NB:
     f1: 0.0
     recall: 0.0
      precision: 0.0
 DT:
     f1: 0.4117647058823529
     recall: 0.4375
      precision: 0.3888888888888889
 RF:
     f1: 0.3111111111111111
     recall: 0.4375
      precision: 0.2413793103448276
 kNN:
     f1: 0.2
     recall: 0.125
      precision: 0.5
 BRF:


  warn(
  warn(


     f1: 0.30434782608695654
     recall: 0.4375
      precision: 0.23333333333333334
original dataset, references feature
 LR:
     f1: 0.3404255319148936
     recall: 0.5
      precision: 0.25806451612903225
 SVM:
     f1: 0.05714285714285714
     recall: 1.0
      precision: 0.029411764705882353
 NB:
     f1: 0.0
     recall: 0.0
      precision: 0.0
 DT:
     f1: 0.43243243243243246
     recall: 0.5
      precision: 0.38095238095238093
 RF:
     f1: 0.30434782608695654
     recall: 0.4375
      precision: 0.23333333333333334
 kNN:
     f1: 0.18181818181818182
     recall: 0.125
      precision: 0.3333333333333333
 BRF:


  warn(
  warn(


     f1: 0.32
     recall: 0.5
      precision: 0.23529411764705882
original dataset, text mining feature
 LR:
     f1: 0.36363636363636365
     recall: 0.25
      precision: 0.6666666666666666
 SVM:
     f1: 0.0
     recall: 0.0
      precision: 0.0
 NB:
     f1: 0.125
     recall: 0.25
      precision: 0.08333333333333333
 DT:
     f1: 0.4
     recall: 0.375
      precision: 0.42857142857142855
 RF:
     f1: 0.3076923076923077
     recall: 0.375
      precision: 0.2608695652173913
 kNN:
     f1: 0.13333333333333333
     recall: 0.125
      precision: 0.14285714285714285
 BRF:


  warn(
  warn(


     f1: 0.1836734693877551
     recall: 0.5625
      precision: 0.10975609756097561
extended dataset, keywords feature
 LR:
     f1: 0.29473684210526313
     recall: 0.45161290322580644
      precision: 0.21875
 SVM:
     f1: 0.06791171477079797
     recall: 0.6451612903225806
      precision: 0.035842293906810034
 NB:
     f1: 0.0
     recall: 0.0
      precision: 0.0
 DT:
     f1: 0.2978723404255319
     recall: 0.45161290322580644
      precision: 0.2222222222222222
 RF:
     f1: 0.3146067415730337
     recall: 0.45161290322580644
      precision: 0.2413793103448276
 kNN:
     f1: 0.0
     recall: 0.0
      precision: 0.0
 BRF:
     f1: 0.28
     recall: 0.45161290322580644
      precision: 0.2028985507246377
extended dataset, references feature
 LR:
     f1: 0.29411764705882354
     recall: 0.4838709677419355
      precision: 0.2112676056338028
 SVM:


  warn(
  warn(


     f1: 0.0
     recall: 0.0
      precision: 0.0
 NB:
     f1: 0.0
     recall: 0.0
      precision: 0.0
 DT:
     f1: 0.3235294117647059
     recall: 0.3548387096774194
      precision: 0.2972972972972973
 RF:
     f1: 0.3125
     recall: 0.4838709677419355
      precision: 0.23076923076923078
 kNN:
     f1: 0.0
     recall: 0.0
      precision: 0.0
 BRF:


  warn(
  warn(


     f1: 0.30927835051546393
     recall: 0.4838709677419355
      precision: 0.22727272727272727
extended dataset, text mining feature
 LR:
     f1: 0.3333333333333333
     recall: 0.22580645161290322
      precision: 0.6363636363636364
 SVM:
     f1: 0.19047619047619047
     recall: 0.25806451612903225
      precision: 0.1509433962264151
 NB:
     f1: 0.16161616161616163
     recall: 0.25806451612903225
      precision: 0.11764705882352941
 DT:
     f1: 0.2978723404255319
     recall: 0.22580645161290322
      precision: 0.4375
 RF:
     f1: 0.2689075630252101
     recall: 0.5161290322580645
      precision: 0.18181818181818182
 kNN:
     f1: 0.23809523809523808
     recall: 0.16129032258064516
      precision: 0.45454545454545453
 BRF:


  warn(
  warn(


     f1: 0.20754716981132076
     recall: 0.7096774193548387
      precision: 0.12154696132596685
