## Train-test-split, parameter tuning, cross validation, final testing

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.exceptions import UndefinedMetricWarning, FitFailedWarning
import matplotlib.pyplot as plt

# Silence the two scikit-learn warning categories imported above so they do
# not flood the notebook output. NOTE(review): ignoring FitFailedWarning also
# hides grid points whose fit raised an error -- confirm that is intended.
for ignored_category in (UndefinedMetricWarning, FitFailedWarning):
    warnings.filterwarnings(action='ignore', category=ignored_category)


# Seed NumPy's global random generator (presumably so that estimators created
# without an explicit random_state behave the same from run to run).
np.random.seed(42)

### Train-test-split

In [None]:
def split(source_file, target='literature_review', test_size=0.25,
          random_state=42, stratify=False):
    """Load a CSV file and split it into training and testing sets.

    Parameters
    ----------
    source_file : str or path-like
        CSV file readable by ``pandas.read_csv``.
    target : str, default 'literature_review'
        Name of the label column; every other column is used as a feature.
    test_size : float or int, default 0.25
        Passed straight to ``train_test_split``.
    random_state : int or None, default 42
        Seed for the shuffling performed by ``train_test_split``.
    stratify : bool, default False
        When True, the split preserves the class proportions of ``target``.
        The default keeps the original, non-stratified behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the file.
    """
    df = pd.read_csv(source_file)
    if target not in df.columns:
        # Fail with a message that names the file instead of pandas' generic
        # "not found in axis" error.
        raise KeyError(f"column {target!r} not found in {source_file!r}")

    X = df.drop(columns=[target])
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    return X_train, X_test, y_train, y_test

### Parameter tuning and testing
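- for each algorithm, a grid of hyperparameter values is searched with sklearn's GridSearchCV (Naive Bayes has no grid and is fitted directly)
- each parameter combination is evaluated with 5-fold cross-validation on the training data, recording F1, recall and precision
- the combination with the best cross-validated F1 score is used to retrain the model on the full training data
- the retrained model is then evaluated on the held-out testing data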

In [None]:
def tune_params(X_train,y_train,X_test,y_test,pipeline,params):
    """Grid-search a pipeline, then score the refitted model on the test set.

    The search runs 5-fold cross-validation over ``params`` while tracking
    F1, recall and precision; ``refit='f1'`` makes the search object predict
    with the parameter combination that achieved the best F1 score.

    Parameters
    ----------
    X_train, y_train : training features and labels used for the search.
    X_test, y_test : held-out features and labels used for the final scores.
    pipeline : estimator (or pipeline) handed to ``GridSearchCV``.
    params : parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(f1, recall, precision)`` measured on the test set. The same three
        values are also printed.
    """
    search = GridSearchCV(
        estimator=pipeline,
        param_grid=params,
        scoring=['f1', 'recall', 'precision'],
        cv=5,
        refit='f1',
    )
    search.fit(X_train, y_train)
    predictions = search.predict(X_test)

    # Same metric order as the returned tuple: f1, recall, precision.
    scores = tuple(
        metric(y_test, predictions)
        for metric in (f1_score, recall_score, precision_score)
    )
    f1, recall, precision = scores
    print(f"     f1: {f1}\n     recall: {recall}\n      precision: {precision}")
    return scores

def train_and_test(source_path):
    """Tune, train and test every classifier on one dataset.

    The CSV at ``source_path`` is split with ``split``. Logistic regression,
    SVM, decision tree, random forest, k-nearest-neighbour and balanced
    random forest are tuned and scored through ``tune_params``; Gaussian
    naive Bayes has no grid and is fitted and scored directly. The scores of
    each model are printed as they are produced.

    Parameters
    ----------
    source_path : str or path-like
        CSV file passed on to ``split``.

    Returns
    -------
    tuple of dict
        ``(f1_scores, recall_scores, precision_scores)``. Each dict maps the
        keys 'LR', 'SVM', 'NB', 'DT', 'RF', 'kNN', 'BRF' (in that insertion
        order) to the corresponding test-set score.
    """
    X_train, X_test, y_train, y_test = split(source_path)
    f1_scores = {}
    recall_scores = {}
    precision_scores = {}

    def record(key, scores):
        # Store one (f1, recall, precision) triple under the model's key.
        f1_scores[key], recall_scores[key], precision_scores[key] = scores

    def tune(pipeline, params):
        # Every tuned model is searched and scored on the same split.
        return tune_params(X_train, y_train, X_test, y_test, pipeline, params)

    # Logistic Regression
    lr_pipeline = Pipeline([('scaler', StandardScaler()),('lr', LogisticRegression(max_iter=200))])
    lr_shared = {'lr__C': [1, 10, 100, 1000],
                 'lr__class_weight': [None, 'balanced']}
    # The default 'lbfgs' solver does not support the l1 penalty, so every l1
    # candidate used to fail (hidden by the FitFailedWarning filter at the top
    # of the file). l1 gets a solver that supports it; l2 keeps the default.
    lr_params = [
        {'lr__penalty': ['l1'], 'lr__solver': ['liblinear'], **lr_shared},
        {'lr__penalty': ['l2'], **lr_shared},
    ]
    print('Logistic Regression:')
    record('LR', tune(lr_pipeline, lr_params))

    # Support Vector Machines
    svm_pipeline = Pipeline([('scaler', StandardScaler()),('svm', SVC())])
    svm_params = {'svm__C': [0.1, 1, 10],
                  'svm__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                  'svm__kernel': ['rbf'],
                  'svm__class_weight': [None, 'balanced']}
    print('Support Vector Machines:')
    record('SVM', tune(svm_pipeline, svm_params))

    # Naive Bayes (no hyperparameter grid: fitted and scored directly)
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print('Naive Bayes:')
    print(f"     f1: {f1}\n     recall: {recall}\n      precision: {precision}")
    record('NB', (f1, recall, precision))

    # Decision Trees
    dt_pipeline = Pipeline([('dt', DecisionTreeClassifier())])
    dt_params = {'dt__criterion': ['gini', 'entropy'],
                 'dt__max_depth': range(1,10),
                 'dt__class_weight': [None, 'balanced']}
    print('Decision Trees:')
    record('DT', tune(dt_pipeline, dt_params))

    # Random Forest
    rf_pipeline = Pipeline([('rf', RandomForestClassifier())])
    # 'auto' was dropped from the grid: for classifiers it was an alias of
    # 'sqrt', and scikit-learn 1.3 removed it, so it was either a duplicate
    # or a silently failing candidate.
    rf_params = {'rf__bootstrap': [True, False],
                 'rf__max_depth': [3, 6, 9, None],
                 'rf__max_features': ['sqrt'],
                 'rf__n_estimators': [25, 50, 100, 150],
                 'rf__class_weight': [None, 'balanced']}
    print('Random Forest:')
    record('RF', tune(rf_pipeline, rf_params))

    # k-nearest neighbor
    knn_pipeline = Pipeline([('scaler', StandardScaler()),('knn', KNeighborsClassifier())])
    knn_params = {'knn__n_neighbors': range(1,10),
                  'knn__weights': ['uniform', 'distance']}
    print('K-nearest neighbor:')
    record('kNN', tune(knn_pipeline, knn_params))

    # Balanced Random Forest
    brf_pipeline = imbpipeline([('brf', BalancedRandomForestClassifier())])
    # Same 'auto' -> 'sqrt' clean-up as for the random forest grid.
    brf_params = {'brf__bootstrap': [True, False],
                  'brf__max_depth': [3, 6, 9, None],
                  'brf__max_features': ['sqrt'],
                  'brf__n_estimators': [25, 50, 100, 150],
                  'brf__class_weight': [None, 'balanced']}
    print('Balanced Random Forest:')
    record('BRF', tune(brf_pipeline, brf_params))

    return f1_scores, recall_scores, precision_scores


### Graph for visualizing the results

In [None]:
def make_graph(f1, recall, precision):
    """Draw a grouped bar chart of F1, precision and recall per algorithm.

    The 'NB' and 'kNN' entries are left out of the chart. The dictionaries
    passed in are not modified, so the function can be called repeatedly on
    the same results.

    Parameters
    ----------
    f1, recall, precision : dict
        Map an algorithm name to its score. The algorithms and their order
        are taken from ``f1``; ``recall`` and ``precision`` must contain the
        same names.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    excluded = ('NB', 'kNN')
    # Filter into a new list instead of deleting from the caller's dicts.
    names = [name for name in f1 if name not in excluded]

    # Each algorithm occupies a 5-unit slot holding three 0.9-wide bars at
    # offsets 0, 1 and 2; deriving the slots from the number of algorithms
    # keeps the chart valid for any number of entries.
    group_starts = [5 * index for index in range(len(names))]

    plt.figure(figsize=(8,6))
    plt.bar(x=group_starts,
            height=[f1[name] for name in names],
            width=0.9, color="tomato", label="f1")
    plt.bar(x=[start + 1 for start in group_starts],
            height=[precision[name] for name in names],
            width=0.9, color="dodgerblue", label="precision")
    plt.bar(x=[start + 2 for start in group_starts],
            height=[recall[name] for name in names],
            width=0.9, color="lime", label="recall")

    plt.xlabel("algorithm")
    plt.ylabel("metric score")

    plt.xticks(ticks=[start + 1.5 for start in group_starts], labels=names, rotation=45)
    plt.legend(loc='best')
    plt.show()


### All feature combinations are tested on both datasets

In [None]:
# One entry per experiment: the banner printed before the run and the CSV
# file that is trained on, tested and plotted.
experiments = [
    ("- - - original dataset, keywords feature - - -",
     "../data/processed/original_dataset/data_key.csv"),
    ("- - - original dataset, references feature - - -",
     "../data/processed/original_dataset/data_ref.csv"),
    ("- - - original dataset, text mining feature - - -",
     "../data/processed/original_dataset/data_tm.csv"),
    ("- - - extended dataset, keywords feature - - -",
     "../data/processed/extended_dataset/data_key.csv"),
    ("- - - extended dataset, references feature - - -",
     "../data/processed/extended_dataset/data_ref.csv"),
    ("- - - extended dataset, text mining feature - - -",
     "../data/processed/extended_dataset/data_tm.csv"),
]

for banner, csv_path in experiments:
    print(banner)
    f1_scores, recall_scores, precision_scores = train_and_test(csv_path)
    make_graph(f1_scores, recall_scores, precision_scores)

