# B.1 Classifier Training

## General Imports

In [None]:
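# Install statements for all utilized libraries (uncomment those that are needed)
#!pip3 install pandas # installs numpy with it
#!pip3 install numpy
# pickle ships with the Python standard library, so it needs no install
#!pip3 install scikit-learn # PyPI name of the package imported as `sklearn`
#!pip3 install nltk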

In [None]:
# Data Handling
import pandas as pd
import numpy as np

# ML
# Classifiers
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from modeltrainer import ModelTrainer

In [None]:
def get_clf_and_metrics(modeltrainer, param_grid, datasets):
    """
    Runs the trainer over every dataset and gathers what it produces.

    For each dataset, in the order given, the trainer loads it, is asked for
    its best model under `param_grid`, and is then asked for its metric dict.

    Params:
    - modeltrainer: (modeltrainer.ModelTrainer) must provide load_dataset,
      get_best_model and get_metric_dict
    - param_grid: (dict, or list of dicts) forwarded unchanged to
      modeltrainer.get_best_model
    - datasets: (list-like) dataset identifiers forwarded to
      modeltrainer.load_dataset

    Returns:
    - (tuple of two dicts) best models keyed by dataset, and metric dicts
      keyed by dataset
    """

    def _train_one(name):
        # Load first: the two queries below act on whatever is currently loaded.
        print('Currently training:', name)
        modeltrainer.load_dataset(name)
        best_model = modeltrainer.get_best_model(param_grid)
        return best_model, modeltrainer.get_metric_dict()

    outcome = {name: _train_one(name) for name in datasets}

    best_models = {name: pair[0] for name, pair in outcome.items()}
    metrics = {name: pair[1] for name, pair in outcome.items()}
    return best_models, metrics

In [None]:
# Dataset identifiers handed to get_clf_and_metrics for every classifier below.
datasets = [
    'yelp',
    'subjectivity_objectivity',
    'clickbait',
]

## SVM

In [None]:
# Create grid of parameters to search over for SVM
# C runs from 1e-7 to 1e3, one value per decade, and is shared by all kernels.
c_vals = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
svc_param_grid = [{'kernel': ['linear'],
                   'C': c_vals},
                  {'kernel': ['poly'],
                   'degree': [2, 3],
                   'C': c_vals},
                  # gamma is kept strictly positive: with gamma = 0 the RBF kernel
                  # exp(-gamma * ||x - x'||^2) is 1 for every pair of samples, so
                  # each of those fits would be spent on a kernel that cannot
                  # separate anything.
                  # NOTE(review): some scikit-learn releases appear to reject
                  # gamma = 0 outright — confirm against the installed version.
                  {'kernel': ['rbf'],
                   'gamma': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2],
                   'C': c_vals}]

In [None]:
# Create an SVC with its default settings and hand it to a ModelTrainer.
# No search happens in this cell; the parameter grid is applied when the
# trainer is passed to get_clf_and_metrics.
svc = SVC()
svc_modeltrainer = ModelTrainer(svc)

In [None]:
# Train the SVM on every dataset; both results are dicts keyed by dataset.
svc_clf_dict, svc_metric_dict = get_clf_and_metrics(
    modeltrainer=svc_modeltrainer,
    param_grid=svc_param_grid,
    datasets=datasets,
)

## Logistic Regression

In [None]:
# Creates LogisticRegression parameter grid
# Two sub-grids: L2-penalised models with C from 1e-8 to 1e4 (one value per
# decade), and a single entry with the penalty set to 'none'.
# NOTE(review): whether the string 'none' is accepted depends on the installed
# scikit-learn version (newer releases expect None) — confirm before running.
logreg_param_grid = [
    {
        'penalty': ['l2'],
        'C': [
            1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2,
            1e-1, 1e0, 1e1, 1e2, 1e3, 1e4,
        ],
    },
    {'penalty': ['none']},
]

In [None]:
# Create a LogisticRegression with its default settings and hand it to a
# ModelTrainer; the parameter grid is applied when the trainer is passed to
# get_clf_and_metrics.
logreg = LogisticRegression()
logreg_modeltrainer = ModelTrainer(logreg)

In [None]:
# Train logistic regression on every dataset; both results are dicts keyed by dataset.
logreg_clf_dict, logreg_metric_dict = get_clf_and_metrics(
    modeltrainer=logreg_modeltrainer,
    param_grid=logreg_param_grid,
    datasets=datasets,
)

## Random Forest

In [None]:
# Creates Random Forest parameter grid
# Forest sizes double from 128 up to 16384 trees; max_features has the single
# candidate value 1 (an integer, not a fraction).
rf_param_grid = {
    'n_estimators': [128, 256, 512, 1024, 2048, 4096, 8192, 16384],
    'max_features': [1],
}

In [None]:
# Create classifier, and modeltrainer
# A random forest draws bootstrap samples and candidate features at random, so
# the seed is fixed here to make repeated runs of the notebook build the same
# trees.
# NOTE(review): any randomness inside ModelTrainer itself (e.g. how it splits
# the data) is not controlled by this seed — confirm separately.
rf = RandomForestClassifier(random_state=0)
rf_modeltrainer = ModelTrainer(rf)

In [None]:
# Train the random forest on every dataset; both results are dicts keyed by dataset.
rf_clf_dict, rf_metric_dict = get_clf_and_metrics(
    modeltrainer=rf_modeltrainer,
    param_grid=rf_param_grid,
    datasets=datasets,
)