# B.1 Classifier Training

## General Imports

In [None]:
# Install statements for all utilized libraries (uncomment whichever are needed)
#!pip3 install pandas # installs numpy with it 
#!pip3 install numpy
# pickle ships with the Python standard library, so it needs no install
#!pip3 install scikit-learn # provides the `sklearn` package
#!pip3 install nltk

In [None]:
# Data Handling
import pandas as pd
import numpy as np

# Misc
import pickle # saving/loading metrics
import os # creating necessary directory structure

# ML
# Classifiers
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Helper functions
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Text Processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# nltk.download('stopwords')
# nltk.download('punkt')

## Helper Functions

In [None]:
def save_dict(dictionary, filename, verbose=False):
    '''
    Saves a dictionary object as a pickle file for reloading and easy viewing
    
    Params:
    - dictionary (dict): data to be saved
    - filename (str): filename for dictionary to be stored in
    - verbose=False (bool): specifies if exact filename should be used. If False, the .pickle 
        extension is appended to filename unless filename already ends with it
    Return:
    - filename (str): filename the dictionary was written to (including any appended extension)
    '''
    # Add .pickle filetype if necessary and requested. Only the END of the name is checked, so a
    # '.pickle' substring elsewhere in the path (e.g. a directory name) does not suppress the extension
    if (not verbose) and (not filename.endswith('.pickle')):
        filename += '.pickle'
        
    # Save file (the with-statement closes the handle, so no explicit close() is required)
    with open(filename, "wb") as outfile:  
        pickle.dump(dictionary, outfile)
    
    return filename
        
def load_dict(filename, verbose=False):
    '''
    Loads dictionary of metrics from given filename
    
    Params:
    - filename (str): file to load
    - verbose=False (bool): specifies if exact filename should be used. If False, the
        .pickle extension is appended to filename unless filename already ends with it
    Return
    - dictionary (dict): data found in file
    - None (None): returned (after printing the error) when the file does not exist
    '''
    # Add .pickle filetype if necessary and requested. Only the END of the name is checked, so a
    # '.pickle' substring elsewhere in the path (e.g. a directory name) does not suppress the extension
    if (not verbose) and (not filename.endswith('.pickle')):
        filename += '.pickle'
    
    # Load file if exists
    # CAUTION: unpickling can execute arbitrary code -- only load files produced by a trusted source
    try:
        with open(filename, 'rb') as pickle_file: 
            dictionary = pickle.load(pickle_file) 
    except FileNotFoundError as e:
        # A missing file is reported rather than raised so callers can test for None
        print(e)
        return None
    
    return dictionary

## Data Loading

In [None]:
# Registry of prepared data sets, keyed by name: {name: dataset}
dataset_dict = dict()

In [None]:
# Create vectorizer that turns text samples into token count vectors
# NOTE: analyzer must be 'word' -- scikit-learn only applies `tokenizer` and a `stop_words`
# list when analyzer == 'word'; with a character analyzer both settings are not used
vectorizer = CountVectorizer(analyzer='word', 
                             tokenizer=word_tokenize, 
                             stop_words=stopwords.words('english'))

# NOTE: Additional testing was done with a unique vectorizer for each data set, where the maximum
# number of features considered was ~10% of the overall features (based on other studies' implementations),
# but performance appeared to be poorer. Since there was ample time for training, we decided to utilize
# all features

### Yelp Polarity

In [None]:
# Load yelp data sets
yelp_test_df = pd.read_csv('../data/yelp_review_polarity_csv/test.csv', names=['label', 'data']) 
yelp_train_df = pd.read_csv('../data/yelp_review_polarity_csv/train.csv', names=['label', 'data']) 

# Since yelp data set is already split into test and train, recombine
yelp_df = pd.concat([yelp_test_df, yelp_train_df])

# Data set is too large to work with in memory since I don't have 2TiB of RAM just lying around, 
# so we're cutting the data down into a more workable size
yelp_df = yelp_df.sample(n=32000,replace=False,axis='index')

# Change 1, 2 label to 0, 1 for uniformity with other data sets
# Data set has 1 for negative and 2 for positive, so we switch 0 to negative and 1 to positive
yelp_df['label'] = yelp_df['label'].apply(lambda label: 0 if label == 1 else 1)

# Transform text into a dense (n_samples, n_features) count matrix. The matrix is kept as its
# own array because a 2-D matrix cannot be stored in a single DataFrame column
yelp_features = vectorizer.fit_transform(yelp_df['data']).toarray()

# Build the final array (column 0 = label, remaining columns = features) & add info to dict
yelp_data = np.column_stack((yelp_df['label'].values, yelp_features))
dataset_dict['yelp'] = yelp_data

### Subjectivity/Objectivity

In [None]:
# Load data sets (one sample per line)
# NOTE(review): recent pandas releases reject sep='\n' in read_csv -- confirm against the installed version
subjectivity_df = pd.read_csv('../data/subjectobject/subjectivity.txt', sep='\n', encoding='latin-1', names=['data'])
objectivity_df = pd.read_csv('../data/subjectobject/objectivity.txt', sep='\n', encoding='latin-1', names=['data'])

# Add labels (subjective is 0, objective is 1)
subjectivity_df['label'] = 0
objectivity_df['label'] = 1

# Combine data sets and rearrange columns for uniformity
sub_ob_df = pd.concat([subjectivity_df, objectivity_df])
sub_ob_df = sub_ob_df.reindex(columns=['label', 'data'])

# Transform text into a dense (n_samples, n_features) count matrix. The matrix is kept as its
# own array because a 2-D matrix cannot be stored in a single DataFrame column
sub_ob_features = vectorizer.fit_transform(sub_ob_df['data']).toarray()

# Build the final array (column 0 = label, remaining columns = features), and add to dict
sub_ob_data = np.column_stack((sub_ob_df['label'].values, sub_ob_features))
dataset_dict['sub_ob'] = sub_ob_data

### Clickbait

In [None]:
# Load data sets (one headline per line)
# NOTE(review): recent pandas releases reject sep='\n' in read_csv -- confirm against the installed version
clickbait_df = pd.read_csv('../data/clickbait/clickbait_data', sep='\n', names=['data'])
nonclickbait_df = pd.read_csv('../data/clickbait/non_clickbait_data', sep='\n', names=['data'])

# Add labels (non-clickbait is 0, clickbait is 1)
nonclickbait_df['label'] = 0
clickbait_df['label'] = 1

# Combine data sets and rearrange columns for uniformity
clickbait_df = pd.concat([clickbait_df, nonclickbait_df])
clickbait_df = clickbait_df.reindex(columns=['label', 'data'])

# Transform text into a dense (n_samples, n_features) count matrix. The matrix is kept as its
# own array because a 2-D matrix cannot be stored in a single DataFrame column
clickbait_features = vectorizer.fit_transform(clickbait_df['data']).toarray()

# Build the final array (column 0 = label, remaining columns = features), and add to dict
clickbait_data = np.column_stack((clickbait_df['label'].values, clickbait_features))
dataset_dict['clickbait'] = clickbait_data

## Grid Searcher

In [None]:
def get_best_model(estimator, param_grid, dataset_dict, scoring='accuracy', n_jobs=1, verbose=1, save=True,
                   train_size=5000, n_trials=3, cv=5):
    '''
    Takes data and model information and returns a dictionary of metrics on the best estimator for each data 
    set via grid search
    
    Params:
    - estimator: estimator object to use
    - param_grid (dict or list of dicts): values to perform grid search over
    - dataset_dict (dict): (name: dataset) paired dictionary for all datasets to return best estimator for.
        Each dataset is a 2-D array whose first column is the label and whose remaining columns are features
    - scoring='accuracy' (str): specifies how to rank each estimator
    - n_jobs=1 (int): number of cores to run training on. -1 includes all cores
    - verbose=1 (int): specifies if output messages should be provided
    - save=True (bool): flag for if the metric dictionary should be checkpointed to disk after every trial
    - train_size=5000 (int or float): passed to train_test_split as the size of each training split
    - n_trials=3 (int): number of independent train/test splits evaluated per data set
    - cv=5 (int): number of cross-validation folds used by the grid search
    Returns:
    - clf: grid search object from the last trial that was run (unfitted if dataset_dict is empty)
    - metric_dict (dict): returns dataset of the form {(name, trial#): {metric_name: metric, model: grid_search}}
    '''
    
    # Make sure proper data was passed in
    assert isinstance(dataset_dict, dict), 'Please pass in a correct dataset_dict'
    assert isinstance(param_grid, (list, set, tuple, dict)), 'Unexpected data type passed in for param_grid'
    if not isinstance(param_grid, dict):
        assert all(isinstance(grid, dict) for grid in param_grid), 'Unexpected data type passed in for param_grid'
    
    def _new_search():
        # Build a fresh grid search for every trial so each stored 'model' keeps its own fitted
        # state instead of all entries pointing at one object that is overwritten by later fits
        return GridSearchCV(estimator=estimator, 
                            param_grid=param_grid, 
                            cv=cv, n_jobs=n_jobs, 
                            verbose=verbose, 
                            scoring=scoring)
    
    metric_dict = {}
    clf = _new_search()
    
    # Checkpoints are grouped by classifier type (eg SVC, LogisticRegression, etc)
    loc_str = estimator.__class__.__name__
    checkpoint_dir = os.path.join('..', 'checkpoints', loc_str)
    
    # Analyze every data set (corresponding to whatever data input cells were ran)
    for name, dataset in dataset_dict.items():
        X, y = dataset[:, 1:], dataset[:, 0] # Treats first column as label (kept 1-D for sklearn)
        for i in range(n_trials):
            X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, shuffle=True)

            clf = _new_search()
            clf.fit(X_train, y_train) # Fit training data to model

            # Gather training set metrics
            y_train_pred = clf.predict(X_train)
            acc_train = accuracy_score(y_train, y_train_pred)
            precision_train, recall_train, f1_train, _ = precision_recall_fscore_support(y_train, y_train_pred)

            # Gather testing set metrics
            y_test_pred = clf.predict(X_test) # Predict test values using best parameters from classifier
            acc_test = accuracy_score(y_test, y_test_pred) # Get accuracy for predictions
            precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(y_test, y_test_pred)

            # Save metrics to dict for further analysis
            metric_dict[(name, i)] = {'acc_test': acc_test, 
                                      'acc_train': acc_train, 
                                      'precision_test': precision_test, 
                                      'precision_train': precision_train, 
                                      'recall_test': recall_test, 
                                      'recall_train': recall_train,
                                      'f1_test': f1_test, 
                                      'f1_train': f1_train, 
                                      'model': clf, 
                                      'cv_results': clf.cv_results_}
            if save:
                # Save checkpoint results in case of hardware failure
                # Create the output path (including missing parent directories) if it is not there yet
                if not os.path.isdir(checkpoint_dir):
                    print('Creating {} directory now'.format(loc_str))
                    os.makedirs(checkpoint_dir, exist_ok=True)
                
                # Checkpoint after EVERY trial, not only when the directory was just created
                checkpoint_name = '{}_{}_{}.pickle'.format(loc_str, name, i)
                save_dict(metric_dict, os.path.join(checkpoint_dir, checkpoint_name))
    
    return clf, metric_dict

### SVM

In [None]:
# Create grid of parameters to search over for SVM
c_vals = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
# NOTE: a literal gamma of 0 is not a usable RBF width (scikit-learn's own error message for it
# points to 'auto', i.e. 1 / n_features), so 'auto' is searched in its place
param_grid_svc = [{'kernel': ['linear'], 
                   'C': c_vals}, 
                  {'kernel': ['poly'], 
                   'degree': [2,3], 
                   'C': c_vals}, 
                  {'kernel': ['rbf'], 
                   'gamma': ['auto', 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2], 
                   'C': c_vals}]

In [None]:
# Create classifier, and then find best parameters via cv grid search
svc = SVC()
# The call stays on the same line as the assignment: a bare line break after `=` is a syntax error
svm_clf, svm_metric_dict = get_best_model(svc, param_grid_svc, dataset_dict)

In [None]:
# Create model & grid search object (5-fold CV over the SVM grid, ranked by accuracy)
clf_svc = GridSearchCV(estimator=svc, 
                       param_grid=param_grid_svc, 
                       cv=5, 
                       n_jobs=3, 
                       verbose=10, 
                       scoring='accuracy')

### Logistic Regression

In [None]:
# Logistic regression search space: L2-penalised models over C = 1e-8 ... 1e4 (one value per
# decade), plus a single unpenalised model
# NOTE(review): newer scikit-learn releases expect penalty=None instead of the string 'none' --
# confirm against the installed version
param_grid_logreg = [
    {
        'penalty': ['l2'],
        'C': [float('1e{}'.format(exponent)) for exponent in range(-8, 5)],
    },
    {
        'penalty': ['none'],
    },
]

In [None]:
# Create classifier, and then find best parameters via cv grid search
logreg = LogisticRegression()
# The call stays on the same line as the assignment: a bare line break after `=` is a syntax error
logreg_clf, logreg_metric_dict = get_best_model(logreg, param_grid_logreg, dataset_dict)

In [None]:
# Stand-alone grid search over the logistic regression grid: 5-fold CV ranked by accuracy,
# run on 3 workers with detailed progress output
clf_logreg = GridSearchCV(
    logreg,
    param_grid_logreg,
    scoring='accuracy',
    n_jobs=3,
    cv=5,
    verbose=10,
)

### Random Forest

In [None]:
# Random forest search space: forest sizes double from 2**7 (128) up to 2**14 (16384) trees,
# with max_features fixed at 1
param_grid_randomforest = {
    'n_estimators': [2 ** power for power in range(7, 15)],
    'max_features': [1],
}

In [None]:
# Create classifier, and then find best parameters via cv grid search
randomforest = RandomForestClassifier()
# The right-hand side is wrapped in parentheses so the call can sit on its own line; a bare
# line break after `=` is a syntax error
randomforest_clf, random_forest_metric_dict = (
    get_best_model(randomforest, param_grid_randomforest, dataset_dict, n_jobs=2, save=False)
)

In [None]:
# Stand-alone grid search over the random forest grid: 5-fold CV ranked by accuracy,
# run on 3 workers with detailed progress output
clf_randomforest = GridSearchCV(
    randomforest,
    param_grid_randomforest,
    scoring='accuracy',
    n_jobs=3,
    cv=5,
    verbose=10,
)