## General Imports

In [None]:
# Install statements for all utilized libraries (uncomment which are needed)
#!pip3 install pandas # installs numpy with it 
#!pip3 install numpy
#!pip3 install pickle
#!pip3 install sklearn
#!pip3 install nltk

In [1]:
# Data Handling
import pandas as pd
import numpy as np

# Misc
import pickle # saving/loading metrics

# ML
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression

# Text Processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# nltk.download('stopwords')
# nltk.download('punkt')

## Helper Functions

In [2]:
def save_dict(dictionary, filename, verbose=False):
    '''
    Saves dictionary object as a pickle file for reloading and easy viewing
    
    Args:
    - dictionary (dict): data to be saved
    - filename (str): filename for dictionary to be stored in
    - verbose=False (bool): sepcifies if exact filename should be used. if False, .json extension appended to filename if not already present
    Return:
    - filename (str): filename for dictionary to be stored in
    '''
    if (not verbose) and ('.pickle' not in filename):
        filename += '.pickle'
        
    with open(filename, "wb") as outfile:  
        pickle.dump(dictionary, outfile)
        outfile.close()
    
    return filename
        
def load_dict(filename, verbose=False):
    '''
    Loads dictionary of metrics from given filename
    
    Args:
    - filename (str): file to load
    - verbose=False (bool): sepcifies if exact filename should be used. if False, .pickle extension appended to filename if not already present
    Return
    - dictionary (dict): data found in file
    - None (None): return None val in case exception is raised and dictionary file does not exist
    '''
    if (not verbose) and ('.pickle' not in filename):
        filename += '.pickle'

    try:
        with open(filename, 'rb') as pickle_file: 
            dictionary = pickle.load(pickle_file) 
    except FileNotFoundError as e:
        print(e)
        return None
    
    return dictionary

## Data Loading

In [3]:
# Create dict to store {name: dataset}
dataset_dict = {}

In [4]:
# Create vectorizer that turns text samples into token vector 
vectorizer = CountVectorizer(analyzer='word', tokenizer=word_tokenize, stop_words=stopwords.words('english'))

### Yelp Polarity

In [5]:
# Create vectorizer that turns text samples into token vector 
yelp_vectorizer = CountVectorizer(analyzer='word', tokenizer=word_tokenize, stop_words=stopwords.words('english'), max_features=6000)

# Load yelp data sets
yelp_test_df = pd.read_csv('../data/yelp_review_polarity_csv/test.csv', names=['label', 'data']) 
yelp_train_df = pd.read_csv('../data/yelp_review_polarity_csv/train.csv', names=['label', 'data']) 

# Since yelp data set is already split into test and train, recombine
yelp_df = pd.concat([yelp_test_df, yelp_train_df])

# Data set is too large to work with in memory since I don't have 2TiB of RAM just lying around, so we're cutting the data down
yelp_df = yelp_df.sample(n=16000,replace=False,axis='index')

# Change 1, 2 label to 0, 1 for uniformity with other data sets
# Data set has 1 for negative and 2 for positive, so we switch 0 to negative and 1 to positive
yelp_df['label'] = yelp_df['label'].apply(lambda label: 0 if label == 1 else 1)

#Vectorize
yelp_df['data'] = vectorizer.fit_transform(yelp_df['data']).toarray()

# Transform df to np array for easier use & add info to dict
yelp_data = yelp_df.values
dataset_dict['yelp'] = yelp_data



### Subjectivity/Objectivity

In [6]:
# Create vectorizer that turns text samples into token vector 
subob_vectorizer = CountVectorizer(analyzer='word', tokenizer=word_tokenize, stop_words=stopwords.words('english'), max_features=2200)

# Load data sets
subjectivity_df = pd.read_csv('../data/subjectobject/subjectivity.txt', sep='\n', encoding='latin-1', names=['data'])
objectivity_df = pd.read_csv('../data/subjectobject/objectivity.txt', sep='\n', encoding='latin-1', names=['data'])

# Add labels (subjective is 0, objective is 0)
subjectivity_df['label'] = 0
objectivity_df['label'] = 1

# Combine data sets and rearrange columns for uniformity
sub_ob_df = pd.concat([subjectivity_df, objectivity_df])
sub_ob_df = sub_ob_df.reindex(columns=['label', 'data'])

#Vectorize
sub_ob_df['data'] = vectorizer.fit_transform(sub_ob_df['data']).toarray()

#Transform df to np array, and add to dict
sub_ob_data = sub_ob_df.values
dataset_dict['sub_ob'] = sub_ob_data

### Clickbait

In [5]:
# Create vectorizer that turns text samples into token vector 
clickbait_vectorizer = CountVectorizer(analyzer='word', tokenizer=word_tokenize, stop_words=stopwords.words('english'), max_features=2500)

# Load data sets
clickbait_df = pd.read_csv('../data/clickbait/clickbait_data', sep='\n', names=['data'])
nonclickbait_df = pd.read_csv('../data/clickbait/non_clickbait_data', sep='\n', names=['data'])

# Add labels (subjective is 0, objective is 0)
nonclickbait_df['label'] = 0
clickbait_df['label'] = 1

# Combine data sets and rearrange columns for uniformity
clickbait_df = pd.concat([clickbait_df, nonclickbait_df])
clickbait_df = clickbait_df.reindex(columns=['label', 'data'])

#Vectorize
clickbait_df['data'] = vectorizer.fit_transform(clickbait_df['data']).toarray()

#Transform df to np array, and add to dict
clickbait_data = clickbait_df.values
dataset_dict['clickbait'] = clickbait_data



### SVM

In [6]:
# Create metric dict
svm_metric_dict = {}

In [7]:
# Create grid of parameters to search over for SVM
c_vals = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
param_grid_svm = [{'kernel': ['linear'], 'C': c_vals}, {'kernel': ['poly'], 'degree': [0,2,3], 'C': c_vals}, {'kernel': ['rbf'], 'gamma': [0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2], 'C': c_vals}]

In [8]:
# Create model & grid search object
svc = SVC()
clf_svc = GridSearchCV(estimator=svc, param_grid=param_grid_svm, cv=5, n_jobs=3, verbose=10, scoring='accuracy')

In [None]:
for name, dataset in dataset_dict.items():
    # Get data
    X, y = dataset[:, 1:], dataset[:, :1] #Treats first column as label
    for i in range(3):
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)
        
        clf_svc.fit(X_train, y_train.ravel()) # Fit training data to model
        
        # Train set performance
        y_train_pred = clf_svc.predict(X_train)
        acc_train = accuracy_score(y_train, y_train_pred)
        precision_train, recall_train, f1_train, _ = precision_recall_fscore_support(y_train, y_train_pred)
        
        # Test set performance
        y_test_pred = clf_svc.predict(X_test) # Predict test values using best parameters from classifier
        acc_test = accuracy_score(y_test, y_test_pred) # Get accuracy for predictions
        precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(y_test, y_test_pred)
        
        svm_metric_dict[(name, i)] = {'acc_test': acc_test, 'acc_train': acc_train, 'precision_test': precision_test, 'precision_train': precision_train, 'recall_test': recall_test, 'recall_train': recall_train,
                                      'f1_test': f1_test, 'f1_test': f1_train, 'model_test': clf_svc, 'cv_results': clf_svc.cv_results_} # Add metrics to dict for analysis
        save_dict(svm_metric_dict, '../checkpoints/svm/svm_{}_{}.pickle'.format(name, i)) # Save checkpoint results in case of hardware failure

Fitting 5 folds for each of 143 candidates, totalling 715 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.8s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    1.3s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    1.5s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:    2.2s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    2.7s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:    3.3s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    4.0s
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:    4.9s
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:    5.9s
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:    7.2s
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:    8.4s
[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed:    9.8s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:   11.2s
[Parallel(n_jobs=3)]: Done 139 tasks      | elapsed:   13.0s
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:   14.4s
[Parallel(

Fitting 5 folds for each of 143 candidates, totalling 715 fits


[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    0.7s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    0.9s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:    1.6s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    2.1s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:    2.8s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    3.5s
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:    4.4s
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:    5.3s
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:    6.8s
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:    7.9s
[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed:    9.4s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:   10.8s
[Parallel(n_jobs=3)]: Done 139 tasks      | elapsed:   12.6s
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:   14.0s
[Parallel(n_jobs=3)]: Done 175 tasks      | elapsed:   16.0s
[Parallel(n_jobs=3)]: Do

Fitting 5 folds for each of 143 candidates, totalling 715 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    0.7s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    0.9s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:    1.6s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    2.1s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:    2.8s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    3.5s
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:    4.4s
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:    5.4s
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:    6.8s
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:    8.0s
[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed:    9.5s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:   11.0s
[Parallel(n_jobs=3)]: Done 139 tasks      | elapsed:   12.8s
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:   14.2s
[Parallel(

In [None]:
###### DEPRACATED FOR MULTITHREAD SKLEARN GRID SEARCH, KEPT IN CASE OF MEASURING OTHER METRICS
from sklearn.model_selection import KFold, ParameterGrid
from tqdm import tqdm

# Cycle across each param combo
performance_dict = {}
for param_dict in tqdm(list(ParameterGrid(param_grid))):
    performance = 0
    C, degree, gamma, kernel = param_dict.values()
    if ((kernel in ('linear', 'rbf') and degree > 0) or  # Don't want to run linear or rbf with polynomial degrees (degree will be ignored but we'll get duplicate trials)
        (kernel == 'poly' and degree == 0) or # Don't want polynomial with degree 0
        (kernel in ('linear', 'poly') and gamma > 0) or # Don't want linear or poly with gamma param
        (kernel == 'rbf' and gamma == 0)): # Don't want rbf with 0 gamma
        continue
    # Do k fold validation
    for train, validate in kf.split(X_letter_train):
        X_letter_train_cross, X_letter_val_cross, y_letter_train_cross, y_letter_val_cross = X_letter_train[train], X_letter_train[validate], y_letter_train[train], y_letter_train[validate] # get data folds
        svm_letter = SVC(C=C, degree=degree, kernel=kernel) # create the model #NOTE: not scaling because all data appears to follow the same scaling regardless
        svm_letter.fit(X_letter_train_cross, y_letter_train_cross.ravel()) # fit the model
        y_letter_val_cross_pred = svm_letter.predict(X_letter_val_cross) # predict validation data
        performance += accuracy_score(y_letter_val_cross, y_letter_val_cross_pred) # keep track of performance
    # Average the performance
    performance /= 5
    
    # Add performance info to dict
    performance_dict[(C, degree, kernel)] = performance

## Logistic Regression

In [8]:
logreg_metric_dict = {}

In [9]:
param_grid_logreg = [{'penalty': ['l2'], 'C': [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]}, {'penalty': ['none']}]

In [10]:
logreg = LogisticRegression()
clf_logreg = GridSearchCV(estimator=logreg, param_grid=param_grid_logreg, cv=5, n_jobs=3, verbose=10, scoring='accuracy')

In [15]:
for name, dataset in dataset_dict.items():
    X, y = dataset[:, 1:], dataset[:, :1]
    for i in range(3):
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)
        
        clf_logreg.fit(X_train, y_train.ravel())
        
        # Train set performance
        y_train_pred = clf_logreg.predict(X_train)
        acc_train = accuracy_score(y_train, y_train_pred)
        precision_train, recall_train, f1_train, _ = precision_recall_fscore_support(y_train, y_train_pred)
        
        # Test set performance
        y_test_pred = clf_logreg.predict(X_test) # Predict test values using best parameters from classifier
        acc_test = accuracy_score(y_test, y_test_pred) # Get accuracy for predictions
        precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(y_test, y_test_pred)
        
        logreg_metric_dict[(name, i)] = {'acc_test': acc_test, 'acc_train': acc_train, 'precision_test': precision_test, 'precision_train': precision_train, 'recall_test': recall_test, 'recall_train': recall_train,
                                      'f1_test': f1_test, 'f1_train': f1_train, 'model': clf_logreg, 'cv_results': clf_logreg.cv_results_} # Add metrics to dict for analysis
        save_dict(logreg_metric_dict, '../checkpoints/logreg/logreg_{}_{}.pickle'.format(name, i))

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Batch computation too fast (0.0186s.) Setting batch_size=2.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0565s.) Setting batch_size=4.
[Parallel(n_jobs=3)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0299s.) Setting batch_size=8.
[Parallel(n_jobs=3)]: Done  50 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0555s.) Setting batch_size=16.
[Parallel(n_jobs=3)]: Done  70 out of  70 | elapsed:    0.2s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Batch computation too fast (0.0114s.) Setting batch_size=2.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.0s
[Pa

Fitting 5 folds for each of 14 candidates, totalling 70 fits
Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=3)]: Batch computation too fast (0.0141s.) Setting batch_size=2.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0186s.) Setting batch_size=4.
[Parallel(n_jobs=3)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0384s.) Setting batch_size=8.
[Parallel(n_jobs=3)]: Done  50 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0534s.) Setting batch_size=16.
[Parallel(n_jobs=3)]: Done  70 out of  70 | elapsed:    0.2s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Batch computation too fast (0.0091s.) Setting batch_size=2.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.0s


Fitting 5 folds for each of 14 candidates, totalling 70 fits
Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=3)]: Batch computation too fast (0.0153s.) Setting batch_size=4.
[Parallel(n_jobs=3)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0279s.) Setting batch_size=8.
[Parallel(n_jobs=3)]: Done  50 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0548s.) Setting batch_size=16.
[Parallel(n_jobs=3)]: Done  70 out of  70 | elapsed:    0.2s finished
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Batch computation too fast (0.0107s.) Setting batch_size=2.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0184s.) Setting batch_size=4.
[Parallel(n_jobs=3)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Batch c

Fitting 5 folds for each of 14 candidates, totalling 70 fits
Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=3)]: Done  70 out of  70 | elapsed:    0.2s finished
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Batch computation too fast (0.0075s.) Setting batch_size=2.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0190s.) Setting batch_size=4.
[Parallel(n_jobs=3)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0206s.) Setting batch_size=8.
[Parallel(n_jobs=3)]: Done  50 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0630s.) Setting batch_size=16.
[Parallel(n_jobs=3)]: Done  70 out of  70 | elapsed:    0.1s finished
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(r

Fitting 5 folds for each of 14 candidates, totalling 70 fits
Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=3)]: Batch computation too fast (0.0129s.) Setting batch_size=2.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0123s.) Setting batch_size=4.
[Parallel(n_jobs=3)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0199s.) Setting batch_size=8.
[Parallel(n_jobs=3)]: Done  50 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0676s.) Setting batch_size=16.
[Parallel(n_jobs=3)]: Done  70 out of  70 | elapsed:    0.2s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Batch computation too fast (0.0104s.) Setting batch_size=2.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0165s.) Setting batch_size

In [None]:
dictionary = load_dict('../checkpoints/svm/svm_clickbait_0.pickle')
dictionary