In [1]:
import sys
import spacy
import re
import pickle
import numpy as np
import pandas as pd
import scipy as sp
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS 
from collections import Counter
from plotnine import *
from pandas.tseries.offsets import MonthBegin
from yellowbrick.features import Rank2D
import feather
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [2]:
class EstimatorSelectionHelper:
    """Run hyper-parameter searches over several estimators and tabulate the scores.

    Parameters
    ----------
    models : dict
        Maps an estimator label to an estimator instance.
    params : dict
        Maps every label in ``models`` to the parameter grid (or
        distributions) handed to the search object.
    params2 : dict
        Maps a label to ``{parameter name: number of candidate values}``.
        Only ``randomized_fit`` reads it, to size the random search.
    proportion_iterations : float
        Fraction of the full grid size that ``randomized_fit`` samples.

    Raises
    ------
    ValueError
        If a label in ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params, params2, proportion_iterations):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.params2 = params2
        self.keys = models.keys()
        # Fitted search objects, keyed by estimator label.
        self.grid_searches = {}
        self.proportion_iterations = proportion_iterations

    def fit(self, X, y, cv=3, n_jobs=-1, verbose=1, scoring=None, refit=False):
        """Run an exhaustive ``GridSearchCV`` for every estimator.

        Each fitted search object is stored in ``self.grid_searches`` under
        the estimator's label. The keyword arguments are forwarded to
        ``GridSearchCV`` unchanged.
        """
        # Imported locally because the notebook's import cell never brings
        # GridSearchCV into scope; without this the method raises NameError.
        from sklearn.model_selection import GridSearchCV

        for key in self.keys:
            print("Running GridSearchCV for {}.".format(key))
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def _n_iterations(self, key, grid_size):
        """Return the number of candidates to sample for ``key`` (at least 1)."""
        proportion = self.proportion_iterations
        if str(key) == 'RandomForestClassifier':
            # NOTE(review): the proportion is raised to the 20th power for the
            # random forest, which shrinks its budget to (almost) nothing.
            # Presumably meant to throttle the very large RF grid; confirm
            # that this exponent is intended.
            proportion = proportion ** 20
        # A search with zero iterations evaluates no candidates at all, so
        # never go below one.
        return max(1, int(round(grid_size * proportion)))

    def randomized_fit(self, X, y, cv=3, n_jobs=-1, verbose=1, scoring='f1'):
        """Run a ``RandomizedSearchCV`` for every estimator.

        The number of sampled candidates is the estimator's full grid size
        (product of the counts in ``params2``) multiplied by
        ``proportion_iterations``, with a minimum of one. Fitted search
        objects are stored in ``self.grid_searches``.

        Raises
        ------
        KeyError
            If an estimator label is missing from ``params2``.
        """
        for key in self.keys:
            print(key)
            # Look the grid size up by label; pairing by position would give
            # estimators the wrong budget whenever ``params2`` is ordered
            # differently from ``models`` or carries extra entries.
            grid_size = int(np.prod(list(self.params2[key].values())))
            n_iter = self._n_iterations(key, grid_size)
            print("Running RandomizedSearchCV for {} with {} iterations".format(key, n_iter))
            model = self.models[key]
            params = self.params[key]
            rs = RandomizedSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                                    verbose=verbose, scoring=scoring,
                                    return_train_score=True, n_iter=n_iter)
            rs.fit(X, y)
            self.grid_searches[key] = rs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per evaluated parameter setting, sorted by ``sort_by``.

        Every row holds the estimator label, the min / mean / max / std of the
        per-split test scores and the parameter values of that candidate. The
        result is sorted in descending order of ``sort_by``.

        Raises
        ------
        ValueError
            If no search has been fitted yet.
        """
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            print(k)
            search = self.grid_searches[k]
            results = search.cv_results_
            params = results['params']
            # Prefer the fitted split count: ``cv`` may be None or a splitter
            # object, neither of which can be passed to range().
            n_splits = getattr(search, 'n_splits_', None)
            if n_splits is None:
                n_splits = search.cv
            scores = [
                results["split{}_test_score".format(i)].reshape(len(params), 1)
                for i in range(n_splits)
            ]

            # One row per candidate, one column per CV split.
            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        if not rows:
            raise ValueError("No search results to summarise; call fit() or randomized_fit() first.")

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        # Fixed summary columns first, then whichever parameter columns exist.
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]


In [5]:
# Read in data
# Loads a Feather file into the DataFrame `pitchfork_data` (presumably the
# TF-IDF feature table, judging by the file name).
# NOTE(review): the path is absolute and machine-specific. The recorded run of
# this cell failed with "No such file or directory", so every later cell that
# uses `pitchfork_data` cannot run until the path points at an existing file.
pitchfork_data = feather.read_dataframe('/home/michelle/Documents/data projects/pitch_fork_data_analysis/pitchfork_tfidf.feather')


ArrowIOError: Failed to open local file: /home/michelle/Documents/data projects/pitch_fork_data_analysis/pitchfork_tfidf.feather , error: No such file or directory

In [None]:
# Get x-y values
# Features: every column from position 6 up to, but not including, the last.
# NOTE(review): presumably the first six columns are metadata and the last one
# is not a feature. Confirm this, and check that `category` itself does not
# fall inside the slice, since that would leak the label into X.
X=pitchfork_data.iloc[:,6:-1]
# Target: the values of the `category` column.
y = pitchfork_data.category.values

In [None]:
# Test-Train Split
# Hold out 20% of the rows for testing. The seed is a fixed constant so that
# restarting the kernel and re-running the notebook yields the same partition
# (a seed drawn at random on every run makes results irreproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Oversample the minority class(es) because of class imbalance and also for
# dimensionality reasons. Only the training split is resampled; the test split
# is left untouched.
# `fit_resample` is the current imbalanced-learn API; the older `fit_sample`
# alias was deprecated and later removed. The sampler is seeded so the
# synthetic samples are reproducible.
X_res, y_res = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Candidate estimators, keyed by the label used by EstimatorSelectionHelper.
models1 = {
    'RandomForestClassifier': RandomForestClassifier(),
    'MultinomialNB': MultinomialNB(),
    # liblinear supports both the 'l1' and 'l2' penalties searched below; the
    # default solver of recent scikit-learn releases (lbfgs) rejects 'l1'.
    'LogisticRegression': linear_model.LogisticRegression(solver='liblinear'),
    'SVC': SVC(),
    'KNeighborsClassifier': KNeighborsClassifier()
}

# Hyper-parameter candidates for each estimator, keyed by the same labels.
params1 = {
    'RandomForestClassifier': {'n_estimators':np.arange(1,650,50),
             'max_depth': np.arange(2,31,2),
             'min_samples_leaf': np.arange(2,31,1),
             'min_samples_split': np.arange(2,31,1)},
    'MultinomialNB':{'alpha': np.linspace(0.5,1,10),
                    'fit_prior': [True],
                    'class_prior': [None]},
    'LogisticRegression':{'C': np.logspace(0, 4, 15),
                         'penalty':['l1','l2']},
    'SVC': {'C': np.linspace(0.5,1,10),
           'kernel': ['linear', 'poly', 'rbf']},
    'KNeighborsClassifier': {'n_neighbors': np.arange(2,16,1)}
}

# Number of candidate values per hyper-parameter. Derived from params1 so the
# two structures cannot drift apart when a grid is edited.
params2 = {
    name: {param: len(values) for param, values in grid.items()}
    for name, grid in params1.items()
}



In [None]:
# Build the search helper; 10% of each estimator's grid size is used as the
# sampling budget for the randomized search.
helper1 = EstimatorSelectionHelper(
    models=models1,
    params=params1,
    params2=params2,
    proportion_iterations=.1,
)

# Randomized search on the oversampled training data: 5-fold CV, F1 scoring,
# all available cores.
helper1.randomized_fit(
    X_res,
    y_res,
    cv=5,
    n_jobs=-1,
    scoring='f1',
)

In [None]:
# Persist the per-candidate score table, best maximum split score first.
(
    helper1
    .score_summary(sort_by='max_score')
    .to_csv('model_scores.csv', index=False)
)