In [1]:
import sys
import spacy
import re
import pickle
import numpy as np
import pandas as pd
import scipy as sp
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS 
from collections import Counter
from plotnine import *
from pandas.tseries.offsets import MonthBegin
from yellowbrick.features import Rank2D
import feather
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.cluster import KElbowVisualizer
import pickle
from yellowbrick.text import FreqDistVisualizer
from yellowbrick.text import TSNEVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve
from yellowbrick.classifier import ConfusionMatrix, ROCAUC
from sklearn.preprocessing import StandardScaler
from yellowbrick.model_selection import ValidationCurve
from sklearn.metrics import classification_report

In [2]:
class EstimatorSelectionHelper:
    
    def __init__(self, models, params, params2, proportion_iterations):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.params2 = params2
        self.keys = models.keys()
        self.grid_searches = {}
        self.proportion_iterations = proportion_iterations
    
    def fit(self, X, y, cv=3, n_jobs=-1, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for {}.".format(key))
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs, 
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs
            
            
    
    def randomized_fit(self, X, y, cv=3, n_jobs=-1, verbose=1, scoring='f1'):
 
        params = self.params
        param_labels = []
        df = list(pd.DataFrame(self.params2).prod())
        
        for key in self.keys:
            param_labels.append(key)
        
        zipped = dict(zip(param_labels, df))
        
        for key in self.keys:
            print(key)
            if str(key) == 'RandomForestClassifier':
                n_iter = int(round(zipped[key]*(self.proportion_iterations**4.5)))
                print("Running RandomizedSearchCV for {} with {} iterations".format(key, n_iter))
            else:
                n_iter = int(round(zipped[key]*self.proportion_iterations)) 
                print("Running RandomizedSearchCV for {} with {} iterations".format(key, n_iter))
            model=self.models[key]
            params = self.params[key]
            rs = RandomizedSearchCV(model, params, cv=cv, n_jobs=n_jobs, verbose=verbose, scoring=scoring, return_train_score=True, n_iter=n_iter)
            rs.fit(X,y)
            self.grid_searches[key] = rs
       
    
    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})
                      
        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))
        
        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        
        return df[columns]


In [3]:
# Read in data
pitchfork_data = feather.read_dataframe('/home/michelle/Documents/data projects/pitchfork_data_analysis/data/processed/pitchfork_tfidf_aug30.feather')
#pitchfork_data = pd.read_csv('/home/michelle/Documents/data projects/pitchfork_data_analysis/data/processed/pitchfork_tfidf_aug30.csv')

In [4]:
pitchfork_data['category'] = pitchfork_data['category'].fillna(0)
pitchfork_data.head()

Unnamed: 0,publication_date,author,artist_name,album_x,label,score,review,like,song,album_1,...,shudder,sear,poke,hawk,dfa,torn,gibbard,juxtapos,album_y,category
0,2018-08-28 05:00:00,Evan Rytlewski,Devon Welsh,Dream Songs,You Are Accepted,6.9,For a songwriter who’s always been fixated on ...,0.131949,0.107358,0.021111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
1,2018-08-28 05:00:00,Philip Sherburne,Bamba Pana,Poaa,Nyege Nyege Tapes,7.2,"At first glance, Kampala, Uganda’s Nyege Nyege...",0.088597,0.028834,0.014175,...,0.065385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2,2018-08-28 05:00:00,Dean Van Nguyen,Nef the Pharaoh,The Big Chang Theory,KILFMB,7.3,Nef the Pharaoh takes the weight of his hometo...,0.080508,0.018341,0.036066,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
3,2018-08-28 05:00:00,Stuart Berman,Matthew “Doc” Dunn,Lightbourn,Cosmic Range,8.0,"For over a decade now, Matthew “Doc” Dunn has ...",0.116932,0.016649,0.049109,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
4,2018-08-27 05:00:00,Ian Cohen,Interpol,Marauder,Matador,6.1,More than any of their frenemies in the 2000s ...,0.046965,0.010699,0.052599,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antics,1.0


In [5]:
# Get x-y values
X=pitchfork_data.iloc[:,7:-2]
y = pitchfork_data.category.values

In [6]:
del pitchfork_data

In [None]:

scaler = StandardScaler()
X  = scaler.fit_transform(X,y)

In [7]:
# Test-Train Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=np.random.randint(1,1000))

In [8]:
# Oversample because class imbalance and also dimensionality reasons.
X_res, y_res=SMOTE().fit_sample(X_train,y_train)

In [9]:
models1 = {
    #'MultinomialNB': MultinomialNB(),
    'RandomForestClassifier': RandomForestClassifier(),
    #'KNeighborsClassifier': KNeighborsClassifier()
}

params1 = {
    #'MultinomialNB':{'alpha': np.linspace(0.5,20,50),
      #              'fit_prior': [True],
      #              'class_prior': [None]},
    'RandomForestClassifier': {'n_estimators':np.arange(1,650,40),
             'max_depth': np.arange(2,31,2),
             'min_samples_leaf': np.arange(2,31,1),
             'min_samples_split': np.arange(2,31,1)},
    #'KNeighborsClassifier': {'n_neighbors': np.arange(2,10,1)}
}

params2 = {
    #'MultinomialNB':{'alpha': len(np.linspace(0.5,20,50)),
      #              'fit_prior': len([True]),
      #              'class_prior': len([None])},
    'RandomForestClassifier': {'n_estimators':len(np.arange(1,650,40)),
             'max_depth': len(np.arange(2,31,2)),
             'min_samples_leaf': len(np.arange(2,31,1)),
             'min_samples_split': len(np.arange(2,31,1))},
    #'KNeighborsClassifier': {'n_neighbors': len(np.arange(2,16,1))}
}



In [13]:
X

Unnamed: 0,like,song,album_1,band,sound,music,record,track,guitar,rock,...,mp3,shortcom,shudder,sear,poke,hawk,dfa,torn,gibbard,juxtapos
0,0.131949,0.107358,0.021111,0.055243,0.042117,0.112961,0.068587,0.023139,0.028090,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0,0.000000
1,0.088597,0.028834,0.014175,0.000000,0.197955,0.075847,0.015351,0.031073,0.000000,0.000000,...,0.000000,0.0,0.065385,0.0,0.000000,0.0,0.0,0.00000,0.0,0.000000
2,0.080508,0.018341,0.036066,0.000000,0.017988,0.057895,0.078116,0.039530,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0,0.000000
3,0.116932,0.016649,0.049109,0.042836,0.000000,0.017518,0.017728,0.017942,0.043563,0.023551,...,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0,0.000000
4,0.046965,0.010699,0.052599,0.055055,0.031481,0.000000,0.022785,0.000000,0.000000,0.060538,...,0.047998,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0,0.000000
5,0.093361,0.017724,0.017427,0.000000,0.104300,0.093247,0.000000,0.019100,0.069563,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0,0.000000
6,0.161456,0.000000,0.016438,0.043016,0.032795,0.087960,0.017802,0.018017,0.065619,0.047300,...,0.000000,0.0,0.000000,0.0,0.074428,0.0,0.0,0.00000,0.0,0.000000
7,0.103580,0.026219,0.012889,0.000000,0.064287,0.055175,0.013959,0.098892,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0,0.000000
8,0.103580,0.026219,0.012889,0.000000,0.064287,0.055175,0.013959,0.098892,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0,0.000000
9,0.103580,0.026219,0.012889,0.000000,0.064287,0.055175,0.013959,0.098892,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0,0.000000


In [None]:
models1 = {
    'RandomForestClassifier': RandomForestClassifier(),
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': linear_model.LogisticRegression(),
    'SVC': SVC(),
    'KNeighborsClassifier': KNeighborsClassifier()
}

params1 = {
    'RandomForestClassifier': {'n_estimators':np.arange(1,650,50),
             'max_depth': np.arange(2,31,2),
             'min_samples_leaf': np.arange(2,31,1),
             'min_samples_split': np.arange(2,31,1)},
    'MultinomialNB':{'alpha': np.linspace(0.5,1,10),
                    'fit_prior': [True],
                    'class_prior': [None]},
    'LogisticRegression':{'C': np.logspace(0, 4, 15),
                         'penalty':['l1','l2']},
    'SVC': {'C': np.linspace(0.5,1,10),
           'kernel': ['linear', 'poly', 'rbf']},
    'KNeighborsClassifier': {'n_neighbors': np.arange(2,10,1)}
}

params2 = {
    'RandomForestClassifier': {'n_estimators':len(np.arange(1,650,50)),
             'max_depth': len(np.arange(2,31,2)),
             'min_samples_leaf': len(np.arange(2,31,1)),
             'min_samples_split': len(np.arange(2,31,1))},
    
    'MultinomialNB':{'alpha': len(np.linspace(0.5,1,10)),
                    'fit_prior': len([True]),
                    'class_prior': len([None])},
    
    'LogisticRegression':{'C': len(np.logspace(0, 4, 15)),
                         'penalty':len(['l1','l2'])},
    
             
    'SVC': {'C': len(np.linspace(0.5,1,10)),
           'kernel': len(['linear', 'poly', 'rbf'])},
    
    'KNeighborsClassifier': {'n_neighbors': len(np.arange(2,16,1))}
}



In [11]:
helper1 = EstimatorSelectionHelper(models1, params1, params2, proportion_iterations=.15)
helper1.randomized_fit(X_res, y_res, scoring='f1', n_jobs=-1, cv=5)
#helper1.randomized_fit(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)

RandomForestClassifier
Running RandomizedSearchCV for RandomForestClassifier with 42 iterations
Fitting 5 folds for each of 42 candidates, totalling 210 fits


JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/home/michelle/anaconda3/lib/python3.6/runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/home/michelle/anaconda3/lib/python3.6/runpy.py in _run_code(code=<code object <module> at 0x7fa8c8ec3660, file "/...3.6/site-packages/ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/home/michelle/anaconda3/lib/python3.6/site-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/home/michelle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/home/michel.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x7fa8c8ec3660, file "/...3.6/site-packages/ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/home/michelle/anaconda3/lib/python3.6/site-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/home/michelle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/home/michel.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    481         if self.poller is not None:
    482             self.poller.start()
    483         self.kernel.start()
    484         self.io_loop = ioloop.IOLoop.current()
    485         try:
--> 486             self.io_loop.start()
        self.io_loop.start = <bound method BaseAsyncIOLoop.start of <tornado.platform.asyncio.AsyncIOMainLoop object>>
    487         except KeyboardInterrupt:
    488             pass
    489 
    490 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/tornado/platform/asyncio.py in start(self=<tornado.platform.asyncio.AsyncIOMainLoop object>)
    122         except (RuntimeError, AssertionError):
    123             old_loop = None
    124         try:
    125             self._setup_logging()
    126             asyncio.set_event_loop(self.asyncio_loop)
--> 127             self.asyncio_loop.run_forever()
        self.asyncio_loop.run_forever = <bound method BaseEventLoop.run_forever of <_Uni...EventLoop running=True closed=False debug=False>>
    128         finally:
    129             asyncio.set_event_loop(old_loop)
    130 
    131     def stop(self):

...........................................................................
/home/michelle/anaconda3/lib/python3.6/asyncio/base_events.py in run_forever(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
    417             sys.set_asyncgen_hooks(firstiter=self._asyncgen_firstiter_hook,
    418                                    finalizer=self._asyncgen_finalizer_hook)
    419         try:
    420             events._set_running_loop(self)
    421             while True:
--> 422                 self._run_once()
        self._run_once = <bound method BaseEventLoop._run_once of <_UnixS...EventLoop running=True closed=False debug=False>>
    423                 if self._stopping:
    424                     break
    425         finally:
    426             self._stopping = False

...........................................................................
/home/michelle/anaconda3/lib/python3.6/asyncio/base_events.py in _run_once(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
   1427                         logger.warning('Executing %s took %.3f seconds',
   1428                                        _format_handle(handle), dt)
   1429                 finally:
   1430                     self._current_handle = None
   1431             else:
-> 1432                 handle._run()
        handle._run = <bound method Handle._run of <Handle BaseAsyncIOLoop._handle_events(11, 1)>>
   1433         handle = None  # Needed to break cycles when an exception occurs.
   1434 
   1435     def _set_coroutine_wrapper(self, enabled):
   1436         try:

...........................................................................
/home/michelle/anaconda3/lib/python3.6/asyncio/events.py in _run(self=<Handle BaseAsyncIOLoop._handle_events(11, 1)>)
    140             self._callback = None
    141             self._args = None
    142 
    143     def _run(self):
    144         try:
--> 145             self._callback(*self._args)
        self._callback = <bound method BaseAsyncIOLoop._handle_events of <tornado.platform.asyncio.AsyncIOMainLoop object>>
        self._args = (11, 1)
    146         except Exception as exc:
    147             cb = _format_callback_source(self._callback, self._args)
    148             msg = 'Exception in callback {}'.format(cb)
    149             context = {

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/tornado/platform/asyncio.py in _handle_events(self=<tornado.platform.asyncio.AsyncIOMainLoop object>, fd=11, events=1)
    112             self.writers.remove(fd)
    113         del self.handlers[fd]
    114 
    115     def _handle_events(self, fd, events):
    116         fileobj, handler_func = self.handlers[fd]
--> 117         handler_func(fileobj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fileobj = <zmq.sugar.socket.Socket object>
        events = 1
    118 
    119     def start(self):
    120         try:
    121             old_loop = asyncio.get_event_loop()

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    271         # Fast path when there are no active contexts.
    272         def null_wrapper(*args, **kwargs):
    273             try:
    274                 current_state = _state.contexts
    275                 _state.contexts = cap_contexts[0]
--> 276                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    277             finally:
    278                 _state.contexts = current_state
    279         null_wrapper._wrapped = True
    280         return null_wrapper

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    445             return
    446         zmq_events = self.socket.EVENTS
    447         try:
    448             # dispatch events:
    449             if zmq_events & zmq.POLLIN and self.receiving():
--> 450                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    451                 if not self.socket:
    452                     return
    453             if zmq_events & zmq.POLLOUT and self.sending():
    454                 self._handle_send()

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    475             else:
    476                 raise
    477         else:
    478             if self._recv_callback:
    479                 callback = self._recv_callback
--> 480                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    481         
    482 
    483     def _handle_send(self):
    484         """Handle a send event."""

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    427         close our socket."""
    428         try:
    429             # Use a NullContext to ensure that all StackContexts are run
    430             # inside our blanket exception handler rather than outside.
    431             with stack_context.NullContext():
--> 432                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    433         except:
    434             gen_log.error("Uncaught exception in ZMQStream callback",
    435                           exc_info=True)
    436             # Re-raise the exception so that IOLoop.handle_callback_exception

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    271         # Fast path when there are no active contexts.
    272         def null_wrapper(*args, **kwargs):
    273             try:
    274                 current_state = _state.contexts
    275                 _state.contexts = cap_contexts[0]
--> 276                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    277             finally:
    278                 _state.contexts = current_state
    279         null_wrapper._wrapped = True
    280         return null_wrapper

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': "helper1 = EstimatorSelectionHelper(models1, para...(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 8, 31, 12, 2, 13, 121463, tzinfo=tzutc()), 'msg_id': '258b9528821b4dac84482d21f021e3ae', 'msg_type': 'execute_request', 'session': 'a5f72f95a3294beeacb03d84e49209df', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '258b9528821b4dac84482d21f021e3ae', 'msg_type': 'execute_request', 'parent_header': {}})
    228             self.log.warn("Unknown message type: %r", msg_type)
    229         else:
    230             self.log.debug("%s: %s", msg_type, msg)
    231             self.pre_handler_hook()
    232             try:
--> 233                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'a5f72f95a3294beeacb03d84e49209df']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': "helper1 = EstimatorSelectionHelper(models1, para...(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 8, 31, 12, 2, 13, 121463, tzinfo=tzutc()), 'msg_id': '258b9528821b4dac84482d21f021e3ae', 'msg_type': 'execute_request', 'session': 'a5f72f95a3294beeacb03d84e49209df', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '258b9528821b4dac84482d21f021e3ae', 'msg_type': 'execute_request', 'parent_header': {}}
    234             except Exception:
    235                 self.log.error("Exception in message handler:", exc_info=True)
    236             finally:
    237                 self.post_handler_hook()

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'a5f72f95a3294beeacb03d84e49209df'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': "helper1 = EstimatorSelectionHelper(models1, para...(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 8, 31, 12, 2, 13, 121463, tzinfo=tzutc()), 'msg_id': '258b9528821b4dac84482d21f021e3ae', 'msg_type': 'execute_request', 'session': 'a5f72f95a3294beeacb03d84e49209df', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '258b9528821b4dac84482d21f021e3ae', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code="helper1 = EstimatorSelectionHelper(models1, para...(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    203 
    204         self._forward_input(allow_stdin)
    205 
    206         reply_content = {}
    207         try:
--> 208             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = "helper1 = EstimatorSelectionHelper(models1, para...(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)"
        store_history = True
        silent = False
    209         finally:
    210             self._restore_input()
    211 
    212         if res.error_before_exec is not None:

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=("helper1 = EstimatorSelectionHelper(models1, para...(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)",), **kwargs={'silent': False, 'store_history': True})
    532             )
    533         self.payload_manager.write_payload(payload)
    534 
    535     def run_cell(self, *args, **kwargs):
    536         self._last_traceback = None
--> 537         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ("helper1 = EstimatorSelectionHelper(models1, para...(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)",)
        kwargs = {'silent': False, 'store_history': True}
    538 
    539     def _showtraceback(self, etype, evalue, stb):
    540         # try to preserve ordering of tracebacks and print statements
    541         sys.stdout.flush()

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="helper1 = EstimatorSelectionHelper(models1, para...(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)", store_history=True, silent=False, shell_futures=True)
   2657         -------
   2658         result : :class:`ExecutionResult`
   2659         """
   2660         try:
   2661             result = self._run_cell(
-> 2662                 raw_cell, store_history, silent, shell_futures)
        raw_cell = "helper1 = EstimatorSelectionHelper(models1, para...(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)"
        store_history = True
        silent = False
        shell_futures = True
   2663         finally:
   2664             self.events.trigger('post_execute')
   2665             if not silent:
   2666                 self.events.trigger('post_run_cell', result)

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in _run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="helper1 = EstimatorSelectionHelper(models1, para...(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)", store_history=True, silent=False, shell_futures=True)
   2780                 self.displayhook.exec_result = result
   2781 
   2782                 # Execute the user code
   2783                 interactivity = 'none' if silent else self.ast_node_interactivity
   2784                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2785                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2786                 
   2787                 self.last_execution_succeeded = not has_raised
   2788                 self.last_execution_result = result
   2789 

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Expr object>], cell_name='<ipython-input-11-431c3d50f6bd>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 7fa83dcd5b00, executi...rue silent=False shell_futures=True> result=None>)
   2904                     return True
   2905 
   2906             for i, node in enumerate(to_run_interactive):
   2907                 mod = ast.Interactive([node])
   2908                 code = compiler(mod, cell_name, "single")
-> 2909                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7fa83dccf150, file "<ipython-input-11-431c3d50f6bd>", line 2>
        result = <ExecutionResult object at 7fa83dcd5b00, executi...rue silent=False shell_futures=True> result=None>
   2910                     return True
   2911 
   2912             # Flush softspace
   2913             if softspace(sys.stdout, 0):

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7fa83dccf150, file "<ipython-input-11-431c3d50f6bd>", line 2>, result=<ExecutionResult object at 7fa83dcd5b00, executi...rue silent=False shell_futures=True> result=None>)
   2958         outflag = True  # happens in more places, so it's easier as default
   2959         try:
   2960             try:
   2961                 self.hooks.pre_run_code_hook()
   2962                 #rprint('Running code', repr(code_obj)) # dbg
-> 2963                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7fa83dccf150, file "<ipython-input-11-431c3d50f6bd>", line 2>
        self.user_global_ns = {'ADASYN': <class 'imblearn.over_sampling.adasyn.ADASYN'>, 'ConfusionMatrix': <class 'yellowbrick.classifier.confusion_matrix.ConfusionMatrix'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'Counter': <class 'collections.Counter'>, 'EstimatorSelectionHelper': <class '__main__.EstimatorSelectionHelper'>, 'FreqDistVisualizer': <class 'yellowbrick.text.freqdist.FrequencyVisualizer'>, 'In': ['', 'import sys\nimport spacy\nimport re\nimport pickle\n...from sklearn.metrics import classification_report', 'class EstimatorSelectionHelper:\n    \n    def __i...t in columns]\n        \n        return df[columns]', "# Read in data\npitchfork_data = feather.read_dat...alysis/data/processed/pitchfork_tfidf_aug30.csv')", "pitchfork_data['category'] = pitchfork_data['category'].fillna(0)\npitchfork_data.head()", '# Get x-y values\nX=pitchfork_data.iloc[:,7:-2]\ny = pitchfork_data.category.values', 'del pitchfork_data', '# Test-Train Split\nX_train, X_test, y_train, y_t...ize=0.30, random_state=np.random.randint(1,1000))', '# Oversample because class imbalance and also di...\nX_res, y_res=SMOTE().fit_sample(X_train,y_train)', "models1 = {\n    #'MultinomialNB': MultinomialNB(...ifier': {'n_neighbors': len(np.arange(2,16,1))}\n}", "helper1 = EstimatorSelectionHelper(models1, para...(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)", "helper1 = EstimatorSelectionHelper(models1, para...(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)"], 'KElbowVisualizer': <class 'yellowbrick.cluster.elbow.KElbowVisualizer'>, 'KMeans': <class 'sklearn.cluster.k_means_.KMeans'>, 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, ...}
        self.user_ns = {'ADASYN': <class 'imblearn.over_sampling.adasyn.ADASYN'>, 'ConfusionMatrix': <class 'yellowbrick.classifier.confusion_matrix.ConfusionMatrix'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'Counter': <class 'collections.Counter'>, 'EstimatorSelectionHelper': <class '__main__.EstimatorSelectionHelper'>, 'FreqDistVisualizer': <class 'yellowbrick.text.freqdist.FrequencyVisualizer'>, 'In': ['', 'import sys\nimport spacy\nimport re\nimport pickle\n...from sklearn.metrics import classification_report', 'class EstimatorSelectionHelper:\n    \n    def __i...t in columns]\n        \n        return df[columns]', "# Read in data\npitchfork_data = feather.read_dat...alysis/data/processed/pitchfork_tfidf_aug30.csv')", "pitchfork_data['category'] = pitchfork_data['category'].fillna(0)\npitchfork_data.head()", '# Get x-y values\nX=pitchfork_data.iloc[:,7:-2]\ny = pitchfork_data.category.values', 'del pitchfork_data', '# Test-Train Split\nX_train, X_test, y_train, y_t...ize=0.30, random_state=np.random.randint(1,1000))', '# Oversample because class imbalance and also di...\nX_res, y_res=SMOTE().fit_sample(X_train,y_train)', "models1 = {\n    #'MultinomialNB': MultinomialNB(...ifier': {'n_neighbors': len(np.arange(2,16,1))}\n}", "helper1 = EstimatorSelectionHelper(models1, para...(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)", "helper1 = EstimatorSelectionHelper(models1, para...(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)"], 'KElbowVisualizer': <class 'yellowbrick.cluster.elbow.KElbowVisualizer'>, 'KMeans': <class 'sklearn.cluster.k_means_.KMeans'>, 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, ...}
   2964             finally:
   2965                 # Reset our crash handler in place
   2966                 sys.excepthook = old_excepthook
   2967         except SystemExit as e:

...........................................................................
/home/michelle/Documents/data projects/pitchfork_data_analysis/notebooks/<ipython-input-11-431c3d50f6bd> in <module>()
      1 helper1 = EstimatorSelectionHelper(models1, params1, params2, proportion_iterations=.15)
----> 2 helper1.randomized_fit(X_res, y_res, scoring='f1', n_jobs=-1, cv=5)
      3 #helper1.randomized_fit(X_train, y_train, scoring='f1', n_jobs=-1, cv=5)

...........................................................................
/home/michelle/Documents/data projects/pitchfork_data_analysis/notebooks/<ipython-input-2-898b4ca2ff12> in randomized_fit(self=<__main__.EstimatorSelectionHelper object>, X=array([[0.15006067, 0.11395462, 0.01400506, ...,....., 0.        , 0.        ,
        0.        ]]), y=array([0., 0., 0., ..., 1., 1., 1.]), cv=5, n_jobs=-1, verbose=1, scoring='f1')
     44                 n_iter = int(round(zipped[key]*self.proportion_iterations)) 
     45                 print("Running RandomizedSearchCV for {} with {} iterations".format(key, n_iter))
     46             model=self.models[key]
     47             params = self.params[key]
     48             rs = RandomizedSearchCV(model, params, cv=cv, n_jobs=n_jobs, verbose=verbose, scoring=scoring, return_train_score=True, n_iter=n_iter)
---> 49             rs.fit(X,y)
     50             self.grid_searches[key] = rs
     51        
     52     
     53     def score_summary(self, sort_by='mean_score'):

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self=RandomizedSearchCV(cv=5, error_score='raise',
  ...return_train_score=True, scoring='f1', verbose=1), X=array([[0.15006067, 0.11395462, 0.01400506, ...,....., 0.        , 0.        ,
        0.        ]]), y=array([0., 0., 0., ..., 1., 1., 1.]), groups=None, **fit_params={})
    634                                   return_train_score=self.return_train_score,
    635                                   return_n_test_samples=True,
    636                                   return_times=True, return_parameters=False,
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
        cv.split = <bound method StratifiedKFold.split of Stratifie...ld(n_splits=5, random_state=None, shuffle=False)>
        X = array([[0.15006067, 0.11395462, 0.01400506, ...,....., 0.        , 0.        ,
        0.        ]])
        y = array([0., 0., 0., ..., 1., 1., 1.])
        groups = None
    640 
    641         # if one choose to see train score, "out" will contain train score info
    642         if self.return_train_score:
    643             (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseSearchCV.fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Fri Aug 31 08:11:03 2018
PID: 17956                Python 3.6.5: /home/michelle/anaconda3/bin/python
...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), memmap([[0.15006067, 0.11395462, 0.01400506, ......., 0.        , 0.        ,
         0.        ]]), array([0., 0., 0., ..., 1., 1., 1.]), {'score': make_scorer(f1_score)}, array([    0,     1,     2, ..., 21716, 21717, 21718]), array([15108, 15109, 15110, ..., 24129, 24130, 24131]), 1, {'max_depth': 22, 'min_samples_leaf': 17, 'min_samples_split': 13, 'n_estimators': 641}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': True})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), memmap([[0.15006067, 0.11395462, 0.01400506, ......., 0.        , 0.        ,
         0.        ]]), array([0., 0., 0., ..., 1., 1., 1.]), {'score': make_scorer(f1_score)}, array([    0,     1,     2, ..., 21716, 21717, 21718]), array([15108, 15109, 15110, ..., 24129, 24130, 24131]), 1, {'max_depth': 22, 'min_samples_leaf': 17, 'min_samples_split': 13, 'n_estimators': 641})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': True}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), X=memmap([[0.15006067, 0.11395462, 0.01400506, ......., 0.        , 0.        ,
         0.        ]]), y=array([0., 0., 0., ..., 1., 1., 1.]), scorer={'score': make_scorer(f1_score)}, train=array([    0,     1,     2, ..., 21716, 21717, 21718]), test=array([15108, 15109, 15110, ..., 24129, 24130, 24131]), verbose=1, parameters={'max_depth': 22, 'min_samples_leaf': 17, 'min_samples_split': 13, 'n_estimators': 641}, fit_params={}, return_train_score=True, return_parameters=False, return_n_test_samples=True, return_times=True, error_score='raise')
    453 
    454     try:
    455         if y_train is None:
    456             estimator.fit(X_train, **fit_params)
    457         else:
--> 458             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method BaseForest.fit of RandomForestClas...e=None, verbose=0,
            warm_start=False)>
        X_train = memmap([[0.15006067, 0.11395462, 0.01400506, ......., 0.        , 0.        ,
         0.        ]])
        y_train = array([0., 0., 0., ..., 1., 1., 1.])
        fit_params = {}
    459 
    460     except Exception as e:
    461         # Note fit time as time until error
    462         fit_time = time.time() - start_time

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py in fit(self=RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), X=array([[0.15006067, 0.11395462, 0.01400506, ...,... 0.        ,
        0.        ]], dtype=float32), y=array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]]), sample_weight=None)
    323             trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
    324                              backend="threading")(
    325                 delayed(_parallel_build_trees)(
    326                     t, self, X, y, sample_weight, i, len(trees),
    327                     verbose=self.verbose, class_weight=self.class_weight)
--> 328                 for i, t in enumerate(trees))
        i = 640
    329 
    330             # Collect newly grown trees
    331             self.estimators_.extend(trees)
    332 

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=1), iterable=<generator object BaseForest.fit.<locals>.<genexpr>>)
    774         self.n_completed_tasks = 0
    775         try:
    776             # Only set self._iterating to True if at least a batch
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
        self.dispatch_one_batch = <bound method Parallel.dispatch_one_batch of Parallel(n_jobs=1)>
        iterator = <generator object BaseForest.fit.<locals>.<genexpr>>
    780                 self._iterating = True
    781             else:
    782                 self._iterating = False
    783 

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self=Parallel(n_jobs=1), iterator=<generator object BaseForest.fit.<locals>.<genexpr>>)
    620             tasks = BatchedCalls(itertools.islice(iterator, batch_size))
    621             if len(tasks) == 0:
    622                 # No more tasks available in the iterator: tell caller to stop.
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
        self._dispatch = <bound method Parallel._dispatch of Parallel(n_jobs=1)>
        tasks = <sklearn.externals.joblib.parallel.BatchedCalls object>
    626                 return True
    627 
    628     def _print(self, msg, msg_args):
    629         """Display the message on stout or stderr depending on verbosity"""

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self=Parallel(n_jobs=1), batch=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    583         self.n_dispatched_tasks += len(batch)
    584         self.n_dispatched_batches += 1
    585 
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
        job = undefined
        self._backend.apply_async = <bound method SequentialBackend.apply_async of <...lib._parallel_backends.SequentialBackend object>>
        batch = <sklearn.externals.joblib.parallel.BatchedCalls object>
        cb = <sklearn.externals.joblib.parallel.BatchCompletionCallBack object>
    589         self._jobs.append(job)
    590 
    591     def dispatch_next(self):
    592         """Dispatch more data for parallel processing

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self=<sklearn.externals.joblib._parallel_backends.SequentialBackend object>, func=<sklearn.externals.joblib.parallel.BatchedCalls object>, callback=<sklearn.externals.joblib.parallel.BatchCompletionCallBack object>)
    106             raise ValueError('n_jobs == 0 in Parallel has no meaning')
    107         return 1
    108 
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
        result = undefined
        func = <sklearn.externals.joblib.parallel.BatchedCalls object>
    112         if callback:
    113             callback(result)
    114         return result
    115 

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self=<sklearn.externals.joblib._parallel_backends.ImmediateResult object>, batch=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    327 
    328 class ImmediateResult(object):
    329     def __init__(self, batch):
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
        self.results = undefined
        batch = <sklearn.externals.joblib.parallel.BatchedCalls object>
    333 
    334     def get(self):
    335         return self.results
    336 

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _parallel_build_trees>, (DecisionTreeClassifier(class_weight=None, criter...         random_state=843533618, splitter='best'), RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), array([[0.15006067, 0.11395462, 0.01400506, ...,... 0.        ,
        0.        ]], dtype=float32), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]]), None, 420, 641), {'class_weight': None, 'verbose': 0})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _parallel_build_trees>
        args = (DecisionTreeClassifier(class_weight=None, criter...         random_state=843533618, splitter='best'), RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), array([[0.15006067, 0.11395462, 0.01400506, ...,... 0.        ,
        0.        ]], dtype=float32), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]]), None, 420, 641)
        kwargs = {'class_weight': None, 'verbose': 0}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py in _parallel_build_trees(tree=DecisionTreeClassifier(class_weight=None, criter...         random_state=843533618, splitter='best'), forest=RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), X=array([[0.15006067, 0.11395462, 0.01400506, ...,... 0.        ,
        0.        ]], dtype=float32), y=array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]]), sample_weight=None, tree_idx=420, n_trees=641, verbose=0, class_weight=None)
    116                 warnings.simplefilter('ignore', DeprecationWarning)
    117                 curr_sample_weight *= compute_sample_weight('auto', y, indices)
    118         elif class_weight == 'balanced_subsample':
    119             curr_sample_weight *= compute_sample_weight('balanced', y, indices)
    120 
--> 121         tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
        tree.fit = <bound method DecisionTreeClassifier.fit of Deci...        random_state=843533618, splitter='best')>
        X = array([[0.15006067, 0.11395462, 0.01400506, ...,... 0.        ,
        0.        ]], dtype=float32)
        y = array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])
        sample_weight = None
        curr_sample_weight = array([0., 3., 1., ..., 1., 1., 1.])
    122     else:
    123         tree.fit(X, y, sample_weight=sample_weight, check_input=False)
    124 
    125     return tree

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/tree/tree.py in fit(self=DecisionTreeClassifier(class_weight=None, criter...         random_state=843533618, splitter='best'), X=array([[0.15006067, 0.11395462, 0.01400506, ...,... 0.        ,
        0.        ]], dtype=float32), y=array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]]), sample_weight=array([0., 3., 1., ..., 1., 1., 1.]), check_input=False, X_idx_sorted=None)
    785 
    786         super(DecisionTreeClassifier, self).fit(
    787             X, y,
    788             sample_weight=sample_weight,
    789             check_input=check_input,
--> 790             X_idx_sorted=X_idx_sorted)
        X_idx_sorted = None
    791         return self
    792 
    793     def predict_proba(self, X, check_input=True):
    794         """Predict class probabilities of the input samples X.

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/tree/tree.py in fit(self=DecisionTreeClassifier(class_weight=None, criter...         random_state=843533618, splitter='best'), X=array([[0.15006067, 0.11395462, 0.01400506, ...,... 0.        ,
        0.        ]], dtype=float32), y=array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]]), sample_weight=array([0., 3., 1., ..., 1., 1., 1.]), check_input=False, X_idx_sorted=None)
    135             y = np.reshape(y, (-1, 1))
    136 
    137         self.n_outputs_ = y.shape[1]
    138 
    139         if is_classification:
--> 140             check_classification_targets(y)
        y = array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])
    141             y = np.copy(y)
    142 
    143             self.classes_ = []
    144             self.n_classes_ = []

...........................................................................
/home/michelle/anaconda3/lib/python3.6/site-packages/sklearn/utils/multiclass.py in check_classification_targets(y=array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]]))
    167     y : array-like
    168     """
    169     y_type = type_of_target(y)
    170     if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
    171                       'multilabel-indicator', 'multilabel-sequences']:
--> 172         raise ValueError("Unknown label type: %r" % y_type)
        y_type = 'continuous'
    173 
    174 
    175 def type_of_target(y):
    176     """Determine the type of data indicated by the target.

ValueError: Unknown label type: 'continuous'
___________________________________________________________________________

In [None]:
helper1.score_summary(sort_by='mean_score').to_csv('model_scores_aug30.csv', index=False)

In [None]:
helper1.score_summary(sort_by='mean_score')

In [None]:
clf = RandomForestClassifier(n_estimators=551,min_impurity_split=27, min_samples_leaf=18, max_depth=20)
clf = clf.fit(X_train,y_train)
pred = clf.predict(X_test)

In [None]:
print(classification_report(y_test, pred))

### Diagnostics

In [None]:
fig = plt.figure(figsize=(8,8), dpi=250)
cv = StratifiedKFold(12)
sizes = np.linspace(0.3, 1.0, 10)

viz = LearningCurve(
    RandomForestClassifier(n_estimators=81,min_samples_split=2, min_samples_leaf=5, max_depth=30), cv=cv, train_sizes=sizes,
    scoring='accuracy', n_jobs=4
)

# Fit and poof the visualizer
viz.fit(X, y)
viz.poof()

In [None]:
fig = plt.figure(figsize=(8,8), dpi=250)
visualizer = ROCAUC(RandomForestClassifier(n_estimators=81,min_samples_split=2, min_samples_leaf=5, max_depth=30))

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
g = visualizer.poof()   

In [None]:
# The ConfusionMatrix visualizer taxes a model
fig = plt.figure(figsize=(8,8), dpi=250)
cm = ConfusionMatrix(RandomForestClassifier(n_estimators=81,min_samples_split=2, min_samples_leaf=5, max_depth=30), percent=True)

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm.score(X_test, y_test)

# How did we do?
cm.poof()

### Text Viz

In [None]:
corpus = pickle.load( open( "/home/michelle/Documents/data projects/pitchfork_data_analysis/data/processed/corpus.p", "rb" ) )

In [None]:
fig = plt.figure(figsize=(8,25), dpi=250)

vectorizer = CountVectorizer()
docs       = vectorizer.fit_transform(corpus)
features   = vectorizer.get_feature_names()

visualizer = FreqDistVisualizer(features=features, n=100)
visualizer.fit(docs)
visualizer.poof()

### Clustering (Kmeans and TNSE)

In [None]:
fig = plt.figure(figsize=(8,8), dpi=250)
visualizer = KElbowVisualizer(KMeans(), k=(2,10), metric='distortion')
visualizer.fit(X_train) # Fit the training data to the visualizer
visualizer.poof() # Draw/show/poof the data

In [None]:
# Instantiate the clustering model and visualizer
fig = plt.figure(figsize=(15,15), dpi=275)
model = KMeans(6)
visualizer = SilhouetteVisualizer(model)

visualizer.fit(X_train) # Fit the training data to the visualizer
visualizer.poof() # Draw/show/poof the data

In [None]:
fig = plt.figure(figsize=(16,16), dpi=275)
tfidf  = TfidfVectorizer()
docs   = tfidf.fit_transform(corpus)
tsne = TSNEVisualizer(labels=["documents"], decompose='svd')
tsne.fit(docs)
tsne.poof()