# Notes:
This notebooks shows how to use scikit-learn pipelines .

Performs a simple grid search

# Imports

In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
import category_encoders as ce
from sklearn.pipeline import make_pipeline ,make_union, FeatureUnion, Pipeline
from sklearn.model_selection import GridSearchCV
import numpy
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.externals.joblib import Memory
from tempfile import mkdtemp
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest

# Read Files

In [2]:
submission_df = pd.read_csv('sampleSubmission.csv')

In [3]:
submission_df.head()

Unnamed: 0,id,s1,s2,s3,s4,s5,w1,w2,w3,w4,...,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15
0,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv("test.csv")

In [5]:
train_df.columns

Index(['id', 'tweet', 'state', 'location', 's1', 's2', 's3', 's4', 's5', 'w1',
       'w2', 'w3', 'w4', 'k1', 'k2', 'k3', 'k4', 'k5', 'k6', 'k7', 'k8', 'k9',
       'k10', 'k11', 'k12', 'k13', 'k14', 'k15'],
      dtype='object')

s= sentiment  
w = when   
k = kind  

In [6]:
len(train_df)

77946

In [7]:
sample_df = pd.read_csv('sampleSubmission.csv')

In [8]:
sample_df.head()

Unnamed: 0,id,s1,s2,s3,s4,s5,w1,w2,w3,w4,...,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15
0,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
train_df.columns

Index(['id', 'tweet', 'state', 'location', 's1', 's2', 's3', 's4', 's5', 'w1',
       'w2', 'w3', 'w4', 'k1', 'k2', 'k3', 'k4', 'k5', 'k6', 'k7', 'k8', 'k9',
       'k10', 'k11', 'k12', 'k13', 'k14', 'k15'],
      dtype='object')

# Pipeline

In [10]:
cat_cols = ['state', 'location']
cat_cols = ['state']
#cat_cols = ['location']

text_col= 'tweet'

In [11]:
def get_text_cols(X):
    return X[text_col]
def get_cat_cols(X):
    return X.loc[:,cat_cols]
def custom_scorer(model,X,y):
    y_predicted = model.predict(X)
    
    rms = sqrt(mean_squared_error(y, y_predicted))
    return rms
def trim_minor(df,threshold=5):
    
    def helper(sf):
        
        sf = sf.fillna('unknown')
        counts = sf.value_counts()
        repl = counts[counts <= threshold].index
        sf = sf.replace(repl, 'uncommon')
        #return df
        return sf
    return df.apply(helper,axis=1)

In [12]:
#train_df.iloc[:,['tweet']]

In [13]:
#get_text_cols(train_df)

helper pipeline to select columns to act on

In [14]:
get_cat = make_pipeline(FunctionTransformer(get_cat_cols, validate=False)) 
get_text = make_pipeline(FunctionTransformer(get_text_cols, validate=False)) 

In [54]:
# pipeline for categorical variables
category_pipeline = Pipeline([
    ('get_cat',get_cat)
    #,('trim_minor',FunctionTransformer(trim_minor, validate=False))
    ,('onehot_cat_encode', ce.OneHotEncoder(cols = cat_cols))
])

# pipeline for text feature
text_pipeline =  Pipeline([
    ('get_text', get_text)
    ,('tfidf',TfidfVectorizer(min_df=5))
    #, ('best', TruncatedSVD(n_components=50))
])


cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)

# union the pipeline 
features_pipeline = FeatureUnion(transformer_list=[
    ("text_pipeline", text_pipeline)
    ,
    ("category_pipeline", category_pipeline)
    ] )


# actual ml pipeline
# acts on features that are generated
ml_pipeline = Pipeline(steps=[
    ('features', features_pipeline),
   # ('feature_selection', SelectKBest(f_regression, k=1000)),
     

    ('clf', Ridge())],memory=memory)



# Parameters to grid search on
# {pipline_step_name}__{attribte}
param_grid = { 'clf__alpha':(0.5,1)
     ,'features__text_pipeline__tfidf__max_features': (None, 5000, 10000, 50000)
     ,'features__text_pipeline__tfidf__max_df': (0.5, 0.75, 1.0)
     , 'features__text_pipeline__tfidf__ngram_range': [(1, 1), (1, 2),(1, 3) ]
    }





In [55]:
X_train = train_df
y_train = train_df.iloc[:,4:]

validate the categorical pipeline looks valid

In [56]:
category_pipeline.fit_transform(X_train).shape

(77946, 52)

validate the text pipeline looks valid

In [57]:
text_pipeline.fit_transform(X_train.head(10)).shape

(10, 1)

validate the features pipeline looks valid

In [58]:
features_pipeline.fit_transform(X_train)

<77946x9007 sparse matrix of type '<class 'numpy.float64'>'
	with 1062335 stored elements in Compressed Sparse Row format>

In [59]:
train_df['location'].value_counts()


Chicago                          884
Las Vegas                        749
Portland, OR                     732
New York                         719
Kansas City                      661
Detroit                          621
Denver                           620
Milwaukee                        602
Atlanta                          599
Seattle-Tacoma                   578
Pittsburgh                       568
Indianapolis                     563
Boston (Manchester)              556
Philadelphia                     535
Baltimore                        527
Los Angeles                      525
Columbus, OH                     520
Salt Lake City                   519
Nashville                        501
Minneapolis-St. Paul             476
Oklahoma City                    470
Phoenix (Prescott)               465
Charlotte                        443
Louisville                       436
Raleigh-Durham (Fayetvlle)       417
Hartford & New Haven             415
Albuquerque-Santa Fe             414
B

# Cross validate

In [60]:
scores = cross_val_score(ml_pipeline,X_train,y_train,cv=3,
                         scoring=custom_scorer, verbose =10, n_jobs=-1)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,            id                                              tweet  \
25982   40117                      Warm weather..... awesome...!   
25983   40118  San 

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   22.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   22.1s finished


model seems steady

In [61]:
scores

array([ 0.16373877,  0.16421466,  0.16348622])

# Grid Search
Perform Random Grid Search (faster)     
Perform Grid Search ( slow)    

Regular Grid search will search all combinations     


In [62]:
random_search_final = RandomizedSearchCV(ml_pipeline, param_distributions=param_grid, cv=3
                                ,scoring=custom_scorer, verbose=2, n_jobs=-1)
random_search_final.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] features__text_pipeline__tfidf__ngram_range=(1, 3), features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__max_df=0.75, clf__alpha=1 
[CV] features__text_pipeline__tfidf__ngram_range=(1, 3), features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__max_df=0.75, clf__alpha=1 
[CV] features__text_pipeline__tfidf__ngram_range=(1, 3), features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__max_df=0.75, clf__alpha=1 
[CV] features__text_pipeline__tfidf__ngram_range=(1, 1), features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__max_df=0.5, clf__alpha=0.5 
[CV] features__text_pipeline__tfidf__ngram_range=(1, 1), features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__max_df=0.5, clf__alpha=0.5 
[CV] features__text_pipeline__tfidf__ngram_range=(1, 1), features__text_pipeline__tfidf__max_features=

25989  0.000  0.000  0.000  1.000  0.000  0.803  0.000  0.197  0.000...)
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,            id                                              tweet  \
0           1                Jazz for a Rainy Afternoon:  {link}   
1           2                   RT: @mention: I love rainy days.   
2           3  Good Morning Chicago! Time to kick the Windy C...   
3           6  Preach l

7      0.000  0.193  0.000  0.000  0.807  1.000  0.000  0.000  0.000  0.00  ...)
________________________________________________fit_transform_one - 7.6s, 0.1min
________________________________________________fit_transform_one - 7.5s, 0.1min
________________________________________________fit_transform_one - 7.5s, 0.1min
_______________________________________________fit_transform_one - 29.5s, 0.5min
_______________________________________________fit_transform_one - 29.4s, 0.5min
_______________________________________________fit_transform_one - 32.6s, 0.5min
_______________________________________________fit_transform_one - 32.7s, 0.5min
_______________________________________________fit_transform_one - 32.8s, 0.5min
[CV]  features__text_pipeline__tfidf__ngram_range=(1, 1), features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__max_df=0.5, clf__alpha=0.5, total= 1.1min
[CV] features__text_pipeline__tfidf__ngram_range=(1, 3), features__text_pipeline__tfidf_

[CV] features__text_pipeline__tfidf__ngram_range=(1, 3), features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__max_df=1.0, clf__alpha=1 
[CV] features__text_pipeline__tfidf__ngram_range=(1, 1), features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__max_df=1.0, clf__alpha=0.5 
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,           id                    

7      0.000  0.193  0.000  0.000  0.807  1.000  0.000  0.000  0.000...)
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,            id                                              tweet  \
25982   40117                      Warm weather..... awesome...!   
25983   40118  San Antonio, Texas Weather :: 71F A FEW CLOUDS...   
25984   40119  Morning.I shall be walking silently to the sho...   
25985   40120  hopes ev

________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,            id                                              tweet  \
25982   40117                      Warm weather..... awesome...!   
25983   40118  San Antonio, Texas Weather :: 71F A FEW CLOUDS...   
25984   40119  Morning.I shall be walking silently to the sho...   
25985   40120  hopes everyone had a great weekend, the warmer...   
25986   40124  #WEATHER:  11

25989  0.000  0.000  0.000  1.000  0.000  0.803  0.000  0.197  0.000...)
_______________________________________________fit_transform_one - 16.4s, 0.3min
_______________________________________________fit_transform_one - 16.6s, 0.3min
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,            id                                              tweet  \
0           1                Jazz for a Rainy Afternoon:  {link} 

7      0.000  0.193  0.000  0.000  0.807  1.000  0.000  0.000  0.000  0.00  ...)
_______________________________________________fit_transform_one - 15.8s, 0.3min
_______________________________________________fit_transform_one - 16.4s, 0.3min
[CV]  features__text_pipeline__tfidf__ngram_range=(1, 2), features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__max_df=0.5, clf__alpha=0.5, total= 1.3min
[CV] features__text_pipeline__tfidf__ngram_range=(1, 2), features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__max_df=1.0, clf__alpha=0.5 
[CV]  features__text_pipeline__tfidf__ngram_range=(1, 2), features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__max_df=0.5, clf__alpha=0.5, total= 1.3min
[CV]  features__text_pipeline__tfidf__ngram_range=(1, 2), features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__max_df=0.5, clf__alpha=0.5, total= 1.3min
[CV] features__text_pipeline__tfidf__ng

7      0.000  0.193  0.000  0.000  0.807  1.000  0.000  0.000  0.000...)
[CV]  features__text_pipeline__tfidf__ngram_range=(1, 2), features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__max_df=1.0, clf__alpha=1, total= 1.1min
________________________________________________fit_transform_one - 6.4s, 0.1min
________________________________________________fit_transform_one - 6.7s, 0.1min
________________________________________________fit_transform_one - 6.4s, 0.1min
_______________________________________________fit_transform_one - 12.8s, 0.2min
[CV]  features__text_pipeline__tfidf__ngram_range=(1, 2), features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__max_df=1.0, clf__alpha=0.5, total= 1.1min
[CV]  features__text_pipeline__tfidf__ngram_range=(1, 2), features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__max_df=1.0, clf__alpha=0.5, total= 1.1min
[CV]  features__text_pipeline__tfidf__ngram_range=(1, 

[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  4.9min finished


________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,            id                                              tweet  \
0           1                Jazz for a Rainy Afternoon:  {link}   
1           2                   RT: @mention: I love rainy days.   
2           3  Good Morning Chicago! Time to kick the Windy C...   
3           6  Preach lol! :) RT @mention: #alliwantis this t...   
4           9               

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=Pipeline(memory=Memory(cachedir='/var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib'),
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontran...t_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'clf__alpha': (0.5, 1), 'features__text_pipeline__tfidf__max_features': (None, 5000, 10000, 50000), 'features__text_pipeline__tfidf__max_df': (0.5, 0.75, 1.0), 'features__text_pipeline__tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True,
          scoring=<function custom_scorer at 0x11ef27ea0>, verbose=2)

In [63]:
random_search_final.best_score_ 

0.16527139989685405

In [64]:
random_search_final.best_params_


{'clf__alpha': 0.5,
 'features__text_pipeline__tfidf__max_df': 1.0,
 'features__text_pipeline__tfidf__max_features': 10000,
 'features__text_pipeline__tfidf__ngram_range': (1, 1)}

Grid Search

In [65]:
grid_search_final = GridSearchCV(ml_pipeline, param_grid, cv=3
                                ,scoring=custom_scorer, verbose=2, n_jobs=-1)

In [66]:
grid_search_final.fit(X_train,y_train);


Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 1) 
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 1) 
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 1) 
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 2) 
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 2) 
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__m

25989  0.000  0.000  0.000  1.000  0.000  0.803  0.000  0.197  0.000...)
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,            id                                              tweet  \
0           1                Jazz for a Rainy Afternoon:  {link}   
1           2                   RT: @mention: I love rainy days.   
2           3  Good Morning Chicago! Time to kick the Windy C...   
3           6  Preach l

7      0.000  0.193  0.000  0.000  0.807  1.000  0.000  0.000  0.000  0.00  ...)
________________________________________________fit_transform_one - 6.7s, 0.1min
________________________________________________fit_transform_one - 6.8s, 0.1min
________________________________________________fit_transform_one - 6.7s, 0.1min
_______________________________________________fit_transform_one - 15.8s, 0.3min
_______________________________________________fit_transform_one - 15.8s, 0.3min
_______________________________________________fit_transform_one - 15.7s, 0.3min
_______________________________________________fit_transform_one - 28.8s, 0.5min
_______________________________________________fit_transform_one - 28.7s, 0.5min
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 1), total=  59.1s
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline

[Memory]    2.1s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/b7ce172f6577a45aed5aa759c334064a
[Memory]    2.2s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/bf50a587ad165f97f263877932f6b0d2
___________________________________fit_transform_one cache loaded - 7.7s, 0.1min
___________________________________fit_transform_one cache loaded - 7.6s, 0.1min
___________________________________fit_transform_one cache loaded - 7.3s, 0.1min
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 1), total=  56.6s
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 1) 
_______________

[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 2) 
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,            id                                              tweet  \
25982   40117                      Warm weather..... awesome...!   
25983   40118  San Antonio, Texas Weather :: 71F A FEW CLOUDS..

25989  0.000  0.000  0.000  1.000  0.000  0.803  0.000  0.197  0.000...)
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,            id                                              tweet  \
0           1                Jazz for a Rainy Afternoon:  {link}   
1           2                   RT: @mention: I love rainy days.   
2           3  Good Morning Chicago! Time to kick the Windy C...   
3           6  Preach l

25989  0.000  0.000  0.000  1.000  0.000  0.803  0.000  0.197  0.000...)
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 2), total= 1.3min
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 2) 
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, 

[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  4.8min


[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 1), total=  48.7s
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 3) 
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,           id   

7      0.000  0.193  0.000  0.000  0.807  1.000  0.000  0.000  0.000...)
_______________________________________________fit_transform_one - 15.2s, 0.3min
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 3), total= 1.5min
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 1) 
_______________________________________________fit_transform_one - 28.5s, 0.5min
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x

25989  0.000  0.000  0.000  1.000  0.000  0.803  0.000  0.197  0.000...)
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 1), total=  54.8s
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 2) 
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 2), total= 1.4min
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 2) 
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline',

[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 2), total= 1.4min
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 1) 
[Memory]    2.2s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/234b3072dc9fbc0debc97191a6f16f5a
___________________________________fit_transform_one cache loaded - 0.7s, 0.0min
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 2), total= 1.4min
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 1), total=  49.4s
[CV

[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 3) 
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,            id                                              tweet  \
25982   40117                      Warm weather..... awesome...!   
25983   40118  San Antonio, Texas Weather :: 71F A FEW CLOUDS..

25989  0.000  0.000  0.000  1.000  0.000  0.803  0.000  0.197  0.000...)
_______________________________________________fit_transform_one - 26.3s, 0.4min
_______________________________________________fit_transform_one - 26.3s, 0.4min
________________________________________________fit_transform_one - 6.8s, 0.1min
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 1), total=  49.4s
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 1) 
_______________________________________________fit_transform_one - 25.7s, 0.4min
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     

7      0.000  0.193  0.000  0.000  0.807  1.000  0.000  0.000  0.000  0.00  ...)
________________________________________________fit_transform_one - 6.8s, 0.1min
_______________________________________________fit_transform_one - 14.2s, 0.2min
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 1), total=  55.4s
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 2) 
_______________________________________________fit_transform_one - 14.4s, 0.2min
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransfor

7      0.000  0.193  0.000  0.000  0.807  1.000  0.000  0.000  0.000...)
_______________________________________________fit_transform_one - 14.0s, 0.2min
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 3), total= 1.3min
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 1) 
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nva

25989  0.000  0.000  0.000  1.000  0.000  0.803  0.000  0.197  0.000...)
________________________________________________fit_transform_one - 7.0s, 0.1min
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 1), total= 1.0min
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 2) 
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 2), total= 1.3min
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 2) 
_______________________________________________fit_transform_one - 16.7s, 0.3min
_______________________________________________________

25989  0.000  0.000  0.000  1.000  0.000  0.803  0.000  0.197  0.000...)
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,            id                                              tweet  \
0           1                Jazz for a Rainy Afternoon:  {link}   
1           2                   RT: @mention: I love rainy days.   
2           3  Good Morning Chicago! Time to kick the Windy C...   
3           6  Preach l

25989  0.000  0.000  0.000  1.000  0.000  0.803  0.000  0.197  0.000...)
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 2), total= 1.4min
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 2) 
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 2), total= 1.4min
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 2) 
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline',

7      0.000  0.193  0.000  0.000  0.807  1.000  0.000  0.000  0.000  0.00  ...)
_______________________________________________fit_transform_one - 28.2s, 0.5min
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 3), total= 1.7min
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 3), total= 1.7min
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 3) 
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 1) 
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=50000, features__tex

[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 2) 
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,           id                                              tweet  \
0          1                Jazz for a Rainy Afternoon:  {link}   
1          2                   RT: @mention: I love rainy days.   

7      0.000  0.193  0.000  0.000  0.807  1.000  0.000  0.000  0.000  0.00  ...)
_______________________________________________fit_transform_one - 27.1s, 0.5min
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 1), total=  56.9s
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 3) 
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 3), total= 1.7min
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 1) 
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one..

[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 3), total= 1.5min
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 1) 
___________________________________fit_transform_one cache loaded - 7.7s, 0.1min
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df

25989  0.000  0.000  0.000  1.000  0.000  0.803  0.000  0.197  0.000...)
________________________________________________fit_transform_one - 6.6s, 0.1min
_______________________________________________fit_transform_one - 16.4s, 0.3min
[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 2), total= 1.1min
[CV] clf__alpha=0.5, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 2) 
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x

[CV]  clf__alpha=0.5, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 3), total= 1.2min
[CV] clf__alpha=1, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 1) 
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,            id     

[CV] clf__alpha=1, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 3) 
[Memory]    2.5s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/e7f1aa8f1e2259738d0871b90dce3302
[CV]  clf__alpha=1, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 1), total=  44.9s
[CV] clf__alpha=1, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 3) 
[Memory]    2.5s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/b2b90a3221171462ef7072138f0315d3
___________________________________fit_transform_one cache loaded - 9.0s, 0.2min
________________

[Memory]    1.9s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/25cd020e817b6edfbe2fcd55456aeaa5
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min
[CV]  clf__alpha=1, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 3), total= 1.2min
[CV] clf__alpha=1, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 1) 
[Memory]    1.9s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/41fbe08c06daf53c25c62d27fdb722f9
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min
[CV]  clf__alpha=1, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_

___________________________________fit_transform_one cache loaded - 3.3s, 0.1min
[CV]  clf__alpha=1, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 1), total=  41.6s
[CV] clf__alpha=1, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=50000, features__text_pipeline__tfidf__ngram_range=(1, 3) 
___________________________________fit_transform_one cache loaded - 3.4s, 0.1min
[Memory]    2.0s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/e311f61beacb872fc72f14e0bbbaa258
[CV]  clf__alpha=1, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 3), total=  57.5s
[CV]  clf__alpha=1, features__text_pipeline__tfidf__max_df=0.5, features__text_pipeline__tfidf__max_features=100

[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 23.5min


[CV] clf__alpha=1, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 1) 
__________________________________fit_transform_one cache loaded - 10.4s, 0.2min
[Memory]    2.6s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/9ad3056fbcde6a1e6c6f19aa0ebe8ac7
[Memory]    2.3s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/e3d0fc31b29b110c77743b32f4f0f265
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min
___________________________________fit_transform_one cache loaded - 8.5s, 0.1min
[CV]  clf__alpha=1, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 1), total=  47.2s
[CV] clf__alpha=1,

___________________________________fit_transform_one cache loaded - 2.9s, 0.0min
[CV]  clf__alpha=1, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 2), total=  51.5s
[CV] clf__alpha=1, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 2) 
[Memory]    1.9s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/faf067639f3f7d0a287de3af1652af26
___________________________________fit_transform_one cache loaded - 2.8s, 0.0min
[CV]  clf__alpha=1, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 3), total=  57.2s
[CV] clf__alpha=1, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=10

[Memory]    2.0s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/bcc0ad5ccecebf102b6d22c6e09bb937
[Memory]    1.9s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/6d59cf4a10c74c98c5c02d1e1d9786c3
[CV]  clf__alpha=1, features__text_pipeline__tfidf__max_df=0.75, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 3), total= 1.1min
[CV] clf__alpha=1, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 1) 
[Memory]    2.0s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/f5ee8c2debb2b1c855d0748a7d10dd67
___________________________________fit_transform_one cache loaded 

[CV] clf__alpha=1, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 2) 
___________________________________fit_transform_one cache loaded - 0.4s, 0.0min
[Memory]    1.7s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/3171a036a27df2068dae0b4869e0825f
[CV]  clf__alpha=1, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=None, features__text_pipeline__tfidf__ngram_range=(1, 2), total=  56.0s
[CV] clf__alpha=1, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=5000, features__text_pipeline__tfidf__ngram_range=(1, 2) 
___________________________________fit_transform_one cache loaded - 2.6s, 0.0min
[Memory]    1.9s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/skle

[Memory]    2.0s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/06ca9c6ddb6f77976e41e964a35a6484
___________________________________fit_transform_one cache loaded - 6.9s, 0.1min
[CV]  clf__alpha=1, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 1), total=  42.0s
[CV] clf__alpha=1, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max_features=10000, features__text_pipeline__tfidf__ngram_range=(1, 3) 
[Memory]    2.1s, 0.0min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/6c1ccbb33bc7229dad1b6a7ffc99e989
___________________________________fit_transform_one cache loaded - 7.2s, 0.1min
[CV]  clf__alpha=1, features__text_pipeline__tfidf__max_df=1.0, features__text_pipeline__tfidf__max

[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed: 32.1min finished


________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, p...nvariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0))]))],
       transformer_weights=None), 
None,            id                                              tweet  \
0           1                Jazz for a Rainy Afternoon:  {link}   
1           2                   RT: @mention: I love rainy days.   
2           3  Good Morning Chicago! Time to kick the Windy C...   
3           6  Preach lol! :) RT @mention: #alliwantis this t...   
4           9               

In [67]:
grid_search_final.cv_results_

{'mean_fit_time': array([  57.92931962,   85.71529706,  109.21862721,   54.30048585,
          63.78980629,   68.02425591,   61.86378042,   77.24804926,
          86.57167586,   47.34535297,   77.27189843,  101.12304123,
          55.9948225 ,   82.28364205,   77.59160328,   47.38702027,
          65.7147247 ,   77.73662623,   53.98978662,   74.79910962,
          90.09771268,   58.74133627,   81.24825629,   98.36368736,
          48.71032596,   78.03168535,   97.29771169,   53.54094275,
          56.17054566,   83.95942434,   50.86479902,   62.88117266,
          69.55876835,   59.6170524 ,   90.46728468,  113.94268044,
          44.86507559,   61.37211482,   71.13301388,   43.38858827,
          47.71288927,   53.26783808,   41.49842699,   50.31380407,
          53.78443694,   40.45612057,   57.94502211,   67.49982945,
          41.67899831,   61.45649735,   70.29114072,   41.35244735,
          48.3945144 ,   53.54619304,   42.38895782,   55.04562895,
          62.78277405,   45.810

In [68]:
grid_search_final.best_score_ 

0.16527156408818214

In [69]:
grid_search_final.best_params_


{'clf__alpha': 0.5,
 'features__text_pipeline__tfidf__max_df': 0.5,
 'features__text_pipeline__tfidf__max_features': 10000,
 'features__text_pipeline__tfidf__ngram_range': (1, 1)}

# Best Model Fit

We got the best model parameters...
We can feed those params to our model

In [70]:
#set model to the best parameters
ml_pipeline.set_params(**grid_search_final.best_params_)

ml_pipeline.fit(X_train, y_train)

[Memory]25326.2s, 422.1min: Loading _fit_transform_one from /var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib/sklearn/pipeline/_fit_transform_one/60cb625d784336162213cc3a5f4fff5d
___________________________________fit_transform_one cache loaded - 0.3s, 0.0min


Pipeline(memory=Memory(cachedir='/var/folders/3d/419lqnj12c12g6cz4gvfjpzr0000gn/T/tmpy_1wgph4/joblib'),
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('get_text', Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text_cols at 0x11ef27bf8>, inv_kw_args=...it_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

In [79]:
def prepare_submission(pipeline, test_df):
    X = test_df
    predictions = pipeline.predict(X)
    
    # clip predictions to 0 - 1 
    predictions= np.clip(predictions,a_min=0,a_max=1)
    predictions[:,0:5] = normalize(predictions[:,0:5],norm='l1')
    predictions[:,5:9] = normalize(predictions[:,5:9],norm='l1')

    
    predictions_with_id = np.array(np.hstack([np.matrix(test_df['id']).T, predictions])) 

    df2 = pd.DataFrame(data=predictions_with_id, columns = submission_df.columns)
    df2['id'] = df2['id'].apply(int)
    #df2.to_csv('decision_tree_clipping.csv.gz',index=False, compression='gzip' )

    return df2


In [80]:
predictions_df  = prepare_submission(ml_pipeline,test_df)

In [81]:
predictions_df.head()

Unnamed: 0,id,s1,s2,s3,s4,s5,w1,w2,w3,w4,...,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15
0,4,0.15417,0.070399,0.239984,0.396625,0.138821,0.71704,0.061181,0.221779,0.0,...,0.0,0.378085,0.0,0.017459,0.0,0.0,0.0,0.983876,0.0,0.006567
1,5,0.02882,0.0,0.700449,0.050447,0.220284,0.319604,0.38613,0.124363,0.169904,...,0.0,0.101319,0.000913,0.0,0.06034,0.0,0.789838,0.044056,0.034122,0.001901
2,7,0.126428,0.226436,0.250252,0.005471,0.391413,0.313201,0.157639,0.31546,0.2137,...,0.0,0.680008,0.0,0.059049,0.0,0.126116,0.142295,0.094789,0.01122,0.0
3,8,0.00471,0.015946,0.945525,0.0,0.033819,0.886991,0.031734,0.068891,0.012383,...,0.0,0.149415,0.001108,0.221851,0.00338,0.0,0.022805,0.037947,0.0,0.614244
4,12,0.016823,0.255462,0.282913,0.077193,0.36761,0.355046,0.235713,0.288336,0.120904,...,0.001002,0.478992,0.008492,0.036766,0.244263,0.012653,0.086399,0.13173,0.0,0.003996


In [82]:
predictions_df.to_csv('decision_tree_pipeline_grid_state.csv.gz',index=False, compression='gzip' )