Do the imports. Packages to import are 
> pandas  
> OneHotEncoder  
> LogisticRegression  
> MakeColumnTransformer  
> make_pipeline 
> CountVectorizer

In [235]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer

Make the list of columns we want in the model, the 'Parch' (the number of parents or siblings they had on the trip), the amount they paid to get on the boat, the port they got on the boat at, and thier gender. 

In [236]:
cols = ['Parch', 'Fare', 'Embarked', 'Sex']

Read the data into a Pandas DateFrame from the website abbreviated version of the kaggle website. Import the \
training set. Then assign the columns and the target variable to X and y, respectively. 

In [237]:
url = 'http://bit.ly/kaggletrain'
df = pd.read_csv(url, nrows=10)

In [238]:
X = df[cols]
y = df['Survived']

Download and assign the testing set to the X_new variable.

In [239]:
url_test = 'http://bit.ly/kaggletest'
df_new = pd.read_csv(url_test, nrows=10)
X_new = df_new[cols]

Instantiate the OneHotEncoder.

In [240]:
ohe = OneHotEncoder()
ohe.fit(df[['Embarked','Sex']])
ohe.categories_

[array(['C', 'Q', 'S'], dtype=object), array(['female', 'male'], dtype=object)]

Construct and instatiate the column transformer. 

In [241]:
ct = make_column_transformer(
        (ohe, ['Embarked', 'Sex']),
        remainder='passthrough')

Run the column transformer to test.

In [242]:
ct.fit_transform(X)

array([[ 0.    ,  0.    ,  1.    ,  0.    ,  1.    ,  0.    ,  7.25  ],
       [ 1.    ,  0.    ,  0.    ,  1.    ,  0.    ,  0.    , 71.2833],
       [ 0.    ,  0.    ,  1.    ,  1.    ,  0.    ,  0.    ,  7.925 ],
       [ 0.    ,  0.    ,  1.    ,  1.    ,  0.    ,  0.    , 53.1   ],
       [ 0.    ,  0.    ,  1.    ,  0.    ,  1.    ,  0.    ,  8.05  ],
       [ 0.    ,  1.    ,  0.    ,  0.    ,  1.    ,  0.    ,  8.4583],
       [ 0.    ,  0.    ,  1.    ,  0.    ,  1.    ,  0.    , 51.8625],
       [ 0.    ,  0.    ,  1.    ,  0.    ,  1.    ,  1.    , 21.075 ],
       [ 0.    ,  0.    ,  1.    ,  1.    ,  0.    ,  2.    , 11.1333],
       [ 1.    ,  0.    ,  0.    ,  1.    ,  0.    ,  0.    , 30.0708]])

Create the logistic Regression model and assign a solver for small data sets. 

In [243]:
logreg = LogisticRegression(solver='liblinear', random_state=1)

Make a pipeline and the fit the new model and make predictions.

In [244]:
pipe = make_pipeline(ct, logreg)

In [245]:
pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories=None,
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                n_values=None,
                                                                sparse=True),
                                                  ['Embarked', 'Sex'])],
                      

Examine the coefficients.

In [246]:
pipe.named_steps.logisticregression.coef_

array([[ 0.26491287, -0.19848033, -0.22907928,  1.0075062 , -1.17015293,
         0.20056557,  0.01597307]])

In [247]:
pipe.predict(X_new)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])

Now use text data. First immport the text tranformer module, instantiate it, and vectorize the variable 'Name' into a document-test matrix. Remember that CountVectorizer expects one-dimensional input.

In [248]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
dtm = vect.fit_transform(df['Name'])

Examine the feature names.

In [249]:
print(vect.get_feature_names())

['achem', 'adele', 'allen', 'berg', 'bradley', 'braund', 'briggs', 'cumings', 'elisabeth', 'florence', 'futrelle', 'gosta', 'harris', 'heath', 'heikkinen', 'henry', 'jacques', 'james', 'john', 'johnson', 'laina', 'leonard', 'lily', 'master', 'may', 'mccarthy', 'miss', 'moran', 'mr', 'mrs', 'nasser', 'nicholas', 'oscar', 'owen', 'palsson', 'peel', 'thayer', 'timothy', 'vilhelmina', 'william']


Put the data into a DataFrame. 

In [250]:
pd.DataFrame(data=dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,achem,adele,allen,berg,bradley,braund,briggs,cumings,elisabeth,florence,...,nasser,nicholas,oscar,owen,palsson,peel,thayer,timothy,vilhelmina,william
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,1,0,1,1,0,1,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,0,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
9,1,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


Update X to include 'Name'

In [251]:
cols = ['Parch','Fare','Embarked','Sex','Name']
X = df[cols]

Update the ColumnTransformer without the passthrough to examine in a DataFrame. 

In [252]:
ct = make_column_transformer(
        (ohe, ['Embarked','Sex']),
        (vect, 'Name'))

In [253]:
pd.DataFrame(data=ct.fit_transform(X).toarray(), columns=ct.get_feature_names())

Unnamed: 0,onehotencoder__x0_C,onehotencoder__x0_Q,onehotencoder__x0_S,onehotencoder__x1_female,onehotencoder__x1_male,countvectorizer__achem,countvectorizer__adele,countvectorizer__allen,countvectorizer__berg,countvectorizer__bradley,...,countvectorizer__nasser,countvectorizer__nicholas,countvectorizer__oscar,countvectorizer__owen,countvectorizer__palsson,countvectorizer__peel,countvectorizer__thayer,countvectorizer__timothy,countvectorizer__vilhelmina,countvectorizer__william
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Make column transformer and add in the *passthrough* statement.

In [254]:
ct = make_column_transformer(
        (ohe, ['Embarked','Sex']),
        (vect, 'Name'),
         remainder='passthrough')

Run the ColumnTransformer

In [255]:
ct.fit_transform(X)

<10x47 sparse matrix of type '<class 'numpy.float64'>'
	with 78 stored elements in Compressed Sparse Row format>

Now, update the pipe.

In [256]:
pipe = make_pipeline(ct, logreg)

Fit the pipeline and examine the steps. 

In [257]:
pipe.fit(X, y);
# pipe.named_steps

Update the testing data frame to include the new column. 

In [258]:
X_new = df_new[cols]
X_new

Unnamed: 0,Parch,Fare,Embarked,Sex,Name
0,0,7.8292,Q,male,"Kelly, Mr. James"
1,0,7.0,S,female,"Wilkes, Mrs. James (Ellen Needs)"
2,0,9.6875,Q,male,"Myles, Mr. Thomas Francis"
3,0,8.6625,S,male,"Wirz, Mr. Albert"
4,1,12.2875,S,female,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)"
5,0,9.225,S,male,"Svensson, Mr. Johan Cervin"
6,0,7.6292,Q,female,"Connolly, Miss. Kate"
7,1,29.0,S,male,"Caldwell, Mr. Albert Francis"
8,0,7.2292,C,female,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)"
9,0,24.15,S,male,"Davies, Mr. John Samuel"


Make predictions.

In [259]:
pipe.predict(X_new)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])

## Part 5

Adding age. Introduces missing values problems. 

In [260]:
cols.append('Age')

In [261]:
cols

['Parch', 'Fare', 'Embarked', 'Sex', 'Name', 'Age']

In [262]:
X = df[cols]

In [263]:
#pipe.fit(X, y)

Throws error: ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

Options:  
dropna() -- This is good if there are only a small number of missing and you know that the process that lead to the missing data is totally random. If you do this do it before peeling off your X so you get the y's  

dropna(axis='columns') drops the variable, the whole column. So in this case it would drop 'Age'.  

The alternative is to impute missing values. 

Import the package that imputes values, instantiate it, and run it on the variable 'Age'. 

In [264]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer()
imp.fit_transform(X[['Age']]) #used default mean

array([[22.        ],
       [38.        ],
       [26.        ],
       [35.        ],
       [35.        ],
       [28.11111111],
       [54.        ],
       [ 2.        ],
       [27.        ],
       [14.        ]])

print out the value it used as the imputed value.

In [265]:
imp.statistics_

array([28.11111111])

Will replace any NaN with this, the average. 

In [266]:
ct = make_column_transformer(
        (ohe, ['Embarked','Sex']),
        (vect, 'Name'),
        (imp, ['Age']),
         remainder='passthrough')

In [267]:
pipe = make_pipeline(ct, logreg)

In [268]:
pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories=None,
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                n_values=None,
                                                                sparse=True),
                                                  ['Embarked', 'Sex']),
                       

In [269]:
pipe.named_steps

{'columntransformer': ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                   transformer_weights=None,
                   transformers=[('onehotencoder',
                                  OneHotEncoder(categorical_features=None,
                                                categories=None, drop=None,
                                                dtype=<class 'numpy.float64'>,
                                                handle_unknown='error',
                                                n_values=None, sparse=True),
                                  ['Embarked', 'Sex']),
                                 ('countvectorizer',
                                  CountVectorizer(analyzer='word', bin...
                                                  input='content',
                                                  lowercase=True, max_df=1.0,
                                                  max_features=None, min_df=1,
                  

Now modify the test data and make predictions

In [270]:
X_new = df_new[cols]
pipe.predict(X_new)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0])

If X_new has missing values, the value that will be imputed is the mean of age in X, not in X_new. It will fill in the value discovered from the training data, not the testing data. We are only allowed to learn from training data, not the testing data. This is analogous to the vectorizing and dummy values. 

Now we try a new imputer that in addition to imputing values to missing data, creates a binary variable indicating whether the value is imputed. This can be used as a new variable on the theory that the process which lead to data being missing was non-random and causally relevant to the process under investigation. 

In [271]:
imp_indicator = SimpleImputer(add_indicator=True)

In [272]:
imp_indicator.fit_transform(X[['Age']])

array([[22.        ,  0.        ],
       [38.        ,  0.        ],
       [26.        ,  0.        ],
       [35.        ,  0.        ],
       [35.        ,  0.        ],
       [28.11111111,  1.        ],
       [54.        ,  0.        ],
       [ 2.        ,  0.        ],
       [27.        ,  0.        ],
       [14.        ,  0.        ]])

Note the 1 where the value was imputed. This is useful when you think that having 'missingness' is a predictive feature itself. 'Missing not at random', MNAR, is common and my be useful to use as a variable. There is knn imputer and other imputers but they are more complex. 

## Part 6

Switch to the full data set to introduce more NaN problems. Import the whole data set. 

In [273]:
df = pd.read_csv('http://bit.ly/kaggletrain')

In [274]:
df.shape

(891, 12)

And the testing data.

In [275]:
df_new = pd.read_csv('http://bit.ly/kaggletest')

In [276]:
df_new.shape

(418, 11)

Has only 11 columns because it doesn't have the target value.

Count up the number of values that are missing.

In [277]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Embarked has missing values in the training set but not in the testing set. 

In [278]:
df_new.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

`Fare` has missing values in the testing set but not in the training set. 

Make the X and y variables with the new variables.

In [279]:
X = df[cols]
y = df['Survived']

Update the column transformer with the new variable, Age, in it.

In [280]:
ct = make_column_transformer(
        (ohe, ['Embarked','Sex']),
        (vect, 'Name'),
        (imp, ['Age']),
         remainder='passthrough')

In [281]:
# ct.fit_transform(X, y)  Fails because Embarked has NaN positions

The above code fails because there are missing values in the Embarked variable. So first imput a constant. There is no way to 'average' the values of a categorical variable so we have to impute a constant. 

In [282]:
imp_constant = SimpleImputer(strategy='constant', fill_value='missing')

This is imputing values. It is just imputing the value 'missing', not trying to guess what the real value would have been. 

Now we create a new pipeline that only imputes missing values and then creates the dummy variable.

In [283]:
imp_ohe = make_pipeline(imp_constant, ohe)

Now we have made a pipeling that only has imputation and one_hot_encoding, a transformer only pipeline. 

In [284]:
imp_ohe.fit_transform(X[['Embarked']])

<891x4 sparse matrix of type '<class 'numpy.float64'>'
	with 891 stored elements in Compressed Sparse Row format>

It uses constant imputation on Embarked and then passes the result to the one-hot-encoder. 

Rule for transformer: all pipeline steps except the final step must be transformers. The last step can be either a transformer or a model.  

Now we modify the pipeline to have the ohe be an imp_ohe. Embarked has missing values but Sex does not. But even though the Sex variable does not have any missing values in the training set it has some in the texting set. Passing it to the `Sex` variable to the one_hot_encoder will make it be coded when the missing values are encountered in the testing data set. 

In [285]:
ct = make_column_transformer(
        (imp_ohe, ['Embarked','Sex']),
        (vect, 'Name'),
        (imp, ['Age']),
         remainder='passthrough')

In [286]:
ct.fit_transform(X)

<891x1518 sparse matrix of type '<class 'numpy.float64'>'
	with 7328 stored elements in Compressed Sparse Row format>

The second problem we needed to solve is that 'Fare' has missing values in test but did not have them in train. That is an easy problem to fix. We just add 'Fare' into the imputer and the imputer learns an imputation value from the training data, even though it will only use them on the test data. 

In [287]:
ct = make_column_transformer(
        (imp_ohe, ['Embarked','Sex']),
        (vect, 'Name'),
        (imp, ['Age', 'Fare']),
         remainder='passthrough')

In [288]:
ct.fit_transform(X)

<891x1518 sparse matrix of type '<class 'numpy.float64'>'
	with 7328 stored elements in Compressed Sparse Row format>

In [289]:
pipe = make_pipeline(ct, logreg)
pipe.fit(X, y);

In [290]:
X_new = df_new[cols]
pipe.predict(X_new)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [291]:
#imputed values for age and fare
ct.named_transformers_.simpleimputer.statistics_

array([29.69911765, 32.20420797])

So the average age of the people in the data set was 29.7 and the average fare was 32.2.

## Recap at http://bit.ly/complex-pipeline

Why wouldn't we do all these data transformations in Pandas and save sklearn for the modeling phase. There are 3 drawbacks.  

1) you can't use CountVectorizer with Pandas.  

2) you could do the one-hot-encoding in pandas and add them back in when you get into sklearn.  

3) If you do the dummy variables in Pandas then you are going to have 'data leakage' in the next stage when you split into X_train and X_test. Your model evaluation procedure will not be a simulation of reality. Your imputation values will be learned from the entire data set. You will have learned from what is eventually be the testing set. Your model has to learn only from the training set. This will happen with all kinds of transformations if you do the transformations in Pandas. 

Data leakage: learning from the testing data that you are not allowed to know.  

sklearn prevents that by having separate steps for all the transformers that can operate on just the training data.  

Aren't there column transformations that you can't easily do in sklearn. Like multiplying two columns together.  

# Part 7 

Re-evaluating the model with cross-validation. We should have been doing this all through but we only had 5 cases. 

In [292]:
from sklearn.model_selection import cross_val_score

In [293]:
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

0.8114787748152608

Running cross val score on the pipeline it splits the data leaving 4/5 to the transformations, training and estimating the model and then do another split. It always does the transformations after the split to avoid data leakage. 

### Tuning

You might want to tune the logistic regression hyper parameters and the transformations. To do this we use grid-search. 'hyper-parameters' are set by you, 'parameters' are learned by the model. Since we are tuning a pipeline we need the parameter names. We have to have the parameter names that are used in the pipeline so we get them from the pipe. 

In [294]:
pipe.named_steps.keys()

dict_keys(['columntransformer', 'logisticregression'])

Now we make a dictionary called 'params' and add as the values the values we want to try for that parameter. We will tune the 'penalty' and the C parameter for the logistic regression. The penalty takes the values l1 (the letter 'l' and the number 1) and l2.  

In [295]:
params = {}
params['logisticregression__penalty'] = ['l1', 'l2']
params['logisticregression__C'] = [0.1, 1, 10]

Import the grid search function from sklearn.

In [296]:
from sklearn.model_selection import GridSearchCV

Note that it looks like cross_val_score. We specify the pipe, the parameters, the number of 'folds' we want it to cross-validata, and the scoring method for evaluating the models. Set up the grid-search object and fit the model.

In [297]:
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X, y);

Store the results in a data frame and sort the values according to the test score.

In [298]:
results = pd.DataFrame(grid.cv_results_)
results.sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logisticregression__C,param_logisticregression__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.032313,0.006308,0.008949,0.001055,10.0,l1,"{'logisticregression__C': 10, 'logisticregress...",0.821229,0.821229,0.820225,0.792135,0.853107,0.821549,0.019263,1
2,0.023507,0.000808,0.008263,0.000247,1.0,l1,"{'logisticregression__C': 1, 'logisticregressi...",0.815642,0.821229,0.797753,0.792135,0.847458,0.814815,0.019519,2
3,0.022361,0.000882,0.00809,0.000173,1.0,l2,"{'logisticregression__C': 1, 'logisticregressi...",0.798883,0.826816,0.803371,0.786517,0.841808,0.811448,0.019987,3
5,0.025153,0.004034,0.013276,0.009856,10.0,l2,"{'logisticregression__C': 10, 'logisticregress...",0.782123,0.804469,0.808989,0.797753,0.853107,0.809203,0.023684,4
1,0.021262,0.000266,0.008359,0.000336,0.1,l2,"{'logisticregression__C': 0.1, 'logisticregres...",0.798883,0.804469,0.764045,0.775281,0.80791,0.790123,0.017305,5
0,0.023885,0.00332,0.010132,0.002394,0.1,l1,"{'logisticregression__C': 0.1, 'logisticregres...",0.787709,0.804469,0.764045,0.758427,0.79661,0.782267,0.018048,6


Tuning transformers

In [299]:
pipe.named_steps.columntransformer

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('pipeline',
                                 Pipeline(memory=None,
                                          steps=[('simpleimputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value='missing',
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0)),
                                                 ('onehotencoder',
                                                  OneHotEncoder(categorical_features=None,
                                                             

Adding `.named_transformers_` provides a more compact output.

In [300]:
pipe.named_steps.columntransformer.named_transformers_

{'pipeline': Pipeline(memory=None,
          steps=[('simpleimputer',
                  SimpleImputer(add_indicator=False, copy=True,
                                fill_value='missing', missing_values=nan,
                                strategy='constant', verbose=0)),
                 ('onehotencoder',
                  OneHotEncoder(categorical_features=None, categories=None,
                                drop=None, dtype=<class 'numpy.float64'>,
                                handle_unknown='error', n_values=None,
                                sparse=True))],
          verbose=False),
 'countvectorizer': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                 lowercase=True, max_df=1.0, max_features=None, min_df=1,
                 ngram_range=(1, 1), preprocessor=None, stop_words=None,
                 strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
      

With making a dummy variable you have the option to drop the first category. markham recommends not doing it because 'reasons'. But here in the parameters we want to send to grid_search, we want to try both to see if it makes any difference. So we run the model once with all the categories for the dummy variables and once dropping the first category. 

In [301]:
params['columntransformer__pipeline__onehotencoder__drop'] = [None, 'first']

Test whether n_grams make a difference by specifying the model with tokenizing words or pairs of words.

In [302]:
params['columntransformer__countvectorizer__ngram_range'] = [(1,1), (1,2)]

Test whether adding an indicator of having imputed a value or not makes a difference. 

In [303]:
params['columntransformer__simpleimputer__add_indicator'] = [False, False]

Inspect the final product by printing out the params.

In [311]:
params

{'logisticregression__penalty': ['l1', 'l2'],
 'logisticregression__C': [0.1, 1, 10],
 'columntransformer__pipeline__onehotencoder__drop': [None, 'first'],
 'columntransformer__countvectorizer__ngram_range': [(1, 1), (1, 2)],
 'columntransformer__simpleimputer__add_indicator': [False, False]}

You can make this more elegant by giving the steps custom names by using 'Pipeline' instead of 'make_pipeline' and 'ColumnTransformer' instead of 'make_column_transformer'. 

In [312]:
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=Tru

In [313]:
grid.best_score_

0.8282828282828283

In [314]:
grid.best_params_

{'columntransformer__countvectorizer__ngram_range': (1, 2),
 'columntransformer__pipeline__onehotencoder__drop': None,
 'columntransformer__simpleimputer__add_indicator': False,
 'logisticregression__C': 10,
 'logisticregression__penalty': 'l1'}

In [315]:
grid.predict(X_new)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,

Grid automatically choses the best parameters and uses those for the predictions.

The double underscore signals ? Because we need to distinguish between pipeline steps and parameter names. 