pipeline:
1) lets you properly cross-validate a whole process, rather than just a model (i.e. the pre-processing as well as the model building)
2) lets you run a grid search / randomized search over the tuning parameters of both the model and the preprocessing steps. For example, you may want to check model parameters plus the parameters of the strategy for handling missing values; a single grid search can check both!

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [101]:
# Load the Titanic training set (fetched over the network on every run).
df = pd.read_csv('http://bit.ly/kaggletrain')

# Quick inventory: column names, missing values per column, overall size.
missing_per_column = df.isna().sum()
print(df.columns)
print(missing_per_column)
print('shape:', df.shape)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
shape: (891, 12)


In [102]:
# Keep only a handful of columns, for teaching purposes:
#   Survived -> target, Embarked -> port the passenger embarked from.
kept_columns = ['Survived', 'Pclass', 'Sex', 'Embarked']
# Rows whose Embarked value is missing are excluded.
embarked_known = df['Embarked'].notna()
df = df.loc[embarked_known, kept_columns]
print('shape: ', df.shape)
df.head()

shape:  (889, 4)


Unnamed: 0,Survived,Pclass,Sex,Embarked
0,0,3,male,S
1,1,1,female,C
2,1,3,female,S
3,1,1,female,S
4,0,3,male,S


In [103]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder

# Baseline: cross-validate a model that uses only Pclass.
X = df.loc[:, ['Pclass']]
y = df.Survived
print(X.shape)
print(y.shape)
logreg = LogisticRegression(solver='lbfgs')
# 5-fold cross-validation; report the mean accuracy over the folds.
print(cross_val_score(logreg, X, y, cv=5, scoring='accuracy').mean())
# Compare with the null accuracy, i.e. the accuracy we would get by always
# predicting the most frequent class.
print(y.value_counts(normalize=True))

# Next we want to include more columns (Sex and Embarked). How to encode them?
# With dummy / one-hot encoding (the same thing).
# NOTE: this overwrites df["Sex"] in place, so the cell is not re-runnable
# without reloading df: a second run maps the 0/1 codes (which are not keys
# of the dict) to NaN.
df["Sex"] = df["Sex"].map({'male': 0, 'female': 1})
# The encoder has fit/transform methods, plus fit_transform to do both at once.
# The old `sparse=False` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so neither spelling works everywhere.
# Using the default (sparse) output and densifying with .toarray() yields the
# same dense array on old and new releases alike.
ohe = OneHotEncoder()
y1 = ohe.fit_transform(df[["Sex"]]).toarray()


(889, 1)
(889,)
0.6783406335301212
0    0.617548
1    0.382452
Name: Survived, dtype: float64


In [104]:
# Show the distinct ports before recoding them.
print(df.Embarked.unique())
# Integer codes for the ports; they act purely as labels (note that 2 is unused).
port_codes = {'C': 0, 'S': 1, 'Q': 3}
y2 = df["Embarked"].map(port_codes)
df["Embarked"] = y2
print(df['Embarked'].head(10))


['S' 'C' 'Q']
0    1
1    0
2    1
3    1
4    1
5    3
6    1
7    1
8    1
9    0
Name: Embarked, dtype: int64


In [105]:
# Start again from the raw data and redefine X.
df = pd.read_csv('http://bit.ly/kaggletrain')  # titanic dataset
target_and_features = ['Survived', 'Pclass', 'Sex', 'Embarked']
# Rows whose Embarked value is missing are excluded.
df = df.loc[df['Embarked'].notna(), target_and_features]
print('shape: ', df.shape)
# Everything except the target column is a feature.
X = df.drop(columns='Survived')
X.head(3)

shape:  (889, 4)


Unnamed: 0,Pclass,Sex,Embarked
0,3,male,S
1,1,female,C
2,3,female,S


In [106]:
# One-hot encode Sex and Embarked after mapping their labels to integer codes.
# NOTE: both columns are overwritten in place, so this cell is not re-runnable
# without reloading df: a second run maps the integer codes (which are not
# keys of the dicts) to NaN.
#
# The encoders have fit/transform methods, plus fit_transform to do both at
# once. The old `sparse=False` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so we keep the default (sparse) output
# and densify with .toarray(), which gives the same dense array on every release.
df["Sex"] = df["Sex"].map({'male': 0, 'female': 1})
ohe = OneHotEncoder()
y1 = ohe.fit_transform(df[["Sex"]]).toarray()  # columns: male (0), female (1)

df["Embarked"] = df["Embarked"].map({'C': 0, 'S': 1, 'Q': 3})
ohe2 = OneHotEncoder()
y2 = ohe2.fit_transform(df[["Embarked"]]).toarray()  # columns: C (0), S (1), Q (3)
print(y1, '///', y2)

[[1. 0.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [1. 0.]
 [1. 0.]] /// [[0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 ...
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [130]:
# Inspect the pieces that will be stitched together into the feature matrix.
X = df.drop("Survived", axis=1)
X.head()  # result is discarded: only a cell's last expression is displayed
print(X.shape)

# Pclass is kept as a single numeric column (2-D, so it can sit next to y1/y2).
y3 = np.array(df[["Pclass"]])

# Report shape and container type of each encoded block, in order.
for encoded_block in (y1, y2, y3):
    print(encoded_block.shape)
    print(type(encoded_block))

(889, 3)
(889, 2)
<class 'numpy.ndarray'>
(889, 3)
<class 'numpy.ndarray'>
(889, 1)
<class 'numpy.ndarray'>


In [134]:
# Place the encoded Sex columns, the encoded Embarked columns and Pclass
# side by side to form the final feature matrix.
feature_blocks = (y1, y2, y3)
X = np.hstack(feature_blocks)
print(X)

[[1. 0. 0. 1. 0. 3.]
 [0. 1. 1. 0. 0. 1.]
 [0. 1. 0. 1. 0. 3.]
 ...
 [0. 1. 0. 1. 0. 3.]
 [1. 0. 1. 0. 0. 1.]
 [1. 0. 0. 0. 1. 3.]]


In [142]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Check that the feature matrix and the target have matching row counts.
for data in (X, y):
    print(data.shape)

# Same model and 5-fold setup as the Pclass-only baseline, now on all features.
logreg = LogisticRegression(solver='lbfgs')
fold_accuracies = cross_val_score(logreg, X, y, cv=5, scoring='accuracy')
print(fold_accuracies.mean())
# With the encoded Sex and Embarked columns included, accuracy went way up
# compared with the Pclass-only model.

(889, 6)
(889,)
0.7727924839713071


In [None]:
# So, in newer versions of sklearn, the feature-encoding step is done through a pipeline,
# which can be treated just like a model.