# Simple pipelines

In [1]:
import pandas as pd

# Read the dataset from a CSV file; the path is relative to the current working directory.
boston = pd.read_csv("data/boston.csv")

In [2]:
# Preview the first rows to check that the file was parsed as expected.
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,4.03,34.7
3,0.03237,0.0,,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,5.33,36.2


In [3]:
# Count the missing values in each column (`isna` is the pandas alias of `isnull`).
boston.isna().sum()

CRIM       20
ZN          0
INDUS      30
CHAS        0
NOX         0
RM          0
AGE         0
DIS         0
RAD         0
TAX         0
PTRATIO     0
LSTAT       0
target      0
dtype: int64

In [4]:
# Summary statistics per column; the `count` row is lower for columns that contain missing values.
boston.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,target
count,486.0,506.0,476.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.65305,11.363636,11.139538,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,12.653063,22.532806
std,8.719588,23.322453,6.881949,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,1.73,5.0
25%,0.082268,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,6.95,17.025
50%,0.26042,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,11.36,21.2
75%,3.689388,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,37.97,50.0


In [5]:
# (number of rows, number of columns) of the full dataset, target column included.
boston.shape

(506, 13)

#### Splitting the dataset into training and testing sets

In [9]:
# Separate the predictors (every column except `target`) from the value we want to predict.
X = boston.drop(columns="target")
y = boston["target"]

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The fixed random_state makes the
# split reproducible, so every run works with the same data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [21]:
# Shapes of the four pieces produced by the split, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((354, 12), (152, 12), (354,), (152,))

These splits contain missing values, as real-world data usually does. Before handling them, here is a small toy example of how imputation works.

In [22]:
import numpy as np

# Tiny toy frame with one missing value in column "a" and one in column "b",
# used to show what the imputer does before applying it to the real data.
toy_rows = [
    [7, 2, 3],
    [4, np.nan, 6],
    [None, 5, 9],
]
data_with_missing = pd.DataFrame(toy_rows, columns=["a", "b", "c"])
data_with_missing

Unnamed: 0,a,b,c
0,7.0,2.0,3
1,4.0,,6
2,,5.0,9


In [23]:
from sklearn.impute import SimpleImputer

# Median imputation: each NaN is replaced by the median of the column it sits in.
imp_median = SimpleImputer(strategy="median", missing_values=np.nan)

# Learn the column medians from the toy frame, then fill its gaps with them.
imputed_data = imp_median.fit(data_with_missing).transform(data_with_missing)

# The imputer returns a plain NumPy array, so the column names are not carried over here.
pd.DataFrame(data=imputed_data)

Unnamed: 0,0,1,2
0,7.0,2.0,3.0
1,4.0,3.5,6.0
2,5.5,5.0,9.0


In [24]:
# Missing values per column in the training features, before any imputation.
X_train.isna().sum()

CRIM       13
ZN          0
INDUS      22
CHAS        0
NOX         0
RM          0
AGE         0
DIS         0
RAD         0
TAX         0
PTRATIO     0
LSTAT       0
dtype: int64

In [26]:
# Fit the median imputer on the training features and fill their missing values.
# The imputer returns a bare NumPy array, so the result is wrapped back into a
# DataFrame. Both the column labels and the row index are taken from X_train
# itself: keeping the original index keeps the rows aligned with y_train, which
# still carries the shuffled index from the split.
X_train = pd.DataFrame(
    imp_median.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

# Sanity check: no column should report missing values any more.
X_train.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
LSTAT      0
dtype: int64

In [27]:
# Fill value learned for each column by the most recent fit (the imputer uses the median strategy).
imp_median.statistics_

array([2.5915e-01, 0.0000e+00, 9.1250e+00, 0.0000e+00, 5.3800e-01,
       6.2275e+00, 7.7350e+01, 3.1073e+00, 5.0000e+00, 3.3000e+02,
       1.8600e+01, 1.0925e+01])

In [37]:
# Same fill values as above, labelled with the feature names for readability.
pd.DataFrame(imp_median.statistics_, index=X.columns)

Unnamed: 0,0
CRIM,0.25915
ZN,0.0
INDUS,9.125
CHAS,0.0
NOX,0.538
RM,6.2275
AGE,77.35
DIS,3.1073
RAD,5.0
TAX,330.0


### Scaling features

In [29]:
# Per-column means of the training features; the columns sit on very different scales.
X_train.mean()

CRIM         3.388759
ZN          11.403955
INDUS       11.015537
CHAS         0.073446
NOX          0.557259
RM           6.325672
AGE         68.799718
DIS          3.765874
RAD          9.437853
TAX        407.042373
PTRATIO     18.277966
LSTAT       12.421130
dtype: float64

In [30]:
# Per-column standard deviations of the training features; the spreads differ widely as well.
X_train.std()

CRIM         8.283270
ZN          22.608457
INDUS        6.744483
CHAS         0.261237
NOX          0.116792
RM           0.719211
AGE         27.665360
DIS          2.126032
RAD          8.639971
TAX        166.522237
PTRATIO      2.256792
LSTAT        7.112402
dtype: float64

### Pipelines


In [39]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# Chain the preprocessing steps so they are always fitted and applied together:
# median imputation first, then standardisation of every column.
pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median", missing_values=np.nan)),
        ("scaler", StandardScaler()),
    ]
)

This is the manual scaling step that the pipeline above performs for us:

In [35]:
# Manual equivalent of the pipeline's "scaler" step: standardise every column
# of the training features.
ss = StandardScaler()

# The scaler returns a bare NumPy array, so the result is wrapped back into a
# DataFrame. Column labels and row index are copied from X_train itself so the
# rows stay aligned with y_train instead of being renumbered from 0.
X_train = pd.DataFrame(
    ss.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

<br>

In [41]:
from sklearn.model_selection import train_test_split
# X_train was overwritten by the manual imputing/scaling cells above, so the
# raw split is re-created here (same test_size and random_state as before)
# to give the pipeline untouched input.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [42]:
# Fit every pipeline step on the training features and return them transformed.
piped_X_train = pipe.fit_transform(X_train)
piped_X_train

array([[-0.40607919, -0.50512499, -1.31189334, ..., -1.11279004,
         0.18727079, -1.01531611],
       [-0.39378043, -0.50512499, -0.14930128, ...,  0.15008778,
        -0.21208981, -0.05366252],
       [-0.38896392, -0.50512499, -0.60958677, ..., -1.0466393 ,
        -0.16771641, -0.31132373],
       ...,
       [-0.37835771,  3.03838247, -1.3371348 , ..., -0.76399522,
        -0.56707702, -0.90549329],
       [-0.37835771, -0.50512499, -0.28070536, ...,  1.5572945 ,
         0.8528718 ,  1.52750437],
       [-0.38203174, -0.50512499, -0.3645961 , ..., -0.13857001,
         1.16348561, -0.25218837]])

In [44]:
# Apply the already-fitted pipeline to the test features: transform only, no re-fitting.
pipe.transform(X_test)

array([[-0.39859209, -0.50512499, -1.03423725, ..., -0.66777595,
        -0.74457062, -0.47605794],
       [-0.40286456,  1.26662874, -0.68382636, ..., -0.92035152,
        -0.30083661, -1.25185755],
       [-0.39690438, -0.50512499,  2.48323476, ...,  1.82791117,
         0.8084984 ,  0.79535229],
       ...,
       [-0.34598414, -0.50512499, -0.71500699, ..., -0.60162521,
        -0.38958342, -1.09697922],
       [ 0.2650419 , -0.50512499,  1.05189537, ...,  1.5572945 ,
         0.8528718 ,  0.53487511],
       [ 0.79018815, -0.50512499,  1.05189537, ...,  1.5572945 ,
         0.8528718 ,  0.56585078]])

In [45]:
# Call fit_transform on the training set, but only call transform on the test set.

The `Pipeline` stores the parameters and settings necessary to perform the transformation. Now we can use this pipeline to transform the test data. Note carefully that we only want to use the pipeline to *transform* the test data; we do not want to re-fit the pipeline on the test data!

# Baseline model pipeline

In [48]:
from sklearn.dummy import DummyRegressor

# Baseline pipeline: same preprocessing as before, followed by a regressor that
# always predicts the mean of the training targets.
# NOTE: despite the name `pipe_linreg`, the final step is a DummyRegressor,
# not a linear regression.
pipe_linreg = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("regressor", DummyRegressor(strategy="mean")),
    ]
)

pipe_linreg.fit(X_train, y_train)

Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()), ('regressor', DummyRegressor())])

In [50]:
from sklearn.metrics import mean_absolute_error

# Predict on both splits with the fitted pipeline (training split first),
# then report the mean absolute error on each.
y_train_predict, y_test_predict = (
    pipe_linreg.predict(features) for features in (X_train, X_test)
)
print("MAE train", mean_absolute_error(y_train, y_train_predict))
print("MAE test", mean_absolute_error(y_test, y_test_predict))

MAE train 6.839375658335729
MAE test 6.533251561106156
