# 3. INTERMEDIATE MACHINE LEARNING

# 3.4. PIPELINE

In [1]:
import pandas as pd

In [2]:
# Show up to 80 rows before pandas truncates the display with "..."
# (full option name used: partial names rely on regex matching and can
# become ambiguous if pandas adds a similarly named option).
pd.set_option('display.max_rows', 80)
# Show up to 80 columns in head() and other DataFrame displays
pd.set_option('display.max_columns', 80)

## CHARGEMENT DES DATASETS FULL ET TEST

In [3]:
# Load the training and test datasets, indexed by the 'Id' column.
# The path uses forward slashes only: the previous '\K' was an invalid
# escape sequence in a regular string literal (SyntaxWarning on recent
# Python versions); forward slashes are accepted as separators on Windows.
dataset_input_path = 'C:/Users/PC Maison/4-KAGGLE/KAGGLE_DEV/KAGGLE_COURS_3-MACHINE_LEARNING_INTERMEDIATE/'
X_full = pd.read_csv(dataset_input_path + 'home-data-for-ml-course/input/train.csv', index_col='Id')
X_test_full = pd.read_csv(dataset_input_path + 'home-data-for-ml-course/input/test.csv', index_col='Id')

## TARGET ET FEATURES

In [4]:
# Discard rows whose target is missing, then split the frame into the
# target series `y` and the predictor frame `X_full`.
X_full = X_full.dropna(subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

## TRAIN SET et VAL SET

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a validation set; random_state is fixed so
# the split is the same on every run.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)



## NUMERIQUES / CATEGORICAL FEATURES

In [6]:
# "Cardinality" means the number of unique values in a column
# Categorical features: object-typed columns with fewer than 10 distinct
# values in the training split (a convenient but arbitrary cut-off).
categorical_cols = [
    name for name, kind in X_train_full.dtypes.items()
    if kind == "object" and X_train_full[name].nunique() < 10
]

# Numerical features: columns stored as int64 or float64.
numerical_cols = [
    name for name, kind in X_train_full.dtypes.items()
    if kind in ['int64', 'float64']
]

# Restrict every split to the selected columns (categorical first, then
# numerical) and work on independent copies.
my_cols = categorical_cols + numerical_cols
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_valid_full.loc[:, my_cols].copy()
X_test = X_test_full.loc[:, my_cols].copy()

# TEST1 - PREPROCESSING strategy='constant'

## PREPROCESSING

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Categorical columns: impute with the most frequent category, then one-hot
# encode; handle_unknown='ignore' avoids errors on categories not seen in fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: constant imputation; no fill_value is passed, so
# SimpleImputer's default for the 'constant' strategy is used.
numerical_transformer = SimpleImputer(strategy='constant')

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

## MODELISATION

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with 100 trees and a fixed seed
model = RandomForestRegressor(
    random_state=0,
    n_estimators=100,
)

## ENTRAINEMENT DU MODELE SUR LE TRAIN SET

In [9]:
from sklearn.pipeline import Pipeline

# Chain the preprocessing step and the estimator into a single pipeline
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit preprocessing and model on the training split; kept as the cell's
# last expression so the fitted pipeline is displayed as output.
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'TotalBsmtSF',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBa...
                                                 

## PREDICTIONS SUR LE VAL SET

In [10]:
# Run the validation features through the fitted pipeline (preprocessing
# followed by the model) to obtain predictions on the validation set
preds = clf.predict(X_valid)

## SCORING

In [11]:
from sklearn.metrics import mean_absolute_error

# Report the mean absolute error between the validation targets and predictions
print('MAE 1 strategy=constant : ', mean_absolute_error(y_valid, preds))

MAE 1 strategy=constant :  17861.780102739725


# TEST2 - PREPROCESSING strategy='constant', fill_value=-99

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# Categorical columns: most-frequent imputation followed by one-hot
# encoding; handle_unknown='ignore' avoids errors on categories not seen in fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: replace missing values with the sentinel -99.
numerical_transformer = SimpleImputer(fill_value=-99, strategy='constant')

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# SCORING

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

# Random forest regressor with 100 trees and a fixed seed
model = RandomForestRegressor(random_state=0, n_estimators=100)

# Chain preprocessing and model, then fit the whole pipeline on the
# training split (fit returns the pipeline itself).
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
]).fit(X_train, y_train)

# Predict on the validation split and report the mean absolute error
preds = clf.predict(X_valid)
print('MAE 2 strategy=constant fill_value=-99 : ', mean_absolute_error(y_valid, preds))

MAE 2 strategy=constant fill_value=-99 :  17721.58565068493


In [14]:
# MAE 1 strategy=constant :  17861.780102739725
# MAE 2 strategy=constant fill_value=-99 :  17721.58565068493
# CONCLUSION : MAE 2 meilleur que MAE 1 

# TEST3 - PREPROCESSING strategy='mean'

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# Categorical columns: most-frequent imputation followed by one-hot
# encoding; handle_unknown='ignore' avoids errors on categories not seen in fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: replace missing values with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# SCORING

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

# Random forest regressor with 100 trees and a fixed seed
model = RandomForestRegressor(random_state=0, n_estimators=100)

# Chain preprocessing and model, then fit the whole pipeline on the
# training split (fit returns the pipeline itself).
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
]).fit(X_train, y_train)

# Predict on the validation split and report the mean absolute error
preds = clf.predict(X_valid)
print('MAE 3 strategy=mean : ', mean_absolute_error(y_valid, preds))

MAE 3 strategy=mean :  17648.417157534244


In [17]:
# MAE 1 strategy=constant :  17861.780102739725
# MAE 2 strategy=constant fill_value=-99 :  17721.58565068493
# MAE 3 strategy=mean :  17648.417157534244
# CONCLUSION : MAE 3 meilleur

# TEST4 - PREPROCESSING strategy='median'

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# Categorical columns: most-frequent imputation followed by one-hot
# encoding; handle_unknown='ignore' avoids errors on categories not seen in fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: replace missing values with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# SCORING

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

# Random forest regressor with 100 trees and a fixed seed
model = RandomForestRegressor(random_state=0, n_estimators=100)

# Chain preprocessing and model, then fit the whole pipeline on the
# training split (fit returns the pipeline itself).
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
]).fit(X_train, y_train)

# Predict on the validation split and report the mean absolute error
preds = clf.predict(X_valid)
print('MAE 4 strategy=median : ', mean_absolute_error(y_valid, preds))

MAE 4 strategy=median :  17553.371061643833


In [20]:
# MAE 1 strategy=constant :  17861.780102739725
# MAE 2 strategy=constant fill_value=-99 :  17721.58565068493
# MAE 3 strategy=mean :  17648.417157534244
# MAE 4 strategy=median :  17553.371061643833
# CONCLUSION : MAE 4 meilleur

# TEST5 - PREPROCESSING strategy='most_frequent'

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# Categorical columns: most-frequent imputation followed by one-hot
# encoding; handle_unknown='ignore' avoids errors on categories not seen in fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: replace missing values with the most frequent value.
numerical_transformer = SimpleImputer(strategy='most_frequent')

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# SCORING

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

# Random forest regressor with 100 trees and a fixed seed
model = RandomForestRegressor(random_state=0, n_estimators=100)

# Chain preprocessing and model, then fit the whole pipeline on the
# training split (fit returns the pipeline itself).
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
]).fit(X_train, y_train)

# Predict on the validation split and report the mean absolute error
preds = clf.predict(X_valid)
print('MAE 5 strategy=most_frequent : ', mean_absolute_error(y_valid, preds))

MAE 5 strategy=most_frequent :  17599.683287671236


In [23]:
# MAE 1 strategy=constant :  17861.780102739725
# MAE 2 strategy=constant fill_value=-99 :  17721.58565068493
# MAE 3 strategy=mean :  17648.417157534244
# MAE 4 strategy=median :  17553.371061643833
# MAE 5 strategy=most_frequent :  17599.683287671236
# CONCLUSION : MAE 4 meilleur ==> strategy=median le mieux

# TEST6 - PREPROCESSING num : strategy='median' + scaler

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Categorical columns: most-frequent imputation followed by one-hot
# encoding; handle_unknown='ignore' avoids errors on categories not seen in fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: median imputation followed by standard scaling.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# SCORING

In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

# Random forest regressor with 100 trees and a fixed seed
model = RandomForestRegressor(random_state=0, n_estimators=100)

# Chain preprocessing and model, then fit the whole pipeline on the
# training split (fit returns the pipeline itself).
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
]).fit(X_train, y_train)

# Predict on the validation split and report the mean absolute error
preds = clf.predict(X_valid)
print('MAE 6 strategy=médian + scaler: ', mean_absolute_error(y_valid, preds))

MAE 6 strategy=médian + scaler:  17584.598458904107


In [26]:
# MAE 1 strategy=constant :  17861.780102739725
# MAE 2 strategy=constant fill_value=-99 :  17721.58565068493
# MAE 3 strategy=mean :  17648.417157534244
# MAE 4 strategy=median :  17553.371061643833
# MAE 5 strategy=most_frequent :  17599.683287671236
# MAE 6 strategy=médian + scaler:  17584.598458904107
# CONCLUSION : MAE 4 meilleur ==> strategy=median le mieux

# PREPROCESSING FINAL

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# Categorical columns: most-frequent imputation followed by one-hot
# encoding; handle_unknown='ignore' avoids errors on categories not seen in fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: replace missing values with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# ENTRAINEMENT ET SCORING

In [32]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

# Random forest regressor with 240 trees and a fixed seed
model = RandomForestRegressor(random_state=0, n_estimators=240)

# Chain preprocessing and model, then fit the whole pipeline on the
# training split (fit returns the pipeline itself).
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
]).fit(X_train, y_train)

# Predict on the validation split and report the mean absolute error
preds = clf.predict(X_valid)
print('MAE finale : ', mean_absolute_error(y_valid, preds))

MAE finale :  17321.705722031962


In [29]:
# Validation MAE for each n_estimators value tried (format: MAE - n_estimators)
# MAE finale :  17553.371061643833 - 100
# MAE finale :  17416.561839530335 - 140
# MAE finale :  17364.280456621007 - 150
# MAE finale :  17374.819191600574 - 152
# MAE finale :  17361.738920225624 - 153
# MAE finale :  17343.579211884007 - 154 
# MAE finale :  17344.333694211222 - 155
# MAE finale :  17357.489704074465 - 156
# MAE finale :  17352.879773116438 - 160
# MAE finale :  17384.869319097503 - 170
# MAE finale :  17385.53157534247  - 200
# MAE finale :  17321.705722031962 - 240 -- best

# PREDICTIONS

In [31]:
# Predict on the test set with the fitted pipeline `clf`, which applies the
# preprocessing step before the model. Calling `model.predict(X_test)`
# directly would bypass the preprocessor and feed the raw, unencoded
# columns to the random forest.
preds_test = clf.predict(X_test)