# Pre-production

Create a pipeline

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

import joblib

In [2]:
pd.set_option("display.max_columns", None)

In [3]:
data = pd.read_csv("./data/houseprice.csv")
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [4]:
X_train, X_test, y_train, y_test = train_test_split(data, 
                                                    data['SalePrice'],
                                                    test_size=0.1,
                                                    random_state=0
                                                   )

In [5]:
X_train.shape, X_test.shape

((1314, 81), (146, 81))

### Selected features

In [6]:
features = pd.read_csv("./data/selected_features.csv", header=None)

# Add a custom feature if required
features = [x for x in features[0]] + ['LotFrontage']

In [7]:
print(len(features), features[:3])

22 ['MSSubClass', 'MSZoning', 'Neighborhood']


### Missing values

In [8]:
categorical_cols_with_na = [c 
                for c in features 
                if X_train[c].isna().sum()
                and X_train[c].dtype == 'O'
               ]

In [9]:
for col in categorical_cols_with_na:
    print(f"{col:12}: {data[col].isna().sum()/len(data) : .3f}")

BsmtQual    :  0.025
BsmtExposure:  0.026
FireplaceQu :  0.473
GarageType  :  0.055
GarageFinish:  0.055


In [10]:
def fill_categorical_na(df, columns:list):
    df_copy = df.copy()
    df_copy[columns] = df[columns].fillna('Missing')
    return df_copy

In [11]:
X_train = fill_categorical_na(X_train, categorical_cols_with_na)

X_test  = fill_categorical_na(X_test, categorical_cols_with_na)

In [12]:
X_train[categorical_cols_with_na].isna().sum()

BsmtQual        0
BsmtExposure    0
FireplaceQu     0
GarageType      0
GarageFinish    0
dtype: int64

In [13]:
X_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [14]:
numerical_cols_with_na = [c 
                          for c in features
                          if X_train[c].isna().sum() 
                          and X_train[c].dtype !='O']

In [15]:
for col in numerical_cols_with_na:
    print(f"{col:12}: {data[col].isna().sum()/len(data) : .3f}")

LotFrontage :  0.177


#### Important: persisting the mean value for NA imputation

As you will see in future sections, one of the key pieces of deploying the model is "Model Validation". Model validation refers to corroborating that the deployed model and the model built during research, are identical. The entire pipeline needs to produce identical results.

Therefore, in order to check at the end of the process that the feature engineering pipelines are identical, we will save -we will persist-, the mean value of the variable, so that we can use it at the end, to corroborate our models.

In [16]:
# Text above says 'mean', but they seem to be storing 'mode'

mode_dict = {}

for col in numerical_cols_with_na:
    mode = X_train[col].mode()[0]
    mode_dict[col] = mode

    # note  that the additional binary variable was not selected, 
    # so we don't need this step any more
    
    # X_train[col +'_na'] = np.where(X_train[col].isnull(), 1, 0)
    # X_test[col+'_na'] = np.where(X_test[col].isnull(), 1, 0)
    
    X_train[col].fillna(mode, inplace=True)
    X_test[col].fillna(mode, inplace=True)    

In [17]:
mode_dict

{'LotFrontage': 60.0}

### Temporal variables

In [18]:
def elapsed_years(df, target_col, comparison_col='YrSold'):
    df[col] = df['YrSold'] - df[col]
    return df

In [19]:
X_train = elapsed_years(X_train, 'YearRemodAdd')
X_test = elapsed_years(X_test, 'YearRemodAdd')

### Numerical variables

In [20]:
cols_to_log_transform = ['LotFrontage', '1stFlrSF', 'GrLivArea', 'SalePrice']

for col in cols_to_log_transform:
    X_train[col] = np.log(X_train[col])
    X_test[col] = np.log(X_test[col])

### Categorical variables

In [21]:
categorical_cols = [col
                   for col in features
                   if X_train[col].dtype=='O'
                   ]
categorical_cols

['MSZoning',
 'Neighborhood',
 'RoofStyle',
 'BsmtQual',
 'BsmtExposure',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'PavedDrive']

In [30]:
def find_frequent_labels(df, col:str, ignore_lte:float):
    df = df.copy()
    tmp = df.groupby(col)['SalePrice'].count() / len(df)
    return tmp[tmp>ignore_lte].index

frequent_labels = {}

for col in categorical_cols:
    freq_values_in_col = find_frequent_labels(X_train, col, 0.01)
    frequent_labels[col] = freq_values_in_col
    
    X_train[col] = np.where(X_train[col].isin(freq_values_in_col), X_train[col], 'Rare')
    X_test[col] = np.where(X_test[col].isin(freq_values_in_col), X_test[col], 'Rare')

np.save('./data/frequent_labels.npy', frequent_labels)

In [31]:
frequent_labels

{'MSZoning': Index(['FV', 'RH', 'RL', 'RM'], dtype='object', name='MSZoning'),
 'Neighborhood': Index(['Blmngtn', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor',
        'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NWAmes',
        'NoRidge', 'NridgHt', 'OldTown', 'Rare', 'SWISU', 'Sawyer', 'SawyerW',
        'Somerst', 'StoneBr', 'Timber'],
       dtype='object', name='Neighborhood'),
 'RoofStyle': Index(['Gable', 'Hip', 'Rare'], dtype='object', name='RoofStyle'),
 'BsmtQual': Index(['Ex', 'Fa', 'Gd', 'Missing', 'TA'], dtype='object', name='BsmtQual'),
 'BsmtExposure': Index(['Av', 'Gd', 'Missing', 'Mn', 'No'], dtype='object', name='BsmtExposure'),
 'HeatingQC': Index(['Ex', 'Fa', 'Gd', 'TA'], dtype='object', name='HeatingQC'),
 'CentralAir': Index(['N', 'Y'], dtype='object', name='CentralAir'),
 'KitchenQual': Index(['Ex', 'Fa', 'Gd', 'TA'], dtype='object', name='KitchenQual'),
 'FireplaceQu': Index(['Ex', 'Fa', 'Gd', 'Missing', 'Po', 'TA'], dtype='object', n

### String cat to numbers

In [33]:
# this function will assign discrete values to the strings of the variables, 
# so that the smaller value corresponds to the smaller mean of target

def replace_categories_with_numbers(train, test, col, target):
    train = train.copy()
    test = test.copy()
    
    ordered_labels = train.groupby([col])[target].mean().sort_values().index
    ordinal_label = {k:i for i,k in enumerate(ordered_labels)}
    
    train[col] = train[col].map(ordinal_label)
    test[col] = test[col].map(ordinal_label)
    
    return ordinal_label, train, test

In [34]:
ordinal_label_dict = {}

for col in categorical_cols:
    ordinal_label, X_train, X_test = replace_categories_with_numbers(X_train, X_test, col, 'SalePrice')
    ordinal_label_dict[col] = ordinal_label

np.save('./data/ordinal_label.npy', ordinal_label_dict)

In [35]:
ordinal_label_dict

{'MSZoning': {'Rare': 0, 'RM': 1, 'RH': 2, 'RL': 3, 'FV': 4},
 'Neighborhood': {'IDOTRR': 0,
  'MeadowV': 1,
  'BrDale': 2,
  'Edwards': 3,
  'BrkSide': 4,
  'OldTown': 5,
  'Sawyer': 6,
  'SWISU': 7,
  'NAmes': 8,
  'Mitchel': 9,
  'SawyerW': 10,
  'Rare': 11,
  'NWAmes': 12,
  'Gilbert': 13,
  'Blmngtn': 14,
  'CollgCr': 15,
  'Crawfor': 16,
  'ClearCr': 17,
  'Somerst': 18,
  'Timber': 19,
  'StoneBr': 20,
  'NridgHt': 21,
  'NoRidge': 22},
 'RoofStyle': {'Gable': 0, 'Rare': 1, 'Hip': 2},
 'BsmtQual': {'Missing': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
 'BsmtExposure': {'Missing': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
 'HeatingQC': {'Rare': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
 'CentralAir': {'N': 0, 'Y': 1},
 'KitchenQual': {'Fa': 0, 'TA': 1, 'Gd': 2, 'Ex': 3},
 'FireplaceQu': {'Po': 0, 'Missing': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
 'GarageType': {'Missing': 0,
  'Rare': 1,
  'Detchd': 2,
  'Basment': 3,
  'Attchd': 4,
  'BuiltIn': 5},
 'GarageFinish': {'Missing': 0, 'Unf

### Anything na left in train/test?

In [37]:
[c for c in features if X_train[c].isna().sum()]

[]

In [38]:
[c for c in features if X_test[c].isna().sum()]

[]

### Feature Scaling

In [39]:
y_train = X_train['SalePrice']
y_test = X_test['SalePrice']

In [40]:
scaler = MinMaxScaler()
scaler.fit(X_train[features])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [41]:
joblib.dump(scaler, "./data/scaler.pkl")

['./data/scaler.pkl']

In [42]:
X_train = pd.DataFrame(scaler.transform(X_train[features]), columns=features)
X_test = pd.DataFrame(scaler.transform(X_test[features]), columns=features)

In [44]:
# No 'Id' or 'SalePrice'
X_train.columns

Index(['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'OverallCond',
       'YearRemodAdd', 'RoofStyle', 'BsmtQual', 'BsmtExposure', 'HeatingQC',
       'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'KitchenQual',
       'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCars',
       'PavedDrive', 'LotFrontage'],
      dtype='object')

### Train Model

In [45]:
from sklearn.linear_model import Lasso

In [53]:
# Remember, y_train is already log transformed

In [46]:
linear_model = Lasso(alpha=0.005, random_state=0)
linear_model.fit(X_train, y_train)

Lasso(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=0,
      selection='cyclic', tol=0.0001, warm_start=False)

In [49]:
joblib.dump(linear_model, './data/lasso_regression.pkl')

['./data/lasso_regression.pkl']

### Performance check

In [50]:
def print_performance(model, X, y):
    pred = linear_model.predict(X)

    mse = mean_squared_error(
        np.exp(y),
        np.exp(pred), 
    )
    print(f"MSE : {mse}")
    print(f"RMSE: {mse**0.5}")

In [54]:
# Train
print_performance(linear_model, X_train, y_train)

MSE : 1087050173.7522295
RMSE: 32970.44394229822


In [55]:
# Test
print_performance(linear_model, X_test, y_test)

MSE : 1403585729.4125714
RMSE: 37464.459550520296


In [56]:
print('Average house price: ', np.exp(y_train).median())

Average house price:  163000.00000000012
