## Step 1 - Data Ingestion


In [1]:
from warnings import filterwarnings
filterwarnings("ignore")

In [101]:
import pandas as pd
df = pd.read_csv("training_set.csv", na_values=["","NA"], keep_default_na=False)

## Target is SalesPrice

## Step 2 - Perform Basic Data Quality Checks

In [3]:
df.duplicated().sum()

np.int64(0)

In [4]:
df.shape

(1460, 81)

In [5]:
df = df.drop_duplicates(keep="first").reset_index(drop=True)
df.shape

(1460, 81)

In [6]:
m = df.isna().sum()
m

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [7]:
m[m > 0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

## Step 3 - Separate X and Y (SalesPrice)
and remove unneccessary columns

In [9]:
X = df.drop(columns = ["Id","SalePrice"])
Y = df["SalePrice"]

In [10]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [11]:
Y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [12]:
X.nunique()

MSSubClass         15
MSZoning            5
LotFrontage       110
LotArea          1073
Street              2
                 ... 
MiscVal            21
MoSold             12
YrSold              5
SaleType            9
SaleCondition       6
Length: 79, dtype: int64

In [13]:
Y.nunique()

663

In [14]:
def get_high_cardinality(X : pd.DataFrame, threshold : float = 0.9) -> list[str]:
    unique_count = X.select_dtypes(include="object").nunique()
    cardinality = unique_count / len(X)
    print(cardinality)
    high_cardinality = cardinality[cardinality >= threshold]
    return high_cardinality.index.tolist()

In [15]:
high_card = get_high_cardinality(X)

MSZoning         0.003425
Street           0.001370
Alley            0.001370
LotShape         0.002740
LandContour      0.002740
Utilities        0.001370
LotConfig        0.003425
LandSlope        0.002055
Neighborhood     0.017123
Condition1       0.006164
Condition2       0.005479
BldgType         0.003425
HouseStyle       0.005479
RoofStyle        0.004110
RoofMatl         0.005479
Exterior1st      0.010274
Exterior2nd      0.010959
MasVnrType       0.002740
ExterQual        0.002740
ExterCond        0.003425
Foundation       0.004110
BsmtQual         0.002740
BsmtCond         0.002740
BsmtExposure     0.002740
BsmtFinType1     0.004110
BsmtFinType2     0.004110
Heating          0.004110
HeatingQC        0.003425
CentralAir       0.001370
Electrical       0.003425
KitchenQual      0.002740
Functional       0.004795
FireplaceQu      0.003425
GarageType       0.004110
GarageFinish     0.002055
GarageQual       0.003425
GarageCond       0.003425
PavedDrive       0.002055
PoolQC      

In [16]:
high_card


[]

In [17]:
X = X.drop(columns=high_card)
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


## Step 4 - Apply Train test split first

In [18]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(
    X, Y, test_size=0.3, random_state=10
)

In [19]:
xtrain.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
912,30,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,Shed,620,7,2006,WD,Abnorml
373,20,RL,79.0,10634,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,GdWo,,0,11,2009,WD,Normal
20,60,RL,101.0,14215,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,11,2006,New,Partial
800,60,RL,79.0,12798,Pave,,IR1,HLS,AllPub,Inside,...,0,0,,,Shed,400,5,2008,WD,Normal
152,60,RL,,14803,Pave,,IR1,Lvl,AllPub,CulDSac,...,0,0,,GdWo,,0,6,2006,WD,Normal


In [20]:
ytrain.head()

912     88000
373    123000
20     325300
800    200000
152    190000
Name: SalePrice, dtype: int64

In [21]:
xtest.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
854,20,RL,102.0,17920,Pave,,Reg,Lvl,AllPub,Inside,...,312,0,,,,0,7,2006,WD,Abnorml
381,20,FV,60.0,7200,Pave,Pave,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2006,New,Partial
816,20,RL,,11425,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,7,2006,WD,Normal
577,80,RL,96.0,11777,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,5,2006,WD,Abnorml
35,60,RL,108.0,13418,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Normal


In [22]:
ytest.head()

854    170000
381    187750
816    137000
577    164500
35     309000
Name: SalePrice, dtype: int64

In [23]:
xtrain.shape

(1022, 79)

In [24]:
xtest.shape

(438, 79)

In [25]:
ytrain.shape

(1022,)

In [26]:
ytest.shape

(438,)

## Step 5 - Apply Preprocessing on X

1. Continous Feature -> SimpleImputer(strategy="mean).StandardScaler()
2. Categorical Feature -> SimpleImputer, OrdinalEncoder, StandardSclaer

In [27]:
cat = xtrain.select_dtypes(include="object").columns.tolist()
cat

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [28]:
con = xtrain.select_dtypes(include="number").columns.tolist()
con

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [29]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [30]:
num_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)

In [31]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    StandardScaler()
)

In [32]:
pre = ColumnTransformer(
    [
        ("num", num_pipe, con),
        ("cat", cat_pipe, cat)
    ]
).set_output(transform="pandas")

In [33]:
pre.fit(xtrain) # type: ignore

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [34]:
# Transform train and test data
xtrain_pre = pre.transform(xtrain) # type: ignore
xtest_pre = pre.transform(xtest) # type: ignore

In [35]:
xtrain_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
912,-0.637138,-0.832375,-0.403687,-0.832137,1.322374,-1.536211,-1.693863,-0.574378,0.089078,-0.272009,...,1.596608,0.971066,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,0.319937,-3.386035
373,-0.872425,0.381056,-0.004215,-0.832137,0.402383,-0.612735,-1.548422,-0.574378,-0.041272,0.926265,...,-0.680759,0.971066,0.209088,0.187974,0.281373,0.044281,-1.924895,0.059456,0.319937,0.206519
20,0.068721,1.334466,0.31269,1.354094,-0.517607,1.102291,1.021023,1.46354,-0.955862,-0.272009,...,0.457925,-0.25871,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,-0.952342,1.104658
800,0.068721,0.381056,0.187291,-0.103393,-0.517607,0.838441,0.584702,-0.574378,0.031382,-0.272009,...,-0.680759,-0.25871,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,0.319937,0.206519
152,0.068721,0.0,0.364725,-0.103393,-0.517607,-0.019072,-0.67578,0.777083,-0.066915,-0.272009,...,-0.680759,-0.25871,0.209088,0.187974,0.281373,0.044281,-1.924895,0.059456,0.319937,0.206519


In [36]:
xtest_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
854,-0.872425,1.377803,0.640568,-0.832137,-1.437597,-0.546773,-0.53034,-0.574378,-0.301973,6.950921,...,-0.680759,0.971066,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,0.319937,-3.386035
381,-0.872425,-0.442343,-0.308111,0.62535,-0.517607,1.135273,1.021023,-0.574378,-0.955862,-0.272009,...,-0.680759,-0.25871,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,-0.952342,1.104658
816,-0.872425,0.0,0.065785,-0.832137,0.402383,-0.579754,-1.499942,-0.574378,0.082668,-0.272009,...,-0.680759,-0.25871,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,0.319937,0.206519
577,0.539294,1.117782,0.096936,-0.832137,0.402383,-0.183979,-0.918181,-0.054173,-0.254961,3.396042,...,-0.680759,-0.25871,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,0.319937,-3.386035
35,0.068721,1.637824,0.242158,1.354094,-0.517607,1.06931,0.972543,0.13353,-0.955862,-0.272009,...,0.457925,-1.488486,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,0.319937,0.206519


## Step 5 - Model Building Experiment with different models

1. Baseline - Linear Regression
2. Apply Feature Selection - Linear Regression
3. Apply Ridge on selected features
4. Apply Lasso on Selected Features

### Model 1 - Baseline Model Linear Regression

In [37]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

baseline = LinearRegression()
scores = cross_val_score(baseline, xtrain_pre, ytrain, cv=5, scoring="r2")
scores

array([0.84232872, 0.65198776, 0.58207534, 0.70829589, 0.83149358])

In [38]:
scores.mean()

np.float64(0.7232362566095751)

In [39]:
scores.std()

np.float64(0.10112233089435309)

In [40]:
# Fit the entire model
baseline.fit(xtrain_pre, ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [41]:
baseline.intercept_

np.float64(182072.7573385518)

In [42]:
baseline.coef_

array([ -5949.74977644,  -4861.8747776 ,   4536.59598823,  15303.15713677,
         5872.85440248,   5794.74998719,   -545.77295183,   6048.06168464,
          414.03044915,    756.86371909,   -753.06267389,    -58.53852841,
         6858.43927252,   9892.24841712,  -1731.40379546,  12977.46723053,
         3874.20509889,   -588.19350326,   1637.41282806,  -2074.60774999,
        -1411.41044128,  -2841.81748307,   6870.34167907,   3661.33845843,
          671.99134685,   9419.12213844,  -1577.22504595,   3060.72635778,
           29.49407048,   -501.44172864,   1293.7419893 ,   2649.0416879 ,
        -4754.51664829,    276.55979782,     21.73001652,   -910.38304245,
        -1142.05684275,   3131.1476983 ,   -254.5349447 ,  -2023.90053353,
         3094.35939288,  -1895.12025886,   -688.59384397,   2665.42922876,
         3223.13474035,   -527.38746262,  -2412.40682958,  -1670.26881753,
        -2290.93325479,   1287.40496926,   5913.56182776,  -3263.44706654,
         1907.50968236,  

In [43]:
# R2 score on train
r2_train = baseline.score(xtrain_pre, ytrain)
r2_train

0.8576026905597295

In [44]:
# R2 score on test
r2_test = baseline.score(xtest_pre, ytest)
r2_test

0.6973680771486954

In [45]:
# Generalization error
gen_err = r2_train - r2_test
gen_err

0.16023461341103407

### Create a function to save above results for each model

In [46]:
# Initializing a balnk results list
results = []

def evaluate_and_log_model(results, model, xtrain, ytrain, xtest, ytest, description):
    #Cross validation data on train
    scores = cross_val_score(model, xtrain, ytrain, cv=5, scoring="r2")
    mean_score = scores.mean().round(4)
    std_score = scores.std().round(4)
    #Fit the model
    model.fit(xtrain, ytrain)
    #Evaluate r2 score on train and test
    r2_train = round(model.score(xtrain, ytrain), 4)
    r2_test = round(model.score(xtest, ytest), 4)
    #Generalization error
    gen_err = round(r2_train - r2_test, 4)
    # Save above results in a dictionary
    r = {
        "description": description,
        "name":type(model).__name__,
        "cv_mean":mean_score,
        "cv_std":std_score,
        "r2_train" : r2_train,
        "r2_test": r2_test,
        "gen_err": gen_err
    }
    print(r)
    results.append(r)
    return model

In [47]:
baseline = evaluate_and_log_model(
    results, LinearRegression(), xtrain_pre, ytrain, xtest_pre, ytest, description="Baseline Linear Regression"
)

{'description': 'Baseline Linear Regression', 'name': 'LinearRegression', 'cv_mean': np.float64(0.7232), 'cv_std': np.float64(0.1011), 'r2_train': 0.8576, 'r2_test': 0.6974, 'gen_err': 0.1602}


### Model 2 - Feature Selection on Linear Regression

In [48]:
from sklearn.feature_selection import SequentialFeatureSelector

sel = SequentialFeatureSelector(
    LinearRegression(), n_features_to_select="auto", direction="forward"
)
sel.fit(xtrain_pre, ytrain)

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [49]:
sel_cols = sel.get_feature_names_out()
sel_cols

array(['num__MSSubClass', 'num__LotArea', 'num__OverallQual',
       'num__OverallCond', 'num__YearBuilt', 'num__LowQualFinSF',
       'num__GrLivArea', 'num__BsmtFullBath', 'num__HalfBath',
       'num__KitchenAbvGr', 'num__Fireplaces', 'num__GarageCars',
       'num__ScreenPorch', 'num__YrSold', 'cat__Street', 'cat__Alley',
       'cat__LotShape', 'cat__LandContour', 'cat__Utilities',
       'cat__LandSlope', 'cat__Neighborhood', 'cat__HouseStyle',
       'cat__RoofStyle', 'cat__RoofMatl', 'cat__Exterior2nd',
       'cat__ExterQual', 'cat__BsmtQual', 'cat__BsmtCond',
       'cat__BsmtExposure', 'cat__BsmtFinType1', 'cat__Heating',
       'cat__HeatingQC', 'cat__CentralAir', 'cat__KitchenQual',
       'cat__Functional', 'cat__FireplaceQu', 'cat__GarageQual',
       'cat__GarageCond', 'cat__SaleCondition'], dtype=object)

In [50]:
len(sel_cols)

39

In [51]:
xtrain_pre_sel = sel.transform(xtrain_pre)
xtrain_pre_sel

array([[-0.63713842, -0.40368676, -0.83213659, ...,  0.20908829,
         0.18797359, -3.38603512],
       [-0.87242495, -0.00421508, -0.83213659, ...,  0.20908829,
         0.18797359,  0.20651914],
       [ 0.06872116,  0.31268967,  1.35409374, ...,  0.20908829,
         0.18797359,  1.10465771],
       ...,
       [ 0.06872116,  0.37755736,  2.08283718, ...,  0.20908829,
         0.18797359,  1.10465771],
       [ 0.30400769, -0.14881781,  0.62535029, ...,  0.20908829,
         0.18797359,  0.20651914],
       [ 0.06872116,  0.03392676,  1.35409374, ...,  0.20908829,
         0.18797359,  1.10465771]], shape=(1022, 39))

In [52]:
xtrain_pre_sel.shape

(1022, 39)

In [53]:
xtest_pre_sel =sel.transform(xtest_pre)
xtest_pre_sel

array([[-8.72424952e-01,  6.40567958e-01, -8.32136593e-01, ...,
         2.09088285e-01,  1.87973592e-01, -3.38603512e+00],
       [-8.72424952e-01, -3.08110908e-01,  6.25350293e-01, ...,
         2.09088285e-01,  1.87973592e-01,  1.10465771e+00],
       [-8.72424952e-01,  6.57853805e-02, -8.32136593e-01, ...,
         2.09088285e-01,  1.87973592e-01,  2.06519142e-01],
       ...,
       [ 3.12744604e+00,  8.34846130e-02, -8.32136593e-01, ...,
         2.09088285e-01,  1.87973592e-01,  2.06519142e-01],
       [-1.66565366e-01, -3.21253194e-04,  1.35409374e+00, ...,
         2.09088285e-01,  1.87973592e-01,  2.06519142e-01],
       [-1.66565366e-01, -4.14306303e-01, -8.32136593e-01, ...,
         2.09088285e-01,  1.87973592e-01,  2.06519142e-01]],
      shape=(438, 39))

In [54]:
xtest_pre_sel.shape

(438, 39)

In [55]:
model_feat_sel = evaluate_and_log_model(
    results, LinearRegression(), xtrain_pre_sel, ytrain, xtest_pre_sel, ytest, description="Featue selection on Linear Regression"
)

{'description': 'Featue selection on Linear Regression', 'name': 'LinearRegression', 'cv_mean': np.float64(0.8126), 'cv_std': np.float64(0.0544), 'r2_train': 0.8395, 'r2_test': 0.8378, 'gen_err': 0.0017}


## Model 3 - Ridge on selected features

In [56]:
alphas_list = [0, 0 , 0.1, 1, 20, 50, 200, 500, 10000]

In [57]:
params = {
    "alpha":alphas_list
}
print(params)

{'alpha': [0, 0, 0.1, 1, 20, 50, 200, 500, 10000]}


In [58]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

gscv_ridge = GridSearchCV(Ridge(random_state=42), params, cv=5, scoring="r2")
gscv_ridge.fit(xtrain_pre_sel, ytrain) # Apply this on selected features only

0,1,2
,estimator,Ridge(random_state=42)
,param_grid,"{'alpha': [0, 0, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,50
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [59]:
gscv_ridge.best_params_

{'alpha': 50}

In [60]:
gscv_ridge.best_score_

np.float64(0.8150277907030189)

In [61]:
best_ridge = gscv_ridge.best_estimator_
best_ridge

0,1,2
,alpha,50
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [62]:
sel_ridge = evaluate_and_log_model(
    results, best_ridge, xtrain_pre_sel, ytrain, xtest_pre_sel, ytest,
    description="Ridge on Selected Features"
)

{'description': 'Ridge on Selected Features', 'name': 'Ridge', 'cv_mean': np.float64(0.815), 'cv_std': np.float64(0.0485), 'r2_train': 0.8389, 'r2_test': 0.8393, 'gen_err': -0.0004}


## Model 4 - Lasso on Selected features

In [63]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

gscv_lasso = GridSearchCV(Lasso(random_state=42), params, cv=5, scoring="r2")
gscv_lasso.fit(xtrain_pre_sel, ytrain)

0,1,2
,estimator,Lasso(random_state=42)
,param_grid,"{'alpha': [0, 0, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,200
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [64]:
gscv_lasso.best_score_

np.float64(0.8127563380172429)

In [65]:
gscv_lasso.best_params_

{'alpha': 200}

In [66]:
best_lasso = gscv_lasso.best_estimator_
best_lasso

0,1,2
,alpha,200
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [67]:
sel_lasso = evaluate_and_log_model(
    results, best_lasso, xtrain_pre_sel, ytrain, xtest_pre_sel, ytest,
    description="Lasso on Selected features"
)

{'description': 'Lasso on Selected features', 'name': 'Lasso', 'cv_mean': np.float64(0.8128), 'cv_std': np.float64(0.0533), 'r2_train': 0.8394, 'r2_test': 0.8394, 'gen_err': 0.0}


## Model 5 - Polynomial feature engineering with Ridge 

In [68]:
from sklearn.preprocessing import PolynomialFeatures

In [69]:
poly_ridge = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    Ridge(random_state=42) # type: ignore
)

In [70]:
params2 = {
    "ridge__alpha":alphas_list
}

In [71]:
gscv_poly_ridge = GridSearchCV(poly_ridge, params2, cv=5, scoring="r2")
gscv_poly_ridge.fit(xtrain_pre_sel, ytrain)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'ridge__alpha': [0, 0, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,alpha,500
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [72]:
gscv_poly_ridge.best_params_

{'ridge__alpha': 500}

In [73]:
gscv_poly_ridge.best_score_

np.float64(0.7849051344467123)

In [74]:
best_poly_ridge = gscv_poly_ridge.best_estimator_
best_poly_ridge

0,1,2
,steps,"[('polynomialfeatures', ...), ('ridge', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,alpha,500
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [75]:
sel_poly_ridge = evaluate_and_log_model(
    results, best_poly_ridge, xtrain_pre_sel, ytrain, xtest_pre_sel, ytest,
    description="Polynomial Ridge on Selected Features"
)

{'description': 'Polynomial Ridge on Selected Features', 'name': 'Pipeline', 'cv_mean': np.float64(0.7849), 'cv_std': np.float64(0.0346), 'r2_train': 0.9318, 'r2_test': 0.797, 'gen_err': 0.1348}


## Model 6 - Polynomial features with Lasso

In [76]:
poly_lasso = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    Lasso(random_state=42)
)

In [77]:
param3 = {
    "lasso__alpha":alphas_list
}

In [78]:
gscv_poly_lasso = GridSearchCV(poly_lasso, param3, cv=5, scoring="r2")
gscv_poly_lasso.fit(xtrain_pre_sel, ytrain)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'lasso__alpha': [0, 0, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,alpha,500
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [79]:
gscv_poly_lasso.best_params_

{'lasso__alpha': 500}

In [80]:
gscv_poly_lasso.best_score_

np.float64(0.8460438503737336)

In [81]:
best_poly_lasso = gscv_poly_lasso.best_estimator_
best_poly_lasso

0,1,2
,steps,"[('polynomialfeatures', ...), ('lasso', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,alpha,500
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [82]:
sel_poly_lasso = evaluate_and_log_model(
    results, best_poly_lasso, xtrain_pre_sel, ytrain, xtest_pre_sel, ytest,
    description="Poly Lasso on selected features"
)

{'description': 'Poly Lasso on selected features', 'name': 'Pipeline', 'cv_mean': np.float64(0.846), 'cv_std': np.float64(0.0288), 'r2_train': 0.9469, 'r2_test': 0.8459, 'gen_err': 0.101}


In [83]:
res_df = pd.DataFrame(results)
res_df

Unnamed: 0,description,name,cv_mean,cv_std,r2_train,r2_test,gen_err
0,Baseline Linear Regression,LinearRegression,0.7232,0.1011,0.8576,0.6974,0.1602
1,Featue selection on Linear Regression,LinearRegression,0.8126,0.0544,0.8395,0.8378,0.0017
2,Ridge on Selected Features,Ridge,0.815,0.0485,0.8389,0.8393,-0.0004
3,Lasso on Selected features,Lasso,0.8128,0.0533,0.8394,0.8394,0.0
4,Polynomial Ridge on Selected Features,Pipeline,0.7849,0.0346,0.9318,0.797,0.1348
5,Poly Lasso on selected features,Pipeline,0.846,0.0288,0.9469,0.8459,0.101


In [84]:
sort_df = res_df.sort_values(by="gen_err")
sort_df

Unnamed: 0,description,name,cv_mean,cv_std,r2_train,r2_test,gen_err
2,Ridge on Selected Features,Ridge,0.815,0.0485,0.8389,0.8393,-0.0004
3,Lasso on Selected features,Lasso,0.8128,0.0533,0.8394,0.8394,0.0
1,Featue selection on Linear Regression,LinearRegression,0.8126,0.0544,0.8395,0.8378,0.0017
5,Poly Lasso on selected features,Pipeline,0.846,0.0288,0.9469,0.8459,0.101
4,Polynomial Ridge on Selected Features,Pipeline,0.7849,0.0346,0.9318,0.797,0.1348
0,Baseline Linear Regression,LinearRegression,0.7232,0.1011,0.8576,0.6974,0.1602


In [85]:
sort_df.to_csv("evaluation.csv", index=False)

## Step 6 - Evaluate the model

In [86]:
# Evaluate in detail
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score
)

def evaluate_model(model, x, y):
    #Predict results for x
    ypred = model.predict(x)
    # Get the metrices
    rmse = root_mean_squared_error(y, ypred)
    mae = mean_absolute_error(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)
    r2 = r2_score(y, ypred)
    # Print the results
    print(f"RMSE : {rmse:.2f}")
    print(f"MAE : {mae:.2f}")
    print(f"MAPE : {mape:.2%}")
    print(f"R2 : {r2:.2%}")

In [87]:
evaluate_model(sel_poly_ridge, xtrain_pre_sel, ytrain)

RMSE : 21039.18
MAE : 14146.70
MAPE : 8.46%
R2 : 93.18%


In [88]:
evaluate_model(sel_poly_ridge, xtest_pre_sel, ytest)

RMSE : 34485.19
MAE : 24072.18
MAPE : 15.55%
R2 : 79.70%


## Out of Sample Prediction

In [89]:
xnew = pd.read_csv("testing_set.csv", na_values=["","NA"], keep_default_na=False)
xnew

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [90]:
xnew_pre = pre.transform(xnew) # type: ignore
xnew_pre

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,-0.872425,0.424393,0.083219,-0.832137,0.402383,-0.348885,-1.160582,-0.574378,0.044204,0.686611,...,-0.680759,0.971066,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,0.319937,0.206519
1,-0.872425,0.467730,0.317291,-0.103393,0.402383,-0.447829,-1.306022,0.004820,1.016489,-0.272009,...,-0.680759,0.971066,0.209088,0.187974,0.281373,0.044281,0.265798,-20.195052,0.319937,0.206519
2,0.068721,0.164372,0.278619,-0.832137,-0.517607,0.838441,0.633182,-0.574378,0.734419,-0.272009,...,-0.680759,-1.488486,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,0.319937,0.206519
3,0.068721,0.337719,-0.062269,-0.103393,0.402383,0.871422,0.633182,-0.467119,0.330547,-0.272009,...,-0.680759,-1.488486,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,0.319937,0.206519
4,1.480440,-1.179069,-0.502360,1.354094,-0.517607,0.673535,0.342302,-0.574378,-0.393859,-0.272009,...,-0.680759,-0.258710,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,0.319937,0.206519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.421586,-2.132479,-0.773955,-1.560880,1.322374,-0.052054,-0.724261,-0.574378,-0.955862,-0.272009,...,-0.680759,0.971066,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,0.319937,0.206519
1455,2.421586,-2.132479,-0.777672,-1.560880,-0.517607,-0.052054,-0.724261,-0.574378,-0.417365,-0.272009,...,1.027266,0.971066,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,0.319937,-3.386035
1456,-0.872425,3.891339,0.824640,-0.832137,1.322374,-0.381866,0.536222,-0.574378,1.659694,-0.272009,...,1.596608,0.971066,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,0.319937,-3.386035
1457,0.656937,-0.355670,-0.021295,-0.832137,-0.517607,0.673535,0.342302,-0.574378,-0.235729,-0.272009,...,-0.680759,0.971066,0.209088,0.187974,0.281373,0.044281,0.265798,0.059456,0.319937,0.206519


In [91]:
xnew_pre_sel = sel.transform(xnew_pre)
xnew_pre_sel

array([[-0.87242495,  0.08321912, -0.83213659, ...,  0.20908829,
         0.18797359,  0.20651914],
       [-0.87242495,  0.31729148, -0.10339315, ...,  0.20908829,
         0.18797359,  0.20651914],
       [ 0.06872116,  0.27861865, -0.83213659, ...,  0.20908829,
         0.18797359,  0.20651914],
       ...,
       [-0.87242495,  0.82463998, -0.83213659, ...,  0.20908829,
         0.18797359, -3.38603512],
       [ 0.65693749, -0.02129484, -0.83213659, ...,  0.20908829,
         0.18797359,  0.20651914],
       [ 0.06872116, -0.09333072,  0.62535029, ...,  0.20908829,
         0.18797359,  0.20651914]], shape=(1459, 39))

In [92]:
preds = sel_poly_ridge.predict(xnew_pre_sel)
preds

array([145314.31937471, 143557.09646739, 169735.93839256, ...,
       172982.09251557, 144806.43372817, 229596.24161267], shape=(1459,))

In [93]:
xnew["SalesPrice_pred"] = preds.round(2)
xnew

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalesPrice_pred
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,145314.32
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,143557.10
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,169735.94
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,178145.99
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,,,,0,1,2010,WD,Normal,190692.21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,111796.37
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,96871.20
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,172982.09
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,144806.43


In [94]:
xnew.to_csv("results.csv", index=False)

## Step 7 - Save Preprocessor, Selector and Model in Joblib Format

In [95]:
import joblib

joblib.dump(pre, "preprocessor.joblib")

['preprocessor.joblib']

In [96]:
joblib.dump(sel, "selector.joblib")

['selector.joblib']

In [97]:
joblib.dump(sel_poly_ridge, "model.joblib")

['model.joblib']

In [98]:
p = joblib.load("preprocessor.joblib")
p

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [102]:
s = joblib.load("selector.joblib")
s

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [103]:
m = joblib.load("model.joblib")
m

0,1,2
,steps,"[('polynomialfeatures', ...), ('ridge', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,alpha,500
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42
