## Step 1 - Data Ingestion


In [2]:
from warnings import filterwarnings
filterwarnings("ignore")

In [3]:
import pandas as pd
df = pd.read_csv("training_set.csv", na_values=["","NA"], keep_default_na=False)

## Target is SalesPrice

## Step 2 - Perform Basic Data Quality Checks

In [4]:
df.duplicated().sum()

np.int64(0)

In [5]:
df.shape

(1460, 81)

In [6]:
df = df.drop_duplicates(keep="first").reset_index(drop=True)
df.shape

(1460, 81)

In [7]:
m = df.isna().sum()
m

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [8]:
m[m > 0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

## Step 3 - Separate X and Y (SalesPrice)
and remove unneccessary columns

In [10]:
X = df.drop(columns = ["Id","SalePrice"])
Y = df["SalePrice"]

In [11]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [12]:
Y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [13]:
X.nunique()

MSSubClass         15
MSZoning            5
LotFrontage       110
LotArea          1073
Street              2
                 ... 
MiscVal            21
MoSold             12
YrSold              5
SaleType            9
SaleCondition       6
Length: 79, dtype: int64

In [14]:
Y.nunique()

663

In [15]:
def get_high_cardinality(X : pd.DataFrame, threshold : float = 0.9) -> list[str]:
    unique_count = X.select_dtypes(include="object").nunique()
    cardinality = unique_count / len(X)
    print(cardinality)
    high_cardinality = cardinality[cardinality >= threshold]
    return high_cardinality.index.tolist()

In [16]:
high_card = get_high_cardinality(X)

MSZoning         0.003425
Street           0.001370
Alley            0.001370
LotShape         0.002740
LandContour      0.002740
Utilities        0.001370
LotConfig        0.003425
LandSlope        0.002055
Neighborhood     0.017123
Condition1       0.006164
Condition2       0.005479
BldgType         0.003425
HouseStyle       0.005479
RoofStyle        0.004110
RoofMatl         0.005479
Exterior1st      0.010274
Exterior2nd      0.010959
MasVnrType       0.002740
ExterQual        0.002740
ExterCond        0.003425
Foundation       0.004110
BsmtQual         0.002740
BsmtCond         0.002740
BsmtExposure     0.002740
BsmtFinType1     0.004110
BsmtFinType2     0.004110
Heating          0.004110
HeatingQC        0.003425
CentralAir       0.001370
Electrical       0.003425
KitchenQual      0.002740
Functional       0.004795
FireplaceQu      0.003425
GarageType       0.004110
GarageFinish     0.002055
GarageQual       0.003425
GarageCond       0.003425
PavedDrive       0.002055
PoolQC      

In [17]:
high_card


[]

In [18]:
X = X.drop(columns=high_card)
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


## Step 4 - Apply Train test split first

In [19]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [20]:
xtrain.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
254,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1066,60,RL,59.0,7837,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,5,2009,WD,Normal
638,30,RL,67.0,8777,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,5,2008,WD,Normal
799,50,RL,60.0,7200,Pave,,Reg,Lvl,AllPub,Corner,...,0,0,,MnPrv,,0,6,2007,WD,Normal
380,50,RL,50.0,5000,Pave,Pave,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,5,2010,WD,Normal


In [21]:
ytrain.head()

254     145000
1066    178000
638      85000
799     175000
380     127000
Name: SalePrice, dtype: int64

In [22]:
xtest.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
892,20,RL,70.0,8414,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,2,2006,WD,Normal
1105,60,RL,98.0,12256,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,4,2010,WD,Normal
413,30,RM,56.0,8960,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,3,2010,WD,Normal
522,50,RM,50.0,5000,Pave,,Reg,Lvl,AllPub,Corner,...,0,0,,,,0,10,2006,WD,Normal
1036,20,RL,89.0,12898,Pave,,IR1,HLS,AllPub,Inside,...,0,0,,,,0,9,2009,WD,Normal


In [23]:
ytest.head()

892     154500
1105    325000
413     115000
522     159000
1036    315500
Name: SalePrice, dtype: int64

In [24]:
xtrain.shape

(1168, 79)

In [25]:
xtest.shape

(292, 79)

In [26]:
ytrain.shape

(1168,)

In [27]:
ytest.shape

(292,)

## Step 5 - Apply Preprocessing on X

1. Continous Feature -> SimpleImputer(strategy="mean).StandardScaler()
2. Categorical Feature -> SimpleImputer, OrdinalEncoder, StandardSclaer

In [28]:
cat = xtrain.select_dtypes(include="object").columns.tolist()
cat

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [29]:
con = xtrain.select_dtypes(include="number").columns.tolist()
con

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [30]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [31]:
num_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)

In [32]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    StandardScaler()
)

In [33]:
pre = ColumnTransformer(
    [
        ("num", num_pipe, con),
        ("cat", cat_pipe, cat)
    ]
).set_output(transform="pandas")

In [34]:
pre.fit(xtrain) # type: ignore

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [35]:
# Transform train and test data
xtrain_pre = pre.transform(xtrain) # type: ignore
xtest_pre = pre.transform(xtest) # type: ignore

In [36]:
xtrain_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
254,-0.866764,-0.015314,-0.212896,-0.820445,0.372217,-0.455469,-1.346063,-0.601531,1.037269,-0.285504,...,-0.68768,-0.27283,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772
1066,0.07411,-0.505211,-0.265245,-0.088934,1.268609,0.718609,0.439214,-0.601531,-0.971996,-0.285504,...,-0.68768,-0.27283,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772
638,-0.631546,-0.148922,-0.177841,-0.820445,1.268609,-1.988293,-1.683818,-0.601531,-0.971996,-0.285504,...,-0.68768,0.957537,0.214591,0.187569,-1.707294,-0.055603,0.259336,0.044155,0.316662,0.201772
799,-0.161109,-0.460675,-0.324474,-0.820445,1.268609,-1.107734,-1.683818,0.859229,0.267995,-0.285504,...,1.589316,0.957537,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772
380,-0.161109,-0.906036,-0.529035,-0.820445,0.372217,-1.531707,-1.683818,-0.601531,-0.49692,-0.285504,...,1.589316,0.957537,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772


In [37]:
xtest_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
892,-0.866764,-0.015314,-0.211594,-0.088934,2.165,-0.259789,0.87347,-0.601531,0.472844,-0.285504,...,-0.68768,-0.27283,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772
1105,0.07411,1.231697,0.145643,1.374088,-0.524174,0.751222,0.487465,1.496862,1.276986,-0.285504,...,-0.68768,-0.27283,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772
413,-0.631546,-0.638819,-0.160826,-0.820445,0.372217,-1.433867,-1.683818,-0.601531,-0.971996,-0.285504,...,1.589316,0.957537,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772
522,-0.161109,-0.906036,-0.529035,-0.088934,1.268609,-0.781602,-1.683818,-0.601531,-0.102477,-0.285504,...,1.589316,0.957537,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772
1036,-0.866764,0.830872,0.205338,2.105599,-0.524174,1.175195,1.114724,-0.195765,1.255193,-0.285504,...,-0.68768,-1.503196,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772


## Step 5 - Model Building Experiment with different models

1. Baseline - Linear Regression
2. Apply Feature Selection - Linear Regression
3. Apply Ridge on selected features
4. Apply Lasso on Selected Features

### Model 1 - Baseline Model Linear Regression

In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

baseline = LinearRegression()
scores = cross_val_score(baseline, xtrain_pre, ytrain, cv=5, scoring="r2")
scores

array([0.87110095, 0.73462181, 0.71108825, 0.86830822, 0.61764144])

In [39]:
scores.mean()

np.float64(0.7605521374603141)

In [40]:
scores.std()

np.float64(0.0973395488341272)

In [41]:
# Fit the entire model
baseline.fit(xtrain_pre, ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [42]:
baseline.intercept_

np.float64(181441.5419520549)

In [43]:
baseline.coef_

array([-4.32087919e+03, -4.62446666e+03,  4.13673839e+03,  1.32379046e+04,
        6.31208226e+03,  6.72265756e+03,  1.08417910e+03,  5.17471994e+03,
        3.79234333e+03,  1.36923721e+03, -1.30321897e+03,  3.12220377e+03,
        8.52131517e+03,  7.74487208e+03, -1.54942961e+03,  1.26330881e+04,
        3.02420731e+03, -1.17662737e+03,  6.32078857e+02, -2.38471709e+02,
       -2.14490870e+03, -3.41738080e+03,  7.33040022e+03,  4.07088110e+03,
        1.05848111e+03,  6.07004899e+03,  1.15632715e+03,  2.35449930e+03,
       -1.17201972e+02, -1.13744181e+03,  1.33784847e+03,  2.85237140e+03,
        1.65953108e+04, -7.52126479e+02, -5.91630126e+02, -8.26087953e+02,
       -1.65667243e+03,  1.33754834e+03, -2.27290122e+02, -1.40859413e+03,
        1.10286977e+03, -1.41597034e+03, -3.62952473e+01,  2.39767927e+03,
        1.89322643e+03, -3.70252107e+02, -2.92258006e+03, -2.85333953e+03,
       -1.22979093e+03,  1.45065505e+03,  3.15679563e+03, -5.05055524e+03,
        3.54003810e+03,  

In [44]:
# R2 score on train
r2_train = baseline.score(xtrain_pre, ytrain)
r2_train

0.8701199665674562

In [45]:
# R2 score on test
r2_test = baseline.score(xtest_pre, ytest)
r2_test

0.8259893003880228

In [46]:
# Generalization error
gen_err = abs(r2_train - r2_test)
gen_err

0.0441306661794334

### Create a function to save above results for each model

In [47]:
# Initializing a balnk results list
results = []

def evaluate_and_log_model(results, model, xtrain, ytrain, xtest, ytest, description):
    #Cross validation data on train
    scores = cross_val_score(model, xtrain, ytrain, cv=5, scoring="r2")
    mean_score = scores.mean().round(4)
    std_score = scores.std().round(4)
    #Fit the model
    model.fit(xtrain, ytrain)
    #Evaluate r2 score on train and test
    r2_train = round(model.score(xtrain, ytrain), 4)
    r2_test = round(model.score(xtest, ytest), 4)
    #Generalization error
    gen_err = abs(round(r2_train - r2_test, 4))
    # Save above results in a dictionary
    r = {
        "description": description,
        "name":type(model).__name__,
        "cv_mean":mean_score,
        "cv_std":std_score,
        "r2_train" : r2_train,
        "r2_test": r2_test,
        "gen_err": gen_err
    }
    print(r)
    results.append(r)
    return model

In [48]:
baseline = evaluate_and_log_model(
    results, LinearRegression(), xtrain_pre, ytrain, xtest_pre, ytest, description="Baseline Linear Regression"
)

{'description': 'Baseline Linear Regression', 'name': 'LinearRegression', 'cv_mean': np.float64(0.7606), 'cv_std': np.float64(0.0973), 'r2_train': 0.8701, 'r2_test': 0.826, 'gen_err': 0.0441}


### Model 2 - Feature Selection on Linear Regression

In [49]:
from sklearn.feature_selection import SequentialFeatureSelector

sel = SequentialFeatureSelector(
    LinearRegression(), n_features_to_select="auto", direction="forward"
)
sel.fit(xtrain_pre, ytrain)

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [50]:
sel_cols = sel.get_feature_names_out()
sel_cols

array(['num__MSSubClass', 'num__LotFrontage', 'num__LotArea',
       'num__OverallQual', 'num__OverallCond', 'num__YearBuilt',
       'num__MasVnrArea', 'num__BsmtUnfSF', 'num__GrLivArea',
       'num__BsmtFullBath', 'num__BsmtHalfBath', 'num__Fireplaces',
       'num__GarageCars', 'num__WoodDeckSF', 'num__3SsnPorch',
       'cat__MSZoning', 'cat__LotShape', 'cat__LandContour',
       'cat__Utilities', 'cat__LandSlope', 'cat__Neighborhood',
       'cat__RoofStyle', 'cat__Exterior1st', 'cat__MasVnrType',
       'cat__ExterQual', 'cat__BsmtQual', 'cat__BsmtCond',
       'cat__BsmtExposure', 'cat__BsmtFinType1', 'cat__HeatingQC',
       'cat__CentralAir', 'cat__Electrical', 'cat__KitchenQual',
       'cat__Functional', 'cat__FireplaceQu', 'cat__GarageType',
       'cat__PavedDrive', 'cat__Fence', 'cat__SaleCondition'],
      dtype=object)

In [51]:
len(sel_cols)

39

In [52]:
xtrain_pre_sel = sel.transform(xtrain_pre)
xtrain_pre_sel

array([[-0.8667643 , -0.01531368, -0.21289571, ...,  0.29282582,
         0.25933624,  0.20177167],
       [ 0.07410996, -0.50521084, -0.26524463, ...,  0.29282582,
         0.25933624,  0.20177167],
       [-0.63154574, -0.14892199, -0.17784146, ..., -1.7072944 ,
         0.25933624,  0.20177167],
       ...,
       [-0.8667643 , -0.46067473, -0.23409563, ..., -3.70741463,
         0.25933624,  0.20177167],
       [-0.16110861, -0.68335526, -0.28337613, ...,  0.29282582,
        -4.16263063,  0.20177167],
       [ 1.48542135, -0.77242747, -0.65139925, ...,  0.29282582,
         0.25933624,  0.20177167]], shape=(1168, 39))

In [53]:
xtrain_pre_sel.shape

(1168, 39)

In [54]:
xtest_pre_sel =sel.transform(xtest_pre)
xtest_pre_sel

array([[-0.8667643 , -0.01531368, -0.21159396, ...,  0.29282582,
         0.25933624,  0.20177167],
       [ 0.07410996,  1.23169728,  0.14564323, ...,  0.29282582,
         0.25933624,  0.20177167],
       [-0.63154574, -0.63881915, -0.16082574, ...,  0.29282582,
         0.25933624,  0.20177167],
       ...,
       [ 0.07410996, -0.32706642, -0.23158511, ...,  0.29282582,
         0.25933624,  0.20177167],
       [ 0.30932853, -0.46067473, -0.14929596, ..., -1.7072944 ,
         0.25933624,  0.20177167],
       [-0.8667643 , -0.01531368, -0.2389307 , ...,  0.29282582,
         0.25933624,  0.20177167]], shape=(292, 39))

In [55]:
xtest_pre_sel.shape

(292, 39)

In [56]:
model_feat_sel = evaluate_and_log_model(
    results, LinearRegression(), xtrain_pre_sel, ytrain, xtest_pre_sel, ytest, description="Featue selection on Linear Regression"
)

{'description': 'Featue selection on Linear Regression', 'name': 'LinearRegression', 'cv_mean': np.float64(0.8253), 'cv_std': np.float64(0.0591), 'r2_train': 0.8405, 'r2_test': 0.84, 'gen_err': 0.0005}


## Model 3 - Ridge on selected features

In [57]:
alphas_list = [0, 0 , 0.1, 1, 20, 50, 200, 500, 10000]

In [58]:
params = {
    "alpha":alphas_list
}
print(params)

{'alpha': [0, 0, 0.1, 1, 20, 50, 200, 500, 10000]}


In [59]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

gscv_ridge = GridSearchCV(Ridge(random_state=42), params, cv=5, scoring="r2")
gscv_ridge.fit(xtrain_pre_sel, ytrain) # Apply this on selected features only

0,1,2
,estimator,Ridge(random_state=42)
,param_grid,"{'alpha': [0, 0, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,50
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [60]:
gscv_ridge.best_params_

{'alpha': 50}

In [61]:
gscv_ridge.best_score_

np.float64(0.8265490715182413)

In [62]:
best_ridge = gscv_ridge.best_estimator_
best_ridge

0,1,2
,alpha,50
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [63]:
sel_ridge = evaluate_and_log_model(
    results, best_ridge, xtrain_pre_sel, ytrain, xtest_pre_sel, ytest,
    description="Ridge on Selected Features"
)

{'description': 'Ridge on Selected Features', 'name': 'Ridge', 'cv_mean': np.float64(0.8265), 'cv_std': np.float64(0.0565), 'r2_train': 0.84, 'r2_test': 0.8401, 'gen_err': 0.0001}


## Model 4 - Lasso on Selected features

In [64]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

gscv_lasso = GridSearchCV(Lasso(random_state=42), params, cv=5, scoring="r2")
gscv_lasso.fit(xtrain_pre_sel, ytrain)

0,1,2
,estimator,Lasso(random_state=42)
,param_grid,"{'alpha': [0, 0, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [65]:
gscv_lasso.best_score_

np.float64(0.8252520790916694)

In [66]:
gscv_lasso.best_params_

{'alpha': 0.1}

In [67]:
best_lasso = gscv_lasso.best_estimator_
best_lasso

0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [68]:
sel_lasso = evaluate_and_log_model(
    results, best_lasso, xtrain_pre_sel, ytrain, xtest_pre_sel, ytest,
    description="Lasso on Selected features"
)

{'description': 'Lasso on Selected features', 'name': 'Lasso', 'cv_mean': np.float64(0.8253), 'cv_std': np.float64(0.0591), 'r2_train': 0.8405, 'r2_test': 0.84, 'gen_err': 0.0005}


## Model 5 - Polynomial feature engineering with Ridge 

In [69]:
from sklearn.preprocessing import PolynomialFeatures

In [70]:
poly_ridge = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    Ridge(random_state=42) # type: ignore
)

In [71]:
params2 = {
    "ridge__alpha":alphas_list
}

In [72]:
gscv_poly_ridge = GridSearchCV(poly_ridge, params2, cv=5, scoring="r2")
gscv_poly_ridge.fit(xtrain_pre_sel, ytrain)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'ridge__alpha': [0, 0, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,alpha,500
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [73]:
gscv_poly_ridge.best_params_

{'ridge__alpha': 500}

In [74]:
gscv_poly_ridge.best_score_

np.float64(0.7996925156615364)

In [75]:
best_poly_ridge = gscv_poly_ridge.best_estimator_
best_poly_ridge

0,1,2
,steps,"[('polynomialfeatures', ...), ('ridge', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,alpha,500
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [76]:
sel_poly_ridge = evaluate_and_log_model(
    results, best_poly_ridge, xtrain_pre_sel, ytrain, xtest_pre_sel, ytest,
    description="Polynomial Ridge on Selected Features"
)

{'description': 'Polynomial Ridge on Selected Features', 'name': 'Pipeline', 'cv_mean': np.float64(0.7997), 'cv_std': np.float64(0.046), 'r2_train': 0.9373, 'r2_test': 0.8254, 'gen_err': 0.1119}


## Model 6 - Polynomial features with Lasso

In [77]:
poly_lasso = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    Lasso(random_state=42)
)

In [78]:
param3 = {
    "lasso__alpha":alphas_list
}

In [79]:
gscv_poly_lasso = GridSearchCV(poly_lasso, param3, cv=5, scoring="r2")
gscv_poly_lasso.fit(xtrain_pre_sel, ytrain)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'lasso__alpha': [0, 0, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,alpha,500
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [80]:
gscv_poly_lasso.best_params_

{'lasso__alpha': 500}

In [81]:
gscv_poly_lasso.best_score_

np.float64(0.8291434466892438)

In [82]:
best_poly_lasso = gscv_poly_lasso.best_estimator_
best_poly_lasso

0,1,2
,steps,"[('polynomialfeatures', ...), ('lasso', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,alpha,500
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [83]:
sel_poly_lasso = evaluate_and_log_model(
    results, best_poly_lasso, xtrain_pre_sel, ytrain, xtest_pre_sel, ytest,
    description="Poly Lasso on selected features"
)

{'description': 'Poly Lasso on selected features', 'name': 'Pipeline', 'cv_mean': np.float64(0.8291), 'cv_std': np.float64(0.0639), 'r2_train': 0.9439, 'r2_test': 0.863, 'gen_err': 0.0809}


In [84]:
res_df = pd.DataFrame(results)
res_df

Unnamed: 0,description,name,cv_mean,cv_std,r2_train,r2_test,gen_err
0,Baseline Linear Regression,LinearRegression,0.7606,0.0973,0.8701,0.826,0.0441
1,Featue selection on Linear Regression,LinearRegression,0.8253,0.0591,0.8405,0.84,0.0005
2,Ridge on Selected Features,Ridge,0.8265,0.0565,0.84,0.8401,0.0001
3,Lasso on Selected features,Lasso,0.8253,0.0591,0.8405,0.84,0.0005
4,Polynomial Ridge on Selected Features,Pipeline,0.7997,0.046,0.9373,0.8254,0.1119
5,Poly Lasso on selected features,Pipeline,0.8291,0.0639,0.9439,0.863,0.0809


In [85]:
sort_df = res_df.sort_values(by="gen_err")
sort_df

Unnamed: 0,description,name,cv_mean,cv_std,r2_train,r2_test,gen_err
2,Ridge on Selected Features,Ridge,0.8265,0.0565,0.84,0.8401,0.0001
1,Featue selection on Linear Regression,LinearRegression,0.8253,0.0591,0.8405,0.84,0.0005
3,Lasso on Selected features,Lasso,0.8253,0.0591,0.8405,0.84,0.0005
0,Baseline Linear Regression,LinearRegression,0.7606,0.0973,0.8701,0.826,0.0441
5,Poly Lasso on selected features,Pipeline,0.8291,0.0639,0.9439,0.863,0.0809
4,Polynomial Ridge on Selected Features,Pipeline,0.7997,0.046,0.9373,0.8254,0.1119


In [86]:
sort_df.to_csv("evaluation.csv", index=False)

## Step 6 - Evaluate the model

In [87]:
# Evaluate in detail
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score
)

def evaluate_model(model, x, y):
    #Predict results for x
    ypred = model.predict(x)
    # Get the metrices
    rmse = root_mean_squared_error(y, ypred)
    mae = mean_absolute_error(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)
    r2 = r2_score(y, ypred)
    # Print the results
    print(f"RMSE : {rmse:.2f}")
    print(f"MAE : {mae:.2f}")
    print(f"MAPE : {mape:.2%}")
    print(f"R2 : {r2:.2%}")

In [88]:
evaluate_model(sel_ridge, xtrain_pre_sel, ytrain)

RMSE : 30895.41
MAE : 18644.98
MAPE : 10.73%
R2 : 84.00%


In [89]:
evaluate_model(sel_ridge, xtest_pre_sel, ytest)

RMSE : 35025.13
MAE : 21748.92
MAPE : 13.13%
R2 : 84.01%


## Out of Sample Prediction

In [90]:
xnew = pd.read_csv("testing_set.csv", na_values=["","NA"], keep_default_na=False)
xnew

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [91]:
xnew_pre = pre.transform(xnew) # type: ignore
xnew_pre

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,-0.866764,0.430047,0.086693,-0.820445,0.372217,-0.325016,-1.153060,-0.601531,0.047891,0.625025,...,-0.687680,0.957537,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772
1,-0.866764,0.474583,0.332630,-0.088934,0.372217,-0.422856,-1.297812,0.024509,1.039448,-0.285504,...,-0.687680,0.957537,0.214591,0.187569,0.292826,-0.055603,0.259336,-20.584838,0.316662,0.201772
2,0.074110,0.162831,0.291997,-0.820445,-0.524174,0.849062,0.632217,-0.601531,0.751787,-0.285504,...,-0.687680,-1.503196,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772
3,0.074110,0.340975,-0.066170,-0.088934,0.372217,0.881675,0.632217,-0.485598,0.339910,-0.285504,...,-0.687680,-1.503196,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772
4,1.485421,-1.217789,-0.528570,1.374088,-0.524174,0.685996,0.342712,-0.601531,-0.398854,-0.285504,...,-0.687680,-0.272830,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.426296,-2.197583,-0.813932,-1.551955,1.268609,-0.031496,-0.718804,-0.601531,-0.971996,-0.285504,...,-0.687680,0.957537,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772
1455,2.426296,-2.197583,-0.817837,-1.551955,-0.524174,-0.031496,-0.718804,-0.601531,-0.422826,-0.285504,...,1.020067,0.957537,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,-3.509556
1456,-0.866764,3.992936,0.865697,-0.820445,1.268609,-0.357629,0.535715,-0.601531,1.695401,-0.285504,...,1.589316,0.957537,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,-3.509556
1457,0.662156,-0.371603,-0.023119,-0.820445,-0.524174,0.685996,0.342712,-0.601531,-0.237590,-0.285504,...,-0.687680,0.957537,0.214591,0.187569,0.292826,-0.055603,0.259336,0.044155,0.316662,0.201772


In [92]:
xnew_pre_sel = sel.transform(xnew_pre)
xnew_pre_sel

array([[-0.8667643 ,  0.43004738,  0.08669258, ...,  0.29282582,
         0.25933624,  0.20177167],
       [-0.8667643 ,  0.47458348,  0.33263021, ...,  0.29282582,
         0.25933624,  0.20177167],
       [ 0.07410996,  0.16283075,  0.29199704, ...,  0.29282582,
         0.25933624,  0.20177167],
       ...,
       [-0.8667643 ,  3.99293582,  0.86569654, ...,  0.29282582,
         0.25933624, -3.50955605],
       [ 0.66215637, -0.37160252, -0.02311926, ...,  0.29282582,
         0.25933624,  0.20177167],
       [ 0.07410996,  0.16283075, -0.09880669, ...,  0.29282582,
         0.25933624,  0.20177167]], shape=(1459, 39))

In [93]:
preds = sel_ridge.predict(xnew_pre_sel)
preds

array([105506.26079942, 162056.06301965, 168038.37362102, ...,
       139270.76859008, 108367.25111377, 241984.07361903], shape=(1459,))

In [94]:
xnew["SalesPrice_pred"] = preds.round(2)
xnew

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalesPrice_pred
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,105506.26
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,162056.06
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,168038.37
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,183098.17
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,,,,0,1,2010,WD,Normal,187756.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,70642.63
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,66515.67
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,139270.77
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,108367.25


In [95]:
xnew.to_csv("results.csv", index=False)

## Step 7 - Save Preprocessor, Selector and Model in Joblib Format

In [96]:
import joblib

joblib.dump(pre, "preprocessor.joblib")

['preprocessor.joblib']

In [97]:
joblib.dump(sel, "selector.joblib")

['selector.joblib']

In [98]:
joblib.dump(sel_ridge, "model.joblib")

['model.joblib']

In [99]:
p = joblib.load("preprocessor.joblib")
p

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [100]:
s = joblib.load("selector.joblib")
s

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [101]:
m = joblib.load("model.joblib")
m

0,1,2
,alpha,50
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42
