In [15]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.impute import KNNImputer

from sklearn import set_config
set_config(display='diagram')

In [16]:
housing = pd.read_csv('../datasets/train_clean.csv')

In [17]:
housing.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [18]:
# removing outliers

housing = housing[housing['1st_flr_sf'] < 3000]
housing = housing[(housing['total_bsmt_sf'] > 0) & (housing['total_bsmt_sf'] < 3000)]
housing = housing[housing['gr_liv_area'] < 4000]
housing = housing[housing['lot_area'] < 40000]
housing = housing[housing['garage_area'] > 0]
housing = housing[housing['2nd_flr_sf'] > 0]

housing = housing[(housing['saleprice'] > 50_000) & (housing['saleprice'] < 500_000)]

X = Features to be trained on

y = What the model is trying to predict

In [19]:
X = housing[
    [
        "overall_qual",
        "gr_liv_area",
        'exter_qual',
        "kitchen_qual",
        'garage_cars',
        'bsmt_qual',
        # 'Year Built',
        "1st_flr_sf",
        # 'year_remod_add',
        'full_bath',
        # 'fireplace_qu',
        "fireplaces",
        # "Garage Area",
        "total_bsmt_sf",
        "neighborhood",
    ]
]

y = housing["saleprice"]

Splitting data into training and testing sets

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Model

In [31]:
# column transformer that uses OneHotEncoder to encode nominal features
ct = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    remainder='passthrough',
    verbose_feature_names_out=False
)

# pipeline that transforms columns, fills missing values in data using KNNInputer, generates polynomial features and feature interactions, scales data, and uses a Ridge regression model
pipe = make_pipeline(ct, KNNImputer(), PolynomialFeatures(), StandardScaler(), Lasso(max_iter=10_000))

# parameters used in the grid search
params = {
    'polynomialfeatures__degree': [1, 2, 3],
    'polynomialfeatures__interaction_only': [True, False], 
    'lasso__alpha': [600, 800, 1000]
}

# grid searches to find the best model and parameters
gs = GridSearchCV(pipe, params)
gs.fit(X_train, y_train)

gs.best_params_

{'lasso__alpha': 1000,
 'polynomialfeatures__degree': 2,
 'polynomialfeatures__interaction_only': False}

#### Scores

In [22]:
preds = gs.predict(X_test)

print('Train R2: ', gs.score(X_train, y_train))
print(' Test R2: ', gs.score(X_test, y_test))
print('    RMSE: ', mean_squared_error(y_test, preds, squared=False))

Train R2:  0.9158342140161198
 Test R2:  0.8802721047839671
    RMSE:  23184.3286400092


#### Coefficients for features used

In [23]:
coefs = gs.best_estimator_.named_steps['lasso'].coef_
cols = gs.best_estimator_.named_steps['columntransformer'].get_feature_names_out()
pd.DataFrame(zip(cols, coefs)).sort_values(1)

Unnamed: 0,0,1
0,neighborhood_Blueste,0.0
32,full_bath,-0.0
31,1st_flr_sf,0.0
30,bsmt_qual,0.0
29,garage_cars,0.0
28,kitchen_qual,0.0
27,exter_qual,0.0
26,gr_liv_area,0.0
25,overall_qual,-0.0
23,neighborhood_Timber,0.0


### Loop to run multiple tests of a model

In [32]:
# scores = []

# for n in range(1,51):

#     X_train, X_test, y_train, y_test = train_test_split(X, y)

#     gs.fit(X_train, y_train)
#     preds = gs.predict(X_test)
#     scores.append(mean_squared_error(y_test, preds, squared=False))

# np.mean(scores)

24975.122479528804

Lasso model

Mean of 50 tests = 24975.122479528804

# Kaggle Submission Section

In [26]:
# testing = pd.read_csv('./datasets/test.csv')

In [27]:
# # encoding ordinal columns 

# qual_without_0 = {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
# qual_with_0 = {np.nan:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}

# testing['Heating QC'] = testing['Heating QC'].map(qual_without_0)
# testing['Exter Qual'] = testing['Exter Qual'].map(qual_without_0)
# testing['Kitchen Qual'] = testing['Kitchen Qual'].map(qual_without_0)

# testing['Fireplace Qu'] = testing['Fireplace Qu'].map(qual_with_0)
# testing['Bsmt Cond'] = testing['Bsmt Cond'].map(qual_with_0)
# testing['Bsmt Qual'] = testing['Bsmt Qual'].map(qual_with_0)
# testing['Garage Qual'] = testing['Garage Qual'].map(qual_with_0)

# functionality = {'Sal':1, 'Sev':2, 'Maj2':3, 'Maj1':4, 'Mod':5, 'Min2':6, 'Min1':7, 'Typ':8}
# testing['Functional'] = testing['Functional'].map(functionality)

# basement_type = {np.nan:0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}
# testing['BsmtFin Type 1'] = testing['BsmtFin Type 1'].map(basement_type)

# testing['Central Air'] = testing['Central Air'].map({'Y':1, 'N':0})

In [28]:
# X = housing[
#     [
#         "overall_qual",
#         "gr_liv_area",
#         'exter_qual',
#         "kitchen_qual",
#         'garage_cars',
#         'bsmt_qual',
#         # 'Year Built',
#         "1st_flr_sf",
#         'year_remod_add',
#         'full_bath',
#         'fireplace_qu',
#         # 'Heating QC',
#         # 'TotRms AbvGrd',
#         # "Fireplaces",
#         # 'Open Porch SF',
#         # 'Mas Vnr Area',
#         # "Garage Area",
#         "total_bsmt_sf",
#         # "BsmtFin Type 1",
#         # 'Garage Qual',
#         # "Lot Area",
#         # 'Wood Deck SF',
#         "neighborhood",
#         # 'Functional', # look into more
#         # 'MS Zoning',
#         # 'MS SubClass',
#         # 'Street',
#         # 'Sale Type',
#         # 'Land Contour',
#         # 'Lot Config',
#         # 'Land Slope',
#         # 'Lot Shape',
#         # 'Mas Vnr Type',
#         # 'Central Air', # encode this
#         # 'Electrical',
#         # 'Condition 1'
#     ]
# ]

# y = housing["saleprice"]

In [29]:
# preds_test = gs.predict(X)
# testing['SalePrice'] = preds_test

In [30]:
# testing[['Id', 'SalePrice']].to_csv('./datasets/submit_4.csv', index=False)