In [892]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer, KNNImputer

from sklearn import set_config
set_config(display='diagram')

In [893]:
housing = pd.read_csv('../datasets/train_clean.csv')

In [894]:
housing.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [895]:
# removing outliers

housing = housing[housing['1st_flr_sf'] < 3000]
housing = housing[(housing['total_bsmt_sf'] > 0) & (housing['total_bsmt_sf'] < 3000)]
housing = housing[housing['gr_liv_area'] < 4000]
housing = housing[housing['lot_area'] < 40000]
housing = housing[housing['garage_area'] > 0]
housing = housing[housing['2nd_flr_sf'] > 0]

housing = housing[(housing['saleprice'] > 50_000) & (housing['saleprice'] < 500_000)]

X = Features to be trained on

y = What the model is trying to predict

In [896]:
X = housing[
    [
        "overall_qual",
        "gr_liv_area",
        'exter_qual',
        "kitchen_qual",
        # 'garage_cars',
        'bsmt_qual',
        'year_built',
        # "1st_flr_sf",
        # 'year_remod_add',
        'full_bath',
        'fireplace_qu',
        # "fireplaces",
        "garage_area",
        "total_bsmt_sf",
        "neighborhood",
        # 'half_bath'
    ]
]

y = housing["saleprice"]

Splitting the data into training and testing sets

In [897]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Model

In [898]:
# column transformer that uses OneHotEncoder to encode nominal features
ct = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    remainder='passthrough',
    verbose_feature_names_out=False
)

# pipeline that transforms columns, fills missing values in data using KNNInputer, generates polynomial features and feature interactions, scales data, and uses a Ridge regression model
pipe = make_pipeline(ct, KNNImputer(), PolynomialFeatures(), StandardScaler(), Ridge())

# parameters used in the grid search
params = {
    'polynomialfeatures__degree': [1, 2, 3],
    'polynomialfeatures__interaction_only': [True, False], 
    'ridge__alpha': [1, 10, 50, 100],
}

# grid searches to find the best model and parameters
gs = GridSearchCV(pipe, params)
gs.fit(X_train, y_train)

gs.best_params_

{'polynomialfeatures__degree': 2,
 'polynomialfeatures__interaction_only': False,
 'ridge__alpha': 100}

#### Scores

In [899]:
preds = gs.predict(X_test)

print('Train R2: ', gs.score(X_train, y_train))
print(' Test R2: ', gs.score(X_test, y_test))
print('    RMSE: ', mean_squared_error(y_test, preds, squared=False))

Train R2:  0.923084266537954
 Test R2:  0.8894705080840832
    RMSE:  23927.269025728034


#### Coefficients for features used

In [900]:
coefs = gs.best_estimator_.named_steps['ridge'].coef_
cols = gs.best_estimator_.named_steps['columntransformer'].get_feature_names_out()
pd.DataFrame(zip(cols, coefs)).sort_values(1)

Unnamed: 0,0,1
32,garage_area,-828.513047
7,neighborhood_Gilbert,-775.422455
17,neighborhood_SWISU,-719.703112
33,total_bsmt_sf,-626.722807
31,fireplace_qu,-605.043422
16,neighborhood_OldTown,-581.880949
22,neighborhood_Timber,-431.449704
29,year_built,-319.685635
2,neighborhood_BrkSide,-255.762049
12,neighborhood_NPkVill,-249.368512


Exterior quality, overall quality, above grade living area, full baths, basement quality and certain neightborhoods all consistently generated high coefficients

### Loop to run multiple tests of a model

In [901]:
# scores = []

# for n in range(1,51):

#     X_train, X_test, y_train, y_test = train_test_split(X, y)

#     gs.fit(X_train, y_train)
#     preds = gs.predict(X_test)
#     scores.append(mean_squared_error(y_test, preds, squared=False))

# scores

In [902]:
# np.mean(scores)

Ridge model

Mean of 50 tests = 24211.581398473027

# Baseline Model

In [903]:
baseline_model = DummyRegressor(strategy='mean')

baseline_model.fit(X_train, y_train)
baseline_preds = baseline_model.predict(X_test)

print('  Baseline R2: ', baseline_model.score(X_test, y_test))
print('Baseline RMSE: ', mean_squared_error(y_test, baseline_preds, squared=False))

  Baseline R2:  -3.0017813314797692e-05
Baseline RMSE:  71971.5016228314


# Kaggle Submission Section

In [904]:
# testing = pd.read_csv('./datasets/test.csv')

In [905]:
# # encoding ordinal columns 

# qual_without_0 = {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
# qual_with_0 = {np.nan:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}

# testing['Heating QC'] = testing['Heating QC'].map(qual_without_0)
# testing['Exter Qual'] = testing['Exter Qual'].map(qual_without_0)
# testing['Kitchen Qual'] = testing['Kitchen Qual'].map(qual_without_0)

# testing['Fireplace Qu'] = testing['Fireplace Qu'].map(qual_with_0)
# testing['Bsmt Cond'] = testing['Bsmt Cond'].map(qual_with_0)
# testing['Bsmt Qual'] = testing['Bsmt Qual'].map(qual_with_0)
# testing['Garage Qual'] = testing['Garage Qual'].map(qual_with_0)

# functionality = {'Sal':1, 'Sev':2, 'Maj2':3, 'Maj1':4, 'Mod':5, 'Min2':6, 'Min1':7, 'Typ':8}
# testing['Functional'] = testing['Functional'].map(functionality)

# basement_type = {np.nan:0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}
# testing['BsmtFin Type 1'] = testing['BsmtFin Type 1'].map(basement_type)

# testing['Central Air'] = testing['Central Air'].map({'Y':1, 'N':0})

In [906]:
# X = housing[
#     [
#         "overall_qual",
#         "gr_liv_area",
#         'exter_qual',
#         "kitchen_qual",
#         'garage_cars',
#         'bsmt_qual',
#         # 'Year Built',
#         "1st_flr_sf",
#         'year_remod_add',
#         'full_bath',
#         'fireplace_qu',
#         # 'Heating QC',
#         # 'TotRms AbvGrd',
#         # "Fireplaces",
#         # 'Open Porch SF',
#         # 'Mas Vnr Area',
#         # "Garage Area",
#         "total_bsmt_sf",
#         # "BsmtFin Type 1",
#         # 'Garage Qual',
#         # "Lot Area",
#         # 'Wood Deck SF',
#         "neighborhood",
#         # 'Functional', # look into more
#         # 'MS Zoning',
#         # 'MS SubClass',
#         # 'Street',
#         # 'Sale Type',
#         # 'Land Contour',
#         # 'Lot Config',
#         # 'Land Slope',
#         # 'Lot Shape',
#         # 'Mas Vnr Type',
#         # 'Central Air', # encode this
#         # 'Electrical',
#         # 'Condition 1'
#     ]
# ]

# y = housing["saleprice"]

In [907]:
# preds_test = gs.predict(X)
# testing['SalePrice'] = preds_test

In [908]:
# testing[['Id', 'SalePrice']].to_csv('./datasets/submit_4.csv', index=False)