In [447]:
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels import regression as sm
import sklearn.linear_model as lm
from sklearn import model_selection as ms
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [448]:
train_df = pd.read_csv(r'train.csv')
test_df = pd.read_csv(r'test.csv')

In [449]:
# train_df.info()
categorical = list(train_df.select_dtypes('object').columns)
# categorical

There are 1460 samples in the training data set and 80 features. There are 43 columns with the 'object' data type,
meaning non-numeric categorical data. These features are contained in the "categorical" list. However, notice also that
the 'MSSubClass' feature is numerical-categorical. Thus, there are actually 44 categorical features.

I will select seven non-categorical features.

In [450]:
# sns.pairplot(train_df[['SalePrice', 'LotArea', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'FullBath', '1stFlrSF', '2ndFlrSF']])

The plots we care about here are in row 1 (or column 1). There appears to be a correlation between sales price and:
Overall Quality, 1st Floor Area, 2nd Floor Area, and some slight correlations with Year of Remodelling, and
Number of Full Baths.

In [451]:
# sm.linear_model.OLS()

In [452]:
MSSubClass_encoded = pd.get_dummies(train_df[['MSSubClass']].astype(str))
train_df_dropped = train_df.drop('Id', axis=1)
df_encoded = pd.get_dummies(train_df_dropped)
df_encoded = pd.concat([df_encoded, MSSubClass_encoded], axis=1).drop('MSSubClass', axis=1)
# df_encoded.info(verbose=True, null_counts=True)
# df_encoded

In [453]:
split = ms.train_test_split(df_encoded, train_size=0.8)
train_split = split[0]
test_split = split[1]
# train_split

In [454]:
# encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
# encoder.fit(train_df)

In [455]:
all_columns = df_encoded.columns
columns = df_encoded.drop(['SalePrice'], axis=1).columns
train_split.columns[train_split.isna().any()].tolist()
# test_split.columns[test_split.isna().any()].tolist()

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

Setting values to the mean or zeroes could highly skew the results of a regression model.
I will use KNN to perform multivariate imputation, filling in the above columns.

In [456]:
# Training the inputer with train split
imputer_train = KNNImputer(n_neighbors=15, weights="uniform")
imputer_train.fit(train_split)
train_split = pd.DataFrame(imputer_train.fit_transform(df_encoded), columns = all_columns)
test_split = pd.DataFrame(imputer_train.fit_transform(test_split), columns = all_columns)

In [457]:
# training the normalizer with train split
normalize_train = StandardScaler().fit(train_split.drop(['SalePrice'], axis=1))

train_norm = normalize_train.transform(train_split.drop(['SalePrice'], axis=1))
test_norm = normalize_train.transform(test_split.drop(['SalePrice'], axis=1))

train_norm = pd.DataFrame(train_norm, columns = columns)
test_norm = pd.DataFrame(test_norm, columns = columns)

In [458]:
X = train_norm
Y = train_split['SalePrice']
X_test = test_norm
Y_test = test_split['SalePrice']

In [459]:
lin_reg = lm.LinearRegression().fit(X, Y)
lin_pred = lin_reg.predict(X_test)
# lin_reg.score(test_norm.drop(['SalePrice'], axis=1), test_norm['SalePrice'])
r2_score(Y_test, lin_pred)

0.9372192687370252

In [460]:
knn_reg = KNeighborsRegressor()
param_grid = {'n_neighbors': np.arange(1, 15)}
knn_grid_cv = ms.GridSearchCV(knn_reg, param_grid, cv=10)
knn_grid_cv.fit(X, Y)
print(knn_grid_cv.best_params_)
print(knn_grid_cv.best_score_)
display(pd.DataFrame(knn_grid_cv.cv_results_))

{'n_neighbors': 11}
0.7405634361447293


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003001,0.000448,0.004701,0.000458,1,{'n_neighbors': 1},0.704337,0.288652,0.673225,0.668503,0.728969,0.596297,0.750835,0.632051,0.345203,0.638231,0.60263,0.149873,14
1,0.002801,0.0006,0.005001,0.000632,2,{'n_neighbors': 2},0.760294,0.597509,0.785232,0.744155,0.724942,0.711895,0.783782,0.755437,0.457308,0.693007,0.701356,0.096509,13
2,0.002901,0.0003,0.005001,1e-06,3,{'n_neighbors': 3},0.791678,0.680672,0.78975,0.732312,0.732995,0.74353,0.791826,0.754361,0.515357,0.750465,0.728295,0.078027,12
3,0.003001,0.000633,0.005336,0.000447,4,{'n_neighbors': 4},0.779533,0.716608,0.789691,0.720356,0.731071,0.745167,0.763657,0.782534,0.535011,0.747364,0.731099,0.069802,11
4,0.002754,0.000518,0.005401,0.00049,5,{'n_neighbors': 5},0.767779,0.721353,0.793127,0.69067,0.733361,0.774366,0.74956,0.78386,0.549151,0.765257,0.732849,0.067945,10
5,0.003201,0.0006,0.005601,0.00049,6,{'n_neighbors': 6},0.773894,0.783611,0.795534,0.700564,0.712149,0.785216,0.754382,0.779214,0.547689,0.769856,0.740211,0.070802,3
6,0.002901,0.0003,0.005601,0.00049,7,{'n_neighbors': 7},0.770276,0.769669,0.79053,0.676201,0.700273,0.789617,0.766473,0.773471,0.543436,0.772189,0.735213,0.073298,8
7,0.003001,0.000447,0.005501,0.0005,8,{'n_neighbors': 8},0.810577,0.76478,0.791389,0.670147,0.695494,0.779921,0.763272,0.792394,0.561254,0.773031,0.740226,0.072813,2
8,0.002801,0.0004,0.005401,0.000664,9,{'n_neighbors': 9},0.80429,0.769589,0.79115,0.670639,0.694627,0.787157,0.760536,0.782024,0.558202,0.768746,0.738696,0.07264,6
9,0.003101,0.0003,0.005601,0.00049,10,{'n_neighbors': 10},0.79976,0.760243,0.791905,0.686254,0.700491,0.779018,0.76656,0.784111,0.552897,0.765767,0.738701,0.071381,5


It appears that 11 neighbours gives us the best score. However, given the standard error of about 0.07, a score of at least
0.67 performs similarly. Thus, the "rule-of-thumb" best selection could be argued to be k=2 KNN.


In [475]:
ridge_reg = lm.RidgeCV(alphas=np.linspace(0.001,1000,30), cv=10)
ridge_reg.fit(X, Y)
print(ridge_reg.best_score_)
print(ridge_reg.alpha_)

0.8505264801075464
758.6209310344827


In [482]:
ridge_reg = lm.Ridge()
param_grid = {'alpha': np.linspace(0.001,1000,30)}
ridge_grid_cv = ms.GridSearchCV(ridge_reg, param_grid, cv=10)
ridge_grid_cv.fit(X, Y)
print(ridge_grid_cv.best_params_)
print(ridge_grid_cv.best_score_)
display(pd.DataFrame(ridge_grid_cv.cv_results_))

{'alpha': 758.6209310344827}
0.8505264801075464


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.007802,0.0007486405,0.001601,0.000490174,0.001,{'alpha': 0.001},0.882797,0.824154,0.912537,0.751018,0.900222,0.589256,0.886591,0.895929,0.420825,-1.254833,0.58085,0.630833,30
1,0.007501,0.000670914,0.001901,0.0003001774,34.483724,{'alpha': 34.483724137931034},0.897533,0.830037,0.915897,0.761106,0.90497,0.72967,0.890851,0.898326,0.464817,0.900752,0.819396,0.13341,29
2,0.007902,0.0002999386,0.0016,0.0004900475,68.966448,{'alpha': 68.96644827586208},0.904408,0.836867,0.916859,0.767384,0.904662,0.759471,0.892386,0.898545,0.487943,0.899918,0.826844,0.12569,28
3,0.007701,0.0004584951,0.001901,0.0003003123,103.449172,{'alpha': 103.44917241379312},0.908695,0.842287,0.91715,0.771488,0.903451,0.779878,0.893177,0.898464,0.504712,0.899076,0.831838,0.120208,27
4,0.007687,0.0004507647,0.002,5.545475e-07,137.931897,{'alpha': 137.93189655172415},0.911698,0.846823,0.917189,0.774447,0.90196,0.795073,0.893558,0.8983,0.517851,0.898299,0.83552,0.115994,26
5,0.007902,0.0003000182,0.0018,0.000400138,172.414621,{'alpha': 172.41462068965518},0.913944,0.85071,0.917125,0.776705,0.900367,0.806892,0.893685,0.898104,0.528586,0.897608,0.838373,0.112608,25
6,0.007596,0.0004964091,0.002001,4.27162e-07,206.897345,{'alpha': 206.89734482758624},0.915691,0.854083,0.917018,0.778492,0.898738,0.816339,0.893641,0.897891,0.537597,0.897002,0.840649,0.109806,24
7,0.007702,0.0006405602,0.001901,0.0002999785,241.380069,{'alpha': 241.38006896551727},0.917084,0.857033,0.916891,0.779939,0.897101,0.824037,0.893474,0.897666,0.545305,0.896473,0.8425,0.107435,23
8,0.008002,0.0006326361,0.001501,0.0005000355,275.862793,{'alpha': 275.86279310344827},0.918214,0.859625,0.916757,0.781132,0.89547,0.830397,0.893215,0.897431,0.551995,0.896008,0.844024,0.105395,22
9,0.009102,0.001044202,0.0021,0.0003000977,310.345517,{'alpha': 310.3455172413793},0.91914,0.861912,0.91662,0.782125,0.893852,0.835706,0.892885,0.897186,0.557867,0.895596,0.845289,0.103614,21
