In [32]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [66]:
import sklearn
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

In [34]:
housing = pd.read_csv("C:\\Users\\cheta\\Downloads\\Housing.csv")

In [35]:
housing.head(4)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished


In [36]:
def binary_maping(x):
    return x.map({'yes':1,'no':0})

In [37]:
binary_vars_list = ['mainroad','guestroom','basement','hotwaterheating','airconditioning','prefarea']

In [38]:
housing[binary_vars_list] = housing[binary_vars_list].apply(binary_maping)

In [39]:
housing.head(4)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished


In [40]:
status = pd.get_dummies(housing['furnishingstatus'],drop_first=True)

In [41]:
housing = pd.concat([housing,status],axis=1)

In [42]:
housing.head(4)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished,0,0


In [43]:
housing.drop(['furnishingstatus'],axis=1,inplace=True)

In [44]:
housing.head(4)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0


In [45]:
df_train, df_test = train_test_split(housing,
                                    train_size = 0.7,
                                    test_size = 0.3,
                                    random_state = 100)

In [46]:
sc = StandardScaler()

In [47]:
numeric_vars = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking','price']

df_train[numeric_vars] = sc.fit_transform(df_train[numeric_vars])
df_test[numeric_vars] = sc.fit_transform(df_test[numeric_vars])

In [48]:
df_train.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
359,-0.575831,-0.736734,0.092755,-0.575844,-0.911674,1,0,0,0,0,0.318635,0,0,1
19,2.254239,0.632894,0.092755,1.533738,0.219752,1,0,0,0,1,0.318635,1,1,0
159,0.386778,-0.955291,0.092755,1.533738,-0.911674,1,1,1,0,1,-0.848672,0,0,0
35,1.828458,0.914591,0.092755,1.533738,2.482604,1,0,0,0,1,1.485941,0,0,0
28,2.003961,1.37599,2.811204,1.533738,0.219752,1,0,1,1,0,1.485941,0,0,1


In [49]:
df_test.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
265,-0.195156,-0.978244,-0.058038,-0.565752,0.23637,1,0,0,0,0,-0.711709,1,1,0
54,1.292163,0.321253,-0.058038,1.218544,0.23637,1,1,0,0,1,0.443937,0,1,0
171,0.232316,2.099313,-0.058038,-0.565752,-0.975026,1,0,0,0,0,0.443937,1,1,0
244,-0.120967,0.038029,-0.058038,-0.565752,0.23637,1,1,1,0,0,-0.711709,1,1,0
268,-0.205755,-0.116078,1.301706,-0.565752,0.23637,1,0,0,0,1,-0.711709,0,1,0


In [50]:
y_train = df_train.pop('price')
X_train = df_train

y_test = df_test.pop('price')
x_test = df_test

In [54]:
lr = LinearRegression()
lr.fit(X_train,y_train)

LinearRegression()

In [62]:
rfe = RFE(lr, n_features_to_select = 10)
rfe.fit(X_train,y_train)

RFE(estimator=LinearRegression(), n_features_to_select=10)

In [63]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

[('area', True, 1),
 ('bedrooms', False, 3),
 ('bathrooms', True, 1),
 ('stories', True, 1),
 ('mainroad', True, 1),
 ('guestroom', True, 1),
 ('basement', True, 1),
 ('hotwaterheating', True, 1),
 ('airconditioning', True, 1),
 ('parking', False, 2),
 ('prefarea', True, 1),
 ('semi-furnished', False, 4),
 ('unfurnished', True, 1)]

In [65]:
y_pred = rfe.predict(x_train)

In [67]:
r2_score(y_test,y_pred)

0.6544464733320661

### Introducing Cross Validation

In [70]:
cross_val_score(rfe, X_train, y_train, cv=5)

array([0.68857078, 0.68007283, 0.65706591, 0.59305924, 0.56608297])

In [71]:
folds = KFold(n_splits=5, shuffle=True, random_state=100)

In [78]:
hyper_params = [{
    'n_features_to_select': list(range(6,12))
}]

In [80]:
lm = LinearRegression()
lm.fit(X_train,y_train)

LinearRegression()

In [81]:
rfe = RFE(lm)

model_cv = GridSearchCV(estimator = rfe,
                     param_grid = hyper_params,
                     scoring = 'r2',
                     verbose = 1,
                     return_train_score = True)

In [82]:
model_cv.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(estimator=RFE(estimator=LinearRegression()),
             param_grid=[{'n_features_to_select': [6, 7, 8, 9, 10, 11]}],
             return_train_score=True, scoring='r2', verbose=1)

In [84]:
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_features_to_select,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.02853,0.014421,0.001624,0.003248,6,{'n_features_to_select': 6},0.579923,0.489275,0.541045,0.568366,...,0.541392,0.031933,6,0.541352,0.567559,0.625346,0.620859,0.649616,0.600946,0.040067
1,0.017583,0.001777,0.002155,0.002951,7,{'n_features_to_select': 7},0.673675,0.64726,0.555141,0.578214,...,0.596825,0.05481,5,0.606327,0.618895,0.630451,0.632542,0.657453,0.629134,0.016972
2,0.009827,0.008064,0.003523,0.006099,8,{'n_features_to_select': 8},0.667955,0.691082,0.579277,0.594132,...,0.610132,0.062535,4,0.647784,0.647751,0.64019,0.666912,0.66713,0.653953,0.011023
3,0.015769,0.00248,0.00335,0.006213,9,{'n_features_to_select': 9},0.683762,0.705161,0.649093,0.588076,...,0.637347,0.055141,2,0.656854,0.654383,0.664606,0.676592,0.695986,0.669684,0.015258
4,0.00986,0.008099,0.006257,0.007664,10,{'n_features_to_select': 10},0.688571,0.680073,0.657066,0.593059,...,0.63697,0.048739,3,0.660656,0.669792,0.668441,0.679076,0.699911,0.675575,0.013501
5,0.0,0.0,0.009373,0.007653,11,{'n_features_to_select': 11},0.691487,0.695005,0.673934,0.616475,...,0.650802,0.046375,1,0.673134,0.671408,0.678027,0.688201,0.701554,0.682465,0.011192


In [85]:
en = ElasticNet(alpha=0.01)
en.fit(X_train, y_train)
rfe = RFE(en)

In [86]:
folds

KFold(n_splits=5, random_state=100, shuffle=True)

In [87]:
model_cv = GridSearchCV(rfe,
                       param_grid = hyper_params,
                       cv = folds,
                       verbose = 1,
                       scoring='r2',
                       return_train_score = True)

model_cv.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=100, shuffle=True),
             estimator=RFE(estimator=ElasticNet(alpha=0.01)),
             param_grid=[{'n_features_to_select': [6, 7, 8, 9, 10, 11]}],
             return_train_score=True, scoring='r2', verbose=1)

In [90]:
cv_result = pd.DataFrame(model_cv.cv_results_)
cv_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_features_to_select,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.012264,0.002593,0.000885,0.001196,6,{'n_features_to_select': 6},0.514763,0.632374,0.556307,0.541037,...,0.566516,0.040583,6,0.631073,0.606874,0.60586,0.62448,0.612904,0.616238,0.009945
1,0.011811,0.006616,0.003151,0.006302,7,{'n_features_to_select': 7},0.514624,0.688746,0.552266,0.547416,...,0.570994,0.060521,5,0.638366,0.633764,0.611622,0.636279,0.630306,0.630067,0.009606
2,0.011146,0.006744,0.004218,0.006557,8,{'n_features_to_select': 8},0.559595,0.695172,0.575901,0.601908,...,0.60087,0.049127,4,0.669029,0.642809,0.662946,0.663504,0.664874,0.660633,0.009163
3,0.009099,0.004741,0.001428,0.001753,9,{'n_features_to_select': 9},0.590708,0.70992,0.585321,0.620559,...,0.619621,0.046803,3,0.67435,0.651529,0.673595,0.669243,0.669315,0.667606,0.008312
4,0.015992,0.000675,0.0,0.0,10,{'n_features_to_select': 10},0.596148,0.712039,0.588701,0.630272,...,0.628809,0.044152,2,0.678629,0.654756,0.676854,0.671268,0.678137,0.671929,0.008978
5,0.007065,0.008657,0.0094,0.007675,11,{'n_features_to_select': 11},0.597863,0.710875,0.603895,0.642134,...,0.637715,0.040297,1,0.691388,0.668312,0.684278,0.681962,0.679951,0.681178,0.007503


In [92]:
hyper_params = [
    {'alpha': [0.01,0.05,0.1,0.5,1]}
]

In [93]:
r = Ridge()

In [95]:
model_cv = GridSearchCV(estimator = r,
                       param_grid = hyper_params,
                       cv = folds,
                       scoring='r2',
                       verbose = 1,
                       return_train_score = True)
model_cv.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=100, shuffle=True),
             estimator=Ridge(),
             param_grid=[{'alpha': [0.01, 0.05, 0.1, 0.5, 1]}],
             return_train_score=True, scoring='r2', verbose=1)

In [97]:
cv_res = pd.DataFrame(model_cv.cv_results_)
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.018836,0.029688,0.00079,0.001579,0.01,{'alpha': 0.01},0.599309,0.713072,0.613245,0.627437,...,0.634877,0.040216,5,0.694368,0.672234,0.688467,0.686242,0.683746,0.685011,0.007292
1,0.006249,0.007653,0.0,0.0,0.05,{'alpha': 0.05},0.599324,0.713054,0.613196,0.627622,...,0.634924,0.040198,4,0.694367,0.672234,0.688467,0.686242,0.683745,0.685011,0.007292
2,0.003347,0.006202,0.003399,0.004172,0.1,{'alpha': 0.1},0.599341,0.713032,0.613135,0.627849,...,0.634983,0.040176,3,0.694367,0.672233,0.688466,0.686241,0.683745,0.68501,0.007292
3,0.004965,0.001594,0.003068,0.001262,0.5,{'alpha': 0.5},0.599465,0.712843,0.612645,0.629558,...,0.635416,0.040013,2,0.694344,0.672211,0.68844,0.686209,0.683722,0.684986,0.007291
4,0.00361,0.000369,0.00223,0.000532,1.0,{'alpha': 1},0.599582,0.712584,0.612029,0.631448,...,0.635878,0.039841,1,0.694281,0.672149,0.688366,0.68612,0.683656,0.684914,0.007289


In [99]:
model_cv.best_estimator_

Ridge(alpha=1)

In [100]:
y_hat = model_cv.predict(x_test)

In [102]:
r2_score(y_test,y_hat)

0.6770046132652724