In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import (
    train_test_split, 
    cross_val_score,
    cross_val_predict,
    KFold)
from sklearn.preprocessing import (
    StandardScaler,
    PolynomialFeatures)
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso

In [3]:
df = pd.read_csv('./datasets/train_cleaned.csv')

In [4]:
df.drop(columns= 'Unnamed: 0', inplace=True)

In [5]:
df

Unnamed: 0,id,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,...,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,60,RL,0.0,13517,Pave,3,Lvl,4.0,CulDSac,...,0,44,0,0,0,0,3,2010,WD,130500
1,544,60,RL,43.0,11492,Pave,3,Lvl,4.0,CulDSac,...,0,74,0,0,0,0,4,2009,WD,220000
2,153,20,RL,68.0,7922,Pave,4,Lvl,4.0,Inside,...,0,52,0,0,0,0,1,2010,WD,109000
3,318,60,RL,73.0,9802,Pave,4,Lvl,4.0,Inside,...,100,0,0,0,0,0,4,2010,WD,174000
4,255,50,RL,82.0,14235,Pave,3,Lvl,4.0,Inside,...,0,59,0,0,0,0,3,2010,WD,138500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2044,1587,20,RL,79.0,11449,Pave,3,HLS,4.0,Inside,...,0,276,0,0,0,0,1,2008,WD,298751
2045,785,30,RL,0.0,12342,Pave,3,Lvl,4.0,Inside,...,158,0,0,0,0,0,3,2009,WD,82500
2046,916,50,RL,57.0,7558,Pave,4,Bnk,4.0,Inside,...,0,0,0,0,0,0,3,2009,WD,177000
2047,639,20,RL,80.0,10400,Pave,4,Lvl,4.0,Corner,...,0,189,140,0,0,0,11,2009,WD,144000


## MODELLING ##

**---Identify X & Y, TRAIN/TEST/SPLIT---**

In [7]:
#selecting all numeric values as features, with the exception of SalePrice
features = [col for col in df._get_numeric_data().columns if col !='saleprice']

X = df[features]
y = df['saleprice']

In [8]:
#train/test/split with training size 0.8
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [9]:
#showing the distributions of X_train, X_test, y_train, y_test
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1639, 55)
(410, 55)
(1639,)
(410,)


**---LINEAR REGRESSION---**

In [10]:
#instantiate
lm = LinearRegression()

#10 fold cross val score on TRAIN
lm_cv_mse = -cross_val_score(lm, 
                             X_train, 
                             y_train, 
                             cv=10,
                             scoring='neg_root_mean_squared_error').mean()


In [11]:
#root MSE: >>> this value is different from manually calculated
#if we use neg_mean_squared_error?
lm_cv_mse

26289.43268155268

In [12]:
#fitting our model on training set:
lm.fit(X_train, y_train)

LinearRegression()

In [13]:
#list predictions for our test set
y_preds_lm = lm.predict(X_test)
print("Linear Model y_preds mean:",y_preds_lm.mean())

Linear Model y_preds mean: 185878.65441280822


**---RIDGE REGRESSION---**

In [14]:
#Scaling data
ss=StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

In [15]:
#Instantiate ridge model:
r_alphas = np.logspace(0,5,200)
ridge_cv = RidgeCV(alphas=r_alphas, store_cv_values = True)

#Fit ridgeCV model on scaled training set:
ridge_cv.fit(X_train_scaled, y_train)

RidgeCV(alphas=array([1.00000000e+00, 1.05956018e+00, 1.12266777e+00, 1.18953407e+00,
       1.26038293e+00, 1.33545156e+00, 1.41499130e+00, 1.49926843e+00,
       1.58856513e+00, 1.68318035e+00, 1.78343088e+00, 1.88965234e+00,
       2.00220037e+00, 2.12145178e+00, 2.24780583e+00, 2.38168555e+00,
       2.52353917e+00, 2.67384162e+00, 2.83309610e+00, 3.00183581e+00,
       3.18062569e+00, 3.37006433e+0...
       2.64308149e+04, 2.80050389e+04, 2.96730241e+04, 3.14403547e+04,
       3.33129479e+04, 3.52970730e+04, 3.73993730e+04, 3.96268864e+04,
       4.19870708e+04, 4.44878283e+04, 4.71375313e+04, 4.99450512e+04,
       5.29197874e+04, 5.60716994e+04, 5.94113398e+04, 6.29498899e+04,
       6.66991966e+04, 7.06718127e+04, 7.48810386e+04, 7.93409667e+04,
       8.40665289e+04, 8.90735464e+04, 9.43787828e+04, 1.00000000e+05]),
        store_cv_values=True)

In [16]:
#obtain optimal alpha from ridgeCV
r_optimal_alpha = ridge_cv.alpha_
print(r_optimal_alpha)

#use optimal alpha to run ridge:
ridge_opt = Ridge(alpha=r_optimal_alpha)


57.384416483023955


In [17]:
ridge_opt.fit(X_train_scaled, y_train)

Ridge(alpha=57.384416483023955)

In [None]:
ridge_opt

In [15]:
ridge_cv_RMSE = - cross_val_score(ridge_cv, 
                                X_train_scaled, 
                                y_train,
                                scoring='neg_root_mean_squared_error').mean()

ridge_cv_RMSE.mean()

26135.150392349304

**---LASSO REGRESSION---**

In [47]:
#instantiate:
lasso_cv = LassoCV(n_alphas=100)
lasso_cv.fit(X_train_scaled, y_train)

LassoCV()

In [17]:
X_train_scaled.shape

(1639, 55)

In [18]:
l_optimal_alpha = lasso_cv.alpha_
lasso_model=Lasso(alpha=l_optimal_alpha)
lasso_model.fit(X_train_scaled, y_train)

Lasso(alpha=237.72236654734826)

In [19]:
lasso_cv_RMSE = -cross_val_score(lasso_cv,
                                 X_train_scaled,
                                 y_train,
                                 scoring='neg_root_mean_squared_error'
                                ).mean()

In [20]:
lasso_cv_RMSE

26108.194076502958

In [21]:
lasso_cv.predict(X_train_scaled)

array([231498.35273143,  92153.96446311, 243413.37723852, ...,
       198889.52240061, 160300.27583973, 254924.19575379])

In [22]:
lasso_model_RMSE = -cross_val_score(lasso_model,
                                    X_train_scaled,
                                    y_train,
                                    scoring='neg_root_mean_squared_error'
                                   ).mean()

In [23]:
lasso_model_RMSE

26084.118653649588

## Adjusting Rows & Columns of Test Set to Match Training Set ##

In [24]:
test = pd.read_csv('./datasets/test.csv')

In [25]:
test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [26]:
test.shape

(879, 80)

In [27]:
test['Pool Area'].replace(0,np.nan, inplace=True)
test['Pool Area'].isnull().value_counts()
test.drop(columns=['Misc Feature','Alley', 'Pool QC', 'Pool Area', 'PID', 'Fence'], inplace=True)

In [28]:
#ORDINAL ENCODING FOR COL: LOT SHAPE
lotshape_ordinal = {'Reg': 4,
                    'IR1': 3,
                    'IR2': 2,
                    'IR3': 1}

test['Lot Shape'] = test['Lot Shape'].apply(lambda x:
                                              lotshape_ordinal[x] 
                                              if x in lotshape_ordinal 
                                              else np.nan)

#ORDINAL ENCODING FOR COL: UTILITIES
utilities_ordinal = {'AllPub':4,
                     'NoSwer':3,
                     'NoSeWa':2,
                     'ELO':1}

test['Utilities'] = test['Utilities'].apply(lambda x:
                                              utilities_ordinal[x] 
                                              if x in utilities_ordinal 
                                              else np.nan)

#ORDINAL ENCODING FOR COL: UTILITIES
landslope_ordinal = {'Gtl':3,
                     'Mod':2,
                     'Sev':1}

test['Land Slope'] = test['Land Slope'].apply(lambda x:
                                                landslope_ordinal[x] 
                                                if x in landslope_ordinal 
                                                else np.nan)

#ORDINAL ENCODING FOR COL: OVERALL QUAL & OVERAL COND NOT NECESSARY
#AS ALREADY RANKED

#ORDINAL ENCODING FOR COL: EXTER QUAL
exterqual_ordinal = {'Ex':5,
                     'Gd':4,
                     'TA':3,
                     'Fa':2,
                     'Po':1}

test['Exter Qual'] = test['Exter Qual'].apply(lambda x:
                                                exterqual_ordinal[x] 
                                                if x in exterqual_ordinal 
                                                else np.nan)


#ORDINAL ENCODING FOR COL: EXTER COND
extercond_ordinal = {'Ex':5,
                     'Gd':4,
                     'TA':3,
                     'Fa':2,
                     'Po':1}

test['Exter Cond'] = test['Exter Cond'].apply(lambda x:
                                                extercond_ordinal[x] 
                                                if x in extercond_ordinal 
                                                else np.nan)

#ORDINAL ENCODING FOR COL: BSMT QUAL, NA TO RESULT IN 0
bsmtqual_ordinal = {'Ex':5,
                    'Gd':4,
                    'TA':3,
                    'Fa':2,
                    'Po':1,
                    'NA':0}

test['Bsmt Qual'] = test['Bsmt Qual'].apply(lambda x:
                                              bsmtqual_ordinal[x] 
                                              if x in bsmtqual_ordinal 
                                              else np.nan)

#ORDINAL ENCODING FOR COL: BSMT COND, NA TO RESULT IN 0
bsmtcond_ordinal = {'Ex':5,
                    'Gd':4,
                    'TA':3,
                    'Fa':2,
                    'Po':1,
                    'NA':0}

test['Bsmt Cond'] = test['Bsmt Cond'].apply(lambda x:
                                              bsmtcond_ordinal[x] 
                                              if x in bsmtcond_ordinal 
                                              else np.nan)

#ORDINAL ENCODING FOR COL: BSMT EXPOSURE, NA TO RESULT IN 0
bsmtexposure_ordinal = {'Gd':4,
                        'Av':3,
                        'Mn':2,
                        'No':1,
                        'NA':0}

test['Bsmt Exposure'] = test['Bsmt Exposure'].apply(lambda x:
                                              bsmtexposure_ordinal[x] 
                                              if x in bsmtexposure_ordinal 
                                              else np.nan)

#ORDINAL ENCODING FOR COL: BSMTFIN, NA TO RESULT IN 0
bsmtfin_ordinal = {'GLQ':6,
                   'ALQ':5,
                   'BLQ':4,
                   'Rec':3,
                   'LwQ':2,
                   'Unf':1,
                   'NA':0}

test['BsmtFin Type 1'] = test['BsmtFin Type 1'].apply(lambda x:
                                                        bsmtfin_ordinal[x] 
                                                        if x in bsmtfin_ordinal 
                                                        else np.nan)


#ORDINAL ENCODING FOR COL: BSMTFINTYPE 2, NA TO RESULT IN 0
bsmtfintype2_ordinal = {'GLQ':6,
                        'ALQ':5,
                        'BLQ':4,
                        'Rec':3,
                        'LwQ':2,
                        'Unf':1,
                        'NA':0}

test['BsmtFin Type 2'] = test['BsmtFin Type 2'].apply(lambda x:
                                              bsmtfintype2_ordinal[x] 
                                              if x in bsmtfintype2_ordinal 
                                              else np.nan)


#ORDINAL ENCODING FOR COL: HEATINGQC
heatingqc_ordinal = {'Ex':5,
                     'Gd':4,
                     'TA':3,
                     'Fa':2,
                     'Po':1}
                    
test['Heating QC'] = test['Heating QC'].apply(lambda x:
                                              heatingqc_ordinal[x] 
                                              if x in heatingqc_ordinal
                                              else np.nan)

#ORDINAL ENCODING FOR COL: ELECTRICAL
elec_ordinal = {'SBrkr':5,
                'FusaA':4,
                'FuseF':3,
                'FuseP':2,
                'Mix':1}
                    
test['Electrical'] = test['Electrical'].apply(lambda x:
                                                elec_ordinal[x] 
                                                if x in elec_ordinal
                                                else np.nan)

#ORDINAL ENCODING FOR COL: KitchenQual
kitchenqual_ordinal = {'Ex':5,
                       'Gd':4,
                       'TA':3,
                       'Fa':2,
                       'Po':1}
                    
test['Kitchen Qual'] = test['Kitchen Qual'].apply(lambda x:
                                                  kitchenqual_ordinal[x] 
                                                  if x in kitchenqual_ordinal
                                                  else np.nan)

#ORDINAL ENCODING FOR COL: FUNCTIONAL
homefunctlty_ordinal = {'Typ':8,
                        'Min1':7,
                        'Min2':6,
                        'Mod':5,
                        'Maj1':4,
                        'Maj2':3,
                        'Sev':2,
                        'Sal':1}
                    
test['Functional'] = test['Functional'].apply(lambda x:
                                                homefunctlty_ordinal[x] 
                                                if x in homefunctlty_ordinal
                                                else np.nan)

#ORDINAL ENCODING FOR COL: FIREPLACEQU, NA TO RESULT IN 0
fireplacequal_ordinal = {'Ex':5,
                        'Gd':4,
                        'TA':3,
                        'Fa':2,
                        'Po':1,
                        'NA':0}

test['Fireplace Qu'] = test['Fireplace Qu'].apply(lambda x:
                                                  fireplacequal_ordinal[x] 
                                                  if x in fireplacequal_ordinal 
                                                  else np.nan)


#ORDINAL ENCODING FOR COL: GARAGE FINISH, NA TO RESULT IN 0
garagefinish_ordinal = {'Fin':3,
                        'RFn':2,
                        'Unf':1,
                        'NA':0}

test['Garage Finish'] = test['Garage Finish'].apply(lambda x:
                                                      garagefinish_ordinal[x] 
                                                      if x in garagefinish_ordinal 
                                                      else np.nan)


#ORDINAL ENCODING FOR COL: GARAGE QUAL, NA TO RESULT IN 0
garagequal_ordinal = {'Ex':5,
                      'Gd':4,
                      'TA':3,
                      'Fa':2,
                      'Po':1,
                      'NA':0}

test['Garage Qual'] = test['Garage Qual'].apply(lambda x:
                                              garagequal_ordinal[x] 
                                              if x in garagequal_ordinal 
                                              else np.nan)

#ORDINAL ENCODING FOR COL: GARAGE COND, NA TO RESULT IN 0
garagecond_ordinal = {'Ex':5,
                      'Gd':4,
                      'TA':3,
                      'Fa':2,
                      'Po':1,
                      'NA':0}

test['Garage Cond'] = test['Garage Cond'].apply(lambda x:
                                                  garagecond_ordinal[x] 
                                                  if x in garagecond_ordinal 
                                                  else np.nan)

#ORDINAL ENCODING FOR COL: PAVED DRIVE, NA TO RESULT IN 0
paveddrive_ordinal = {'Y':2,
                      'P':1,
                      'N':0}

test['Paved Drive'] = test['Paved Drive'].apply(lambda x:
                                                  paveddrive_ordinal[x] 
                                                  if x in paveddrive_ordinal 
                                                  else np.nan)


In [29]:
test[(test['Id'] == 1499)]
test[(test['Id'] == 2181)]

Unnamed: 0,Id,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,...,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Misc Val,Mo Sold,Yr Sold,Sale Type


In [30]:
test.replace(np.nan, 0, inplace=True)

In [31]:
test.shape

(879, 74)

In [32]:
test=test.select_dtypes(include=[np.number])

In [33]:
test.shape

(879, 55)

In [34]:
test.columns

Index(['Id', 'MS SubClass', 'Lot Frontage', 'Lot Area', 'Lot Shape',
       'Utilities', 'Land Slope', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Year Remod/Add', 'Mas Vnr Area', 'Exter Qual', 'Exter Cond',
       'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1',
       'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF',
       'Total Bsmt SF', 'Heating QC', 'Electrical', '1st Flr SF', '2nd Flr SF',
       'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath',
       'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr',
       'Kitchen Qual', 'TotRms AbvGrd', 'Functional', 'Fireplaces',
       'Fireplace Qu', 'Garage Yr Blt', 'Garage Finish', 'Garage Cars',
       'Garage Area', 'Garage Qual', 'Garage Cond', 'Paved Drive',
       'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch',
       'Screen Porch', 'Misc Val', 'Mo Sold', 'Yr Sold'],
      dtype='object')

In [35]:
test_scaled = ss.transform(test)

In [36]:
lasso_model.predict(test_scaled).mean()

180026.3346424088

In [37]:
test['SalePrice']=lasso_model.predict(test_scaled)

In [38]:
test.to_csv('./datasets/test_first')

In [45]:
test.drop(test.iloc[:, 1:55], inplace = True, axis = 1) 

In [46]:
test

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,133497.414267
2718,154264.880437
2414,205989.753197
1989,119805.351520
625,184176.794248
...,...
1662,195663.393678
1234,243291.176902
1373,124213.617358
1672,105185.608445


In [44]:
test.to_csv('./datasets/kaggletest_1')