# Project 2: Ames Housing Data and Kaggle Challenge

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error


# Grab seaborn's current colour palette (not referenced again in the visible cells).
current_palette = sns.color_palette()
# Draw the "Paired" palette as a strip of colour swatches.
sns.palplot(sns.color_palette("Paired"))
# IPython magic: render matplotlib figures inside the notebook output.
%matplotlib inline

## Data Import and Cleaning

#### Cleaning test.csv

In [2]:
# Load the raw test set.
test_df = pd.read_csv("../datasets/test.csv")

# Normalise every header to snake_case: lowercase, spaces turned into underscores.
test_df = test_df.rename(columns=lambda header: header.lower().replace(' ', '_'))

# Never truncate the column list when a frame is displayed.
pd.set_option('display.max_columns', None)

In [3]:
# Preview the first five rows to confirm the load and the renamed headers.
test_df.head(5)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,fireplace_qu,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,2fmCon,2Story,6,8,1910,1950,Gable,CompShg,AsbShng,AsbShng,,0.0,TA,Fa,Stone,Fa,TA,No,Unf,0,Unf,0,1020,1020,GasA,Gd,N,FuseP,908,1020,0,1928,0,0,2,0,4,2,Fa,9,Typ,0,,Detchd,1910.0,Unf,1,440,Po,Po,Y,0,60,112,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Sawyer,Norm,Norm,Duplex,1Story,5,4,1977,1977,Gable,CompShg,Plywood,Plywood,,0.0,TA,TA,CBlock,Gd,TA,No,Unf,0,Unf,0,1967,1967,GasA,TA,Y,SBrkr,1967,0,0,1967,0,0,2,0,6,2,TA,10,Typ,0,,Attchd,1977.0,Fin,2,580,TA,TA,Y,170,0,0,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,7,5,2006,2006,Gable,CompShg,VinylSd,VinylSd,,0.0,Gd,TA,PConc,Gd,Gd,Av,GLQ,554,Unf,0,100,654,GasA,Ex,Y,SBrkr,664,832,0,1496,1,0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,2006.0,RFn,2,426,TA,TA,Y,100,24,0,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Fam,1Story,5,6,1923,2006,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,Gd,TA,CBlock,TA,TA,No,Unf,0,Unf,0,968,968,GasA,TA,Y,SBrkr,968,0,0,968,0,0,1,0,2,1,TA,5,Typ,0,,Detchd,1935.0,Unf,2,480,Fa,TA,N,0,0,184,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1963,1963,Gable,CompShg,Plywood,Plywood,BrkFace,247.0,TA,TA,CBlock,Gd,TA,No,BLQ,609,Unf,0,785,1394,GasA,Gd,Y,SBrkr,1394,0,0,1394,1,0,1,1,3,1,TA,6,Typ,2,Gd,Attchd,1963.0,RFn,2,514,TA,TA,Y,0,76,0,0,185,0,,,,0,7,2009,WD


In [4]:
# Summary of the frame: per-column non-null counts and dtypes, which shows
# which columns contain missing values.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 80 columns):
id                 879 non-null int64
pid                879 non-null int64
ms_subclass        879 non-null int64
ms_zoning          879 non-null object
lot_frontage       719 non-null float64
lot_area           879 non-null int64
street             879 non-null object
alley              58 non-null object
lot_shape          879 non-null object
land_contour       879 non-null object
utilities          879 non-null object
lot_config         879 non-null object
land_slope         879 non-null object
neighborhood       879 non-null object
condition_1        879 non-null object
condition_2        879 non-null object
bldg_type          879 non-null object
house_style        879 non-null object
overall_qual       879 non-null int64
overall_cond       879 non-null int64
year_built         879 non-null int64
year_remod/add     879 non-null int64
roof_style         879 non-null object
roof_m

In [5]:
# Columns that feed no feature of the selected (Lasso) model and are removed up front.
unused_columns = [
    'alley', 'pool_qc', 'misc_feature', 'id', 'pid', 'bsmtfin_sf_2',
    'low_qual_fin_sf', 'bsmt_half_bath', 'enclosed_porch', '3ssn_porch',
    'screen_porch', 'pool_area', 'misc_val', 'kitchen_abvgr', 'yr_sold',
    'mo_sold', 'bedroom_abvgr', 'bsmt_unf_sf', '2nd_flr_sf', 'garage_yr_blt',
    'half_bath', 'bsmt_full_bath', 'lot_area', 'lot_frontage', 'wood_deck_sf',
    'open_porch_sf', 'bsmtfin_sf_1', 'fireplaces', 'street', 'land_contour',
    'utilities', 'land_slope', 'condition_1', 'condition_2', 'bldg_type',
    'roof_matl', 'exter_cond', 'bsmt_cond', 'bsmtfin_type_2', 'heating',
    'central_air', 'electrical', 'functional', 'garage_qual', 'garage_cond',
    'paved_drive', 'fence', 'sale_type',
]
test_df = test_df.drop(columns=unused_columns)

In [6]:
# Cleaning up the data.
# Missing values are filled per column: numeric size/count columns get 0 and
# categorical descriptors get the literal label 'None'.
# A single DataFrame-level fillna with a column -> value mapping is used instead of
# `test_df[col].fillna(..., inplace=True)`: the in-place call on a column selection is a
# chained modification that is not guaranteed to reach the frame (it is a no-op under
# pandas Copy-on-Write).
missing_value_fill = {
    'mas_vnr_area': 0,
    'total_bsmt_sf': 0,
    'garage_cars': 0,
    'garage_area': 0,
    'mas_vnr_type': 'None',
    'bsmt_qual': 'None',
    'bsmt_exposure': 'None',
    'bsmtfin_type_1': 'None',
    'fireplace_qu': 'None',
    'garage_type': 'None',
    'garage_finish': 'None',
}
test_df = test_df.fillna(value=missing_value_fill)

# Fix the typo in the exterior_2nd column: 'CmentBd' should read 'CemntBd'.
# Assigned back explicitly for the same reason as above.
test_df['exterior_2nd'] = test_df['exterior_2nd'].replace('CmentBd', 'CemntBd')

In [7]:
# Descriptive statistics of the numeric features, rounded to two decimals;
# the count row confirms whether any numeric column still has missing values.
test_df.describe().round(2)

Unnamed: 0,ms_subclass,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,total_bsmt_sf,1st_flr_sf,gr_liv_area,full_bath,totrms_abvgrd,garage_cars,garage_area
count,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0
mean,58.27,6.05,5.57,1970.53,1984.44,106.86,1036.75,1148.05,1500.53,1.54,6.46,1.74,470.83
std,42.21,1.37,1.13,30.4,20.45,188.28,419.26,381.02,517.41,0.56,1.6,0.75,213.07
min,20.0,2.0,1.0,1880.0,1950.0,0.0,0.0,407.0,407.0,0.0,3.0,0.0,0.0
25%,20.0,5.0,5.0,1954.0,1967.0,0.0,790.0,864.0,1114.0,1.0,5.0,1.0,323.0
50%,50.0,6.0,5.0,1972.0,1992.0,0.0,975.0,1064.0,1436.0,2.0,6.0,2.0,473.0
75%,70.0,7.0,6.0,2000.0,2003.0,173.0,1248.0,1358.0,1781.0,2.0,7.0,2.0,576.0
max,190.0,10.0,9.0,2010.0,2010.0,1378.0,3138.0,3138.0,4676.0,4.0,12.0,4.0,1488.0


In [8]:
# Cast columns to the dtypes the encoding steps expect:
#   - the subclass code and the two overall ratings are treated as categories (object),
#   - the area and garage-capacity columns become whole numbers.
dtype_overrides = {
    'ms_subclass': object,
    'overall_qual': object,
    'overall_cond': object,
    'mas_vnr_area': int,
    'total_bsmt_sf': int,
    'garage_cars': int,
    'garage_area': int,
}
test_df = test_df.astype(dtype_overrides)

## Converting test.csv to the required format.

In [9]:
# Raw columns that will be removed once their encoded replacements exist.
column_drop = list()
# Empty frame that the encoding cells extend, one group of dummy columns at a time.
dummy = list()
column_dummy = pd.DataFrame(data=dummy)

In [10]:
# One-hot encode ms_subclass and align the result to the mssub_* columns of the
# training frame.
columnsTitles = ['mssub_30','mssub_40','mssub_45','mssub_50','mssub_60','mssub_70','mssub_75','mssub_80',
                 'mssub_85','mssub_90','mssub_120','mssub_150','mssub_160','mssub_180','mssub_190']

# Every level is encoded (no drop_first) and reindex then keeps exactly the listed
# columns: the unlisted baseline level is discarded, and any listed class that does not
# occur in the test set (mssub_150 was previously added by hand) becomes an all-zero
# column. This avoids NaN or shifted columns when the levels in test.csv differ from
# the ones seen in training.
ms_subclass_dummy = (
    pd.get_dummies(test_df['ms_subclass'], prefix='mssub')
    .reindex(columns=columnsTitles, fill_value=0)
)

column_drop.append('ms_subclass')
column_dummy = pd.concat([column_dummy, ms_subclass_dummy], axis=1)


In [11]:
# One-hot encode roof_style and align it to the rfstyle_* columns of the training frame.
# With drop_first=True the surviving columns depended on which styles happen to occur in
# test.csv; reindexing the full encoding keeps exactly the training columns and zero-fills
# any style that is absent from the test set.
roof_style_columns = ['rfstyle_Gable', 'rfstyle_Gambrel', 'rfstyle_Hip', 'rfstyle_Mansard', 'rfstyle_Shed']
roof_style_dummy = (
    pd.get_dummies(test_df['roof_style'], prefix='rfstyle')
    .reindex(columns=roof_style_columns, fill_value=0)
)

column_drop.append('roof_style')
column_dummy = pd.concat([column_dummy, roof_style_dummy], axis=1)

In [12]:
# One-hot encode neighborhood and align the result to the neigh_* columns of the
# training frame.
columnsTitles = ['neigh_Blueste','neigh_BrDale','neigh_BrkSide','neigh_ClearCr','neigh_CollgCr','neigh_Crawfor',
                 'neigh_Edwards','neigh_Gilbert','neigh_Greens','neigh_GrnHill','neigh_IDOTRR','neigh_Landmrk','neigh_MeadowV',
                 'neigh_Mitchel','neigh_NAmes','neigh_NPkVill','neigh_NWAmes','neigh_NoRidge','neigh_NridgHt','neigh_OldTown',
                 'neigh_SWISU','neigh_Sawyer','neigh_SawyerW','neigh_Somerst','neigh_StoneBr','neigh_Timber','neigh_Veenker']

# Every level is encoded (no drop_first) and reindex keeps exactly the listed columns:
# the unlisted baseline level is discarded, and neighbourhoods missing from the test set
# (neigh_Landmrk and neigh_GrnHill were previously added by hand) become all-zero columns
# instead of NaN.
neighborhood_dummy = (
    pd.get_dummies(test_df['neighborhood'], prefix='neigh')
    .reindex(columns=columnsTitles, fill_value=0)
)

column_drop.append('neighborhood')
column_dummy = pd.concat([column_dummy, neighborhood_dummy], axis=1)

In [13]:
# One-hot encode mas_vnr_type and align it to the mvtype_* columns of the training frame.
# Reindexing the full encoding (rather than drop_first=True) guarantees the same columns
# regardless of which veneer types occur in test.csv; absent types are zero-filled.
mas_vnr_type_columns = ['mvtype_BrkFace', 'mvtype_None', 'mvtype_Stone']
mas_vnr_type_dummy = (
    pd.get_dummies(test_df['mas_vnr_type'], prefix='mvtype')
    .reindex(columns=mas_vnr_type_columns, fill_value=0)
)

column_drop.append('mas_vnr_type')
column_dummy = pd.concat([column_dummy, mas_vnr_type_dummy], axis=1)

In [14]:
# One-hot encode foundation and align it to the fnd_* columns of the training frame.
# Reindexing the full encoding (rather than drop_first=True) guarantees the same columns
# regardless of which foundation types occur in test.csv; absent types are zero-filled.
foundation_columns = ['fnd_CBlock', 'fnd_PConc', 'fnd_Slab', 'fnd_Stone', 'fnd_Wood']
foundation_dummy = (
    pd.get_dummies(test_df['foundation'], prefix='fnd')
    .reindex(columns=foundation_columns, fill_value=0)
)

column_drop.append('foundation')
column_dummy = pd.concat([column_dummy, foundation_dummy], axis=1)

In [15]:
# One-hot encode garage_type and align it to the gartyp_* columns of the training frame.
# Reindexing the full encoding (rather than drop_first=True) guarantees the same columns
# regardless of which garage types occur in test.csv; absent types are zero-filled.
garage_type_columns = ['gartyp_Attchd', 'gartyp_Basment', 'gartyp_BuiltIn',
                       'gartyp_CarPort', 'gartyp_Detchd', 'gartyp_None']
garage_type_dummy = (
    pd.get_dummies(test_df['garage_type'], prefix='gartyp')
    .reindex(columns=garage_type_columns, fill_value=0)
)

column_drop.append('garage_type')
column_dummy = pd.concat([column_dummy, garage_type_dummy], axis=1)

In [16]:
# Residential flag: Res_Yes = 1 when ms_zoning is RL, RM, RH or FV, otherwise 0
# (the non-residential codes: commercial, agricultural, industrial).
# NOTE(review): the earlier note also listed RP as residential, but the code has never
# matched it; behaviour kept as-is — confirm whether RP should count.
# The indicator is built directly from the condition instead of
# get_dummies(drop_first=True), which returns no column at all when every row falls in
# the same group.
residential_dummy = (
    test_df['ms_zoning']
    .isin(['RL', 'RM', 'RH', 'FV'])
    .astype(int)
    .to_frame('Res_Yes')
)

column_drop.append('ms_zoning')
column_dummy = pd.concat([column_dummy, residential_dummy], axis=1)

print(residential_dummy.sum())
residential_dummy.head()

Res_Yes    872
dtype: int64


Unnamed: 0,Res_Yes
0,1
1,1
2,1
3,1
4,1


In [17]:
# Regular-lot flag: shapereg_Yes = 1 when lot_shape is 'Reg', 0 for the irregular
# shapes (IR1, IR2, IR3).
# Built directly from the comparison so the column always exists, even if every row
# falls in one group (get_dummies(drop_first=True) would return no column then).
lotshape_dummy = test_df['lot_shape'].eq('Reg').astype(int).to_frame('shapereg_Yes')

column_drop.append('lot_shape')
column_dummy = pd.concat([column_dummy, lotshape_dummy], axis=1)


print(lotshape_dummy.sum())
lotshape_dummy.head()

shapereg_Yes    564
dtype: int64


Unnamed: 0,shapereg_Yes
0,1
1,0
2,0
3,1
4,0


In [18]:
# Inside-lot flag: lotInside_Yes = 1 when lot_config is 'Inside', 0 for every other
# configuration (Corner, CulDSac, FR2, FR3).
# Built directly from the comparison so the column always exists, even if every row
# falls in one group (get_dummies(drop_first=True) would return no column then).
lotconfig_dummy = test_df['lot_config'].eq('Inside').astype(int).to_frame('lotInside_Yes')

column_drop.append('lot_config')
column_dummy = pd.concat([column_dummy, lotconfig_dummy], axis=1)

print(lotconfig_dummy.sum())
lotconfig_dummy.head()

lotInside_Yes    637
dtype: int64


Unnamed: 0,lotInside_Yes
0,1
1,1
2,1
3,1
4,1


In [19]:
# One-story flag: 1story_Yes = 1 when house_style is '1Story', 0 for every other style
# (2Story, 1.5Fin, SLvl, SFoyer, 2.5Unf, 1.5Unf, 2.5Fin). Split foyer and split level
# are therefore grouped with the non-one-story homes.
# Built directly from the comparison so the column always exists, even if every row
# falls in one group (get_dummies(drop_first=True) would return no column then).
hselevel_dummy = test_df['house_style'].eq('1Story').astype(int).to_frame('1story_Yes')

column_drop.append('house_style')
column_dummy = pd.concat([column_dummy, hselevel_dummy], axis=1)

print(hselevel_dummy.sum())
hselevel_dummy.head()

1story_Yes    422
dtype: int64


Unnamed: 0,1story_Yes
0,0
1,1
2,0
3,1
4,1


In [20]:
# Overall-quality flag: overallqty_below = 1 when overall_qual is NOT one of
# 6, 7, 8, 9, 10 (i.e. average or below), 0 when it is above average.
# Built directly from the membership test so the column always exists, even if every
# row falls in one group (get_dummies(drop_first=True) would return no column then).
overall_qual_dummy = (
    (~test_df['overall_qual'].isin([6, 7, 8, 9, 10]))
    .astype(int)
    .to_frame('overallqty_below')
)
column_drop.append('overall_qual')
column_dummy = pd.concat([column_dummy, overall_qual_dummy], axis=1)

print(overall_qual_dummy.sum())
overall_qual_dummy.head()

overallqty_below    344
dtype: int64


Unnamed: 0,overallqty_below
0,0
1,1
2,0
3,1
4,0


In [21]:
# Overall-condition flag: overcond_below = 1 when overall_cond is NOT one of
# 6, 7, 8, 9, 10 (i.e. average or below), 0 when it is above average.
# Built directly from the membership test so the column always exists, even if every
# row falls in one group (get_dummies(drop_first=True) would return no column then).
overall_cond_dummy = (
    (~test_df['overall_cond'].isin([6, 7, 8, 9, 10]))
    .astype(int)
    .to_frame('overcond_below')
)
column_drop.append('overall_cond')
column_dummy = pd.concat([column_dummy, overall_cond_dummy], axis=1)

print(overall_cond_dummy.sum())
overall_cond_dummy.head()

overcond_below    539
dtype: int64


Unnamed: 0,overcond_below
0,0
1,1
2,1
3,0
4,1


In [22]:
# Exterior-material flag stored in test_df['mat_more_than1']:
#   1 -> exterior_1st and exterior_2nd name different materials,
#   0 -> both columns name the same material.
# One vectorised comparison replaces the row-by-row loop. The loop wrote through chained
# indexing (test_df[col][ind] = ...), which raises SettingWithCopyWarning and is not
# guaranteed to update the frame.
# NOTE(review): this is a plain string comparison, so any spelling difference between
# the two columns is also counted as a second material — verify the labels.
test_df['mat_more_than1'] = test_df['exterior_1st'].ne(test_df['exterior_2nd']).astype(int)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [23]:
# Expose the 0/1 material flag under the dummy-column name 'mat_morethan1_1'.
# The flag is copied directly instead of going through get_dummies(drop_first=True),
# which would return no column at all if every row carried the same value.
mat_morethan1_dummy = test_df[['mat_more_than1']].rename(
    columns={'mat_more_than1': 'mat_morethan1_1'}
)

# The two source columns and the helper flag are all replaced by the dummy column.
column_drop.append('exterior_1st')
column_drop.append('exterior_2nd')
column_drop.append('mat_more_than1')
column_dummy = pd.concat([column_dummy, mat_morethan1_dummy], axis=1)

print(mat_morethan1_dummy.sum())
mat_morethan1_dummy.head()

mat_morethan1_1    89
dtype: int64


Unnamed: 0,mat_morethan1_1
0,0
1,0
2,0
3,0
4,0


In [24]:
# Exterior-quality flag: extqual_below = 1 when exter_qual is NOT 'Gd' or 'Ex'
# (average and below, e.g. TA, Fa), 0 when it is above average.
# Built directly from the membership test so the column always exists, even if every
# row falls in one group (get_dummies(drop_first=True) would return no column then).
exter_qual_dummy = (
    (~test_df['exter_qual'].isin(['Gd', 'Ex']))
    .astype(int)
    .to_frame('extqual_below')
)

column_drop.append('exter_qual')
column_dummy = pd.concat([column_dummy, exter_qual_dummy], axis=1)

print(exter_qual_dummy.sum())
exter_qual_dummy.head()

extqual_below    561
dtype: int64


Unnamed: 0,extqual_below
0,1
1,1
2,0
3,0
4,1


In [25]:
# Basement-height flag derived from bsmt_qual:
#   TA, Gd, Ex        -> height >= 80 inches -> bsmtheight_below80inc = 0
#   Fa, Po, 'None'    -> height <  80 inches -> bsmtheight_below80inc = 1
# Built directly from the membership test so the column always exists, even if every
# row falls in one group (get_dummies(drop_first=True) would return no column then).
bsmt_height_dummy = (
    (~test_df['bsmt_qual'].isin(['TA', 'Gd', 'Ex']))
    .astype(int)
    .to_frame('bsmtheight_below80inc')
)

column_drop.append('bsmt_qual')
column_dummy = pd.concat([column_dummy, bsmt_height_dummy], axis=1)

print(bsmt_height_dummy.sum())
bsmt_height_dummy.head()

bsmtheight_below80inc    54
dtype: int64


Unnamed: 0,bsmtheight_below80inc
0,1
1,0
2,0
3,0
4,0


In [26]:
# Basement-exposure flag: bsmtexp_Yes = 0 only when bsmt_exposure is exactly 'No';
# every other value (Av, Gd, Mn) is flagged 1.
# NOTE(review): rows filled with 'None' are not equal to 'No' and are therefore also
# flagged 1, exactly as before — confirm this matches the intent and the way the
# training features were built.
# Built directly from the comparison so the column always exists, even if every row
# falls in one group (get_dummies(drop_first=True) would return no column then).
bsmt_exposure_dummy = test_df['bsmt_exposure'].ne('No').astype(int).to_frame('bsmtexp_Yes')

column_drop.append('bsmt_exposure')
column_dummy = pd.concat([column_dummy, bsmt_exposure_dummy], axis=1)

print(bsmt_exposure_dummy.sum())
bsmt_exposure_dummy.head()

bsmtexp_Yes    312
dtype: int64


Unnamed: 0,bsmtexp_Yes
0,0
1,0
2,1
3,0
4,0


In [27]:
# Finished-basement flag: bsmtfin_Yes = 0 when bsmtfin_type_1 is 'Unf' or 'None',
# 1 for every finished type (GLQ, ALQ, BLQ, Rec, LwQ).
# Built directly from the membership test so the column always exists, even if every
# row falls in one group (get_dummies(drop_first=True) would return no column then).
bsmt_finish_dummy = (
    (~test_df['bsmtfin_type_1'].isin(['Unf', 'None']))
    .astype(int)
    .to_frame('bsmtfin_Yes')
)

column_drop.append('bsmtfin_type_1')
column_dummy = pd.concat([column_dummy, bsmt_finish_dummy], axis=1)

print(bsmt_finish_dummy.sum())
bsmt_finish_dummy.head()

bsmtfin_Yes    606
dtype: int64


Unnamed: 0,bsmtfin_Yes
0,0
1,0
2,1
3,0
4,1


In [28]:
# Heating-quality flag: heatqc_below = 1 when heating_qc is NOT 'Ex' or 'Gd'
# (average and below: TA, Fa, Po), 0 when it is above average.
# Built directly from the membership test so the column always exists, even if every
# row falls in one group (get_dummies(drop_first=True) would return no column then).
heatqc_dummy = (
    (~test_df['heating_qc'].isin(['Ex', 'Gd']))
    .astype(int)
    .to_frame('heatqc_below')
)

column_drop.append('heating_qc')
column_dummy = pd.concat([column_dummy, heatqc_dummy], axis=1)

print(heatqc_dummy.sum())
heatqc_dummy.head()

heatqc_below    292
dtype: int64


Unnamed: 0,heatqc_below
0,0
1,1
2,0
3,1
4,0


In [29]:
# Kitchen-quality flag: kitqc_below = 1 when kitchen_qual is NOT 'Ex' or 'Gd'
# (average and below: TA, Fa, Po), 0 when it is above average.
# Built directly from the membership test so the column always exists, even if every
# row falls in one group (get_dummies(drop_first=True) would return no column then).
kitqc_dummy = (
    (~test_df['kitchen_qual'].isin(['Ex', 'Gd']))
    .astype(int)
    .to_frame('kitqc_below')
)

column_drop.append('kitchen_qual')
column_dummy = pd.concat([column_dummy, kitqc_dummy], axis=1)

print(kitqc_dummy.sum())
kitqc_dummy.head()

kitqc_below    471
dtype: int64


Unnamed: 0,kitqc_below
0,1
1,1
2,0
3,1
4,1


In [30]:
# Fireplace flag: firepl_Yes = 0 when fireplace_qu is 'None' (no fireplace),
# 1 for any quality rating (Gd, TA, Fa, Po, Ex).
# Built directly from the comparison so the column always exists, even if every row
# falls in one group (get_dummies(drop_first=True) would return no column then).
fireplace_dummy = test_df['fireplace_qu'].ne('None').astype(int).to_frame('firepl_Yes')

column_drop.append('fireplace_qu')
column_dummy = pd.concat([column_dummy, fireplace_dummy], axis=1)

print(fireplace_dummy.sum())
fireplace_dummy.head()

firepl_Yes    457
dtype: int64


Unnamed: 0,firepl_Yes
0,0
1,0
2,1
3,0
4,1


In [31]:
# Garage-finish flag: garfin_Unfinish = 0 when garage_finish is 'Fin' or 'RFn',
# 1 otherwise (unfinished garages and 'None', i.e. no garage).
# Built directly from the membership test so the column always exists, even if every
# row falls in one group (get_dummies(drop_first=True) would return no column then).
garage_fin_dummy = (
    (~test_df['garage_finish'].isin(['Fin', 'RFn']))
    .astype(int)
    .to_frame('garfin_Unfinish')
)

column_drop.append('garage_finish')
column_dummy = pd.concat([column_dummy, garage_fin_dummy], axis=1)

print(garage_fin_dummy.sum())
garage_fin_dummy.head()

garfin_Unfinish    427
dtype: int64


Unnamed: 0,garfin_Unfinish
0,1
1,0
2,0
3,1
4,0


In [32]:
# Remove the raw columns collected in column_drop; their encoded replacements
# are attached to the frame afterwards.
test_df = test_df.drop(columns=column_drop)

In [33]:
# Preview the columns that remain after the raw categorical columns were dropped.
test_df.head()

Unnamed: 0,year_built,year_remod/add,mas_vnr_area,total_bsmt_sf,1st_flr_sf,gr_liv_area,full_bath,totrms_abvgrd,garage_cars,garage_area
0,1910,1950,0,1020,908,1928,2,9,1,440
1,1977,1977,0,1967,1967,1967,2,10,2,580
2,2006,2006,0,654,664,1496,2,7,2,426
3,1923,2006,0,968,968,968,1,5,2,480
4,1963,1963,247,1394,1394,1394,1,6,2,514


In [34]:
# Final test frame: the remaining original columns side by side with every
# engineered / dummy-coded column accumulated in column_dummy.
test_df = pd.concat(objs=[test_df, column_dummy], axis='columns')

## Create feature matrix (`X`) and target vector (`y`) on the train set to be used in our model.

In [35]:
# Read the prepared training data. The preview of this frame shows it already contains
# the dummy-coded columns and saleprice.
train_df = pd.read_csv("../datasets/newtrain.csv")

In [36]:
# Preview two rows to check the column layout of the training frame.
train_df.head(2)

Unnamed: 0.1,Unnamed: 0,year_built,year_remod/add,mas_vnr_area,total_bsmt_sf,1st_flr_sf,gr_liv_area,full_bath,totrms_abvgrd,garage_cars,garage_area,saleprice,mssub_30,mssub_40,mssub_45,mssub_50,mssub_60,mssub_70,mssub_75,mssub_80,mssub_85,mssub_90,mssub_120,mssub_150,mssub_160,mssub_180,mssub_190,rfstyle_Gable,rfstyle_Gambrel,rfstyle_Hip,rfstyle_Mansard,rfstyle_Shed,neigh_Blueste,neigh_BrDale,neigh_BrkSide,neigh_ClearCr,neigh_CollgCr,neigh_Crawfor,neigh_Edwards,neigh_Gilbert,neigh_Greens,neigh_GrnHill,neigh_IDOTRR,neigh_Landmrk,neigh_MeadowV,neigh_Mitchel,neigh_NAmes,neigh_NPkVill,neigh_NWAmes,neigh_NoRidge,neigh_NridgHt,neigh_OldTown,neigh_SWISU,neigh_Sawyer,neigh_SawyerW,neigh_Somerst,neigh_StoneBr,neigh_Timber,neigh_Veenker,mvtype_BrkFace,mvtype_None,mvtype_Stone,fnd_CBlock,fnd_PConc,fnd_Slab,fnd_Stone,fnd_Wood,gartyp_Attchd,gartyp_Basment,gartyp_BuiltIn,gartyp_CarPort,gartyp_Detchd,gartyp_None,Res_Yes,shapereg_Yes,lotInside_Yes,1story_Yes,overallqty_below,overcond_below,mat_morethan1_1,extqual_below,bsmtheight_below80inc,bsmtexp_Yes,bsmtfin_Yes,heatqc_below,kitqc_below,firepl_Yes,garfin_Unfinish
0,0,2009,2010,760,2330,2364,2364,2,11,3,820,611657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,1,1,0,1,0,0,0,1,1,0,0,1,0
1,1,2006,2007,710,2660,2338,2338,2,8,3,1110,591587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,1,0,0,1,0


In [37]:
# Discard the 'Unnamed: 0' column (presumably a row index saved into the CSV — verify).
train_df = train_df.drop(columns=['Unnamed: 0'])

In [38]:
# saleprice is the value the model is trained to predict.
target = 'saleprice'

# The 25 predictors selected on the basis of the Lasso model. Their order here fixes
# the column order of every matrix handed to the scaler and the model.
features = [
    'gr_liv_area', 'neigh_NridgHt', 'neigh_StoneBr', 'year_built', 'bsmtexp_Yes',
    'mas_vnr_area', 'garage_cars', 'bsmtfin_Yes', 'garage_area', 'firepl_Yes',
    'neigh_Crawfor', 'neigh_Somerst', 'rfstyle_Hip', 'year_remod/add', 'neigh_NoRidge',
    '1story_Yes', 'neigh_GrnHill', 'mssub_75', 'gartyp_None', 'mvtype_None',
    'overallqty_below', 'overcond_below', 'extqual_below', 'mssub_160', 'mssub_120',
]

# Feature matrix and target vector taken from the training frame.
X = train_df.loc[:, features]
y = train_df.loc[:, target]

### Using a Lasso model

In [39]:
# Lasso regression whose regularisation strength is chosen by cross-validation
# over a path of 500 candidate alphas.
# NOTE(review): cv is not set (the fitted repr shows cv='warn'), so the number of folds
# follows the installed scikit-learn default — consider pinning it for reproducibility.
lasso = LassoCV(n_alphas=500)

In [40]:
# Split the training data into a fitting part and a hold-out part (library-default
# proportions); random_state=42 makes the split repeatable.
X_train_model, X_test_model, y_train_model, y_test_model = train_test_split(X, y, random_state=42)

In [41]:
# Learn the scaling parameters from the fitting split only, then apply that same
# transformation to both splits so the hold-out rows never influence the scaler.
ss = StandardScaler()
ss.fit(X_train_model)
X_train_model_sc = ss.transform(X_train_model)
X_test_model_sc = ss.transform(X_test_model)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  This is separate from the ipykernel package so we can avoid doing imports until


### Fitting the model

In [46]:
# Fit the cross-validated Lasso on the scaled fitting split.
lasso.fit(X_train_model_sc, y_train_model)



LassoCV(alphas=None, copy_X=True, cv='warn', eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=500, n_jobs=None, normalize=False,
    positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

## Create our feature matrix (`X1`) for the test set

In [47]:
# Select the same 25 features, in the same order, from the prepared test frame.
X1 = test_df.loc[:, features]

In [48]:
# Scale the test features with the scaler already fitted on the training split
# (transform only, no refit).
X1_sc = ss.transform(X1)

  """Entry point for launching an IPython kernel.


In [49]:
# Sanity check: one row per test record and one column per selected feature.
X1_sc.shape

(879, 25)

In [50]:
# Predict the target for every row of the scaled test matrix.
prediction_new = lasso.predict(X1_sc)

In [51]:
# Wrap the predictions in a DataFrame; no column name is given, so the single column
# gets the default label 0.
pred_df = pd.DataFrame(prediction_new)

In [52]:
# Write the predicted values to a new .csv file (the frame's row index is written too).
# NOTE(review): the file carries only the positional row index and the prediction — the
# 'id' column was dropped from test_df earlier in this notebook. If the submission needs
# house ids and a named price column, they must be re-attached — confirm the required format.
pred_df.to_csv("../datasets/predictions.csv")

<img src="../files/kaggle.png">