In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns

from sklearn.linear_model import LinearRegression, ElasticNet, ElasticNetCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

%matplotlib inline

In [2]:
# Input files, resolved relative to the working directory the notebook runs in.
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'

# Training data, and the hold-out set that predictions are generated for further down.
df = pd.read_csv(TRAIN_CSV)
final_test = pd.read_csv(TEST_CSV)

In [3]:
def _snake_case(label):
    """Return ``label`` lower-cased with every space replaced by an underscore."""
    return label.lower().replace(' ', '_')


# Apply the same header normalisation to both frames, in place.
for frame in (df, final_test):
    frame.rename(columns=_snake_case, inplace=True)

In [11]:
# Summarise the training frame: row count plus per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 82 columns):
id                 2051 non-null int64
pid                2051 non-null int64
ms_subclass        2051 non-null int64
ms_zoning          2051 non-null object
lot_frontage       1721 non-null float64
lot_area           2051 non-null int64
street             2051 non-null object
alley              140 non-null object
lot_shape          2051 non-null object
land_contour       2051 non-null object
utilities          2051 non-null object
lot_config         2051 non-null object
land_slope         2051 non-null object
neighborhood       2051 non-null object
condition_1        2051 non-null object
condition_2        2051 non-null object
bldg_type          2051 non-null object
house_style        2051 non-null object
overall_qual       2051 non-null int64
overall_cond       2051 non-null int64
year_built         2051 non-null int64
year_remod/add     2051 non-null int64
roof_style         20

In [10]:
# Predictor columns; the same list is reused later to select the hold-out columns.
features = [
    'ms_subclass',
    'lot_area',
    'overall_qual',
    'overall_cond',
    '1st_flr_sf',
    '2nd_flr_sf',
    'full_bath',
    'bedroom_abvgr',
    'kitchen_abvgr',
]

# Feature matrix and regression target, both taken from the training frame.
X = df[features]
y = df['saleprice']

In [12]:
# Count missing values in each selected feature column.
X.isnull().sum()

ms_subclass      0
lot_area         0
overall_qual     0
overall_cond     0
1st_flr_sf       0
2nd_flr_sf       0
full_bath        0
bedroom_abvgr    0
kitchen_abvgr    0
dtype: int64

In [13]:
# Split the training data into fitting and evaluation parts; random_state pins the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101)
# `stratify` keeps each class's proportion the same in both splits, so it is meant
# for categorical (class-label) targets; it is not passed here.

In [14]:
# Learn the scaling statistics from the training split only, then apply that one
# fitted scaler to both splits.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [15]:
# Linear regression fitted on the standardised training features.
lr = LinearRegression().fit(X_train_sc, y_train)
lr  # last expression of the cell: shows the fitted estimator

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
# Print the linear model's score for each split, labelled by split name.
for split_label, split_X, split_y in (
    ('train', X_train_sc, y_train),
    ('test', X_test_sc, y_test),
):
    print(split_label, lr.score(split_X, split_y))

train 0.755934230159
test 0.823783204785


In [17]:
# ElasticNet - GridSearch

In [18]:
# Tune ElasticNet's penalty strength (alpha) and L1/L2 mix (l1_ratio) with a
# cross-validated grid search on the standardised training split.
enet = ElasticNet()

# The grids are built with np.linspace and an explicit point count instead of
# float-step np.arange: with a non-integer step, arange's length depends on
# floating-point rounding, so a value next to `stop` may or may not be included
# and the grid values themselves come out inexact (e.g. 0.0749999...).
# Spacing is 0.005 in both grids: alpha 0.010 .. 0.075, l1_ratio 0.010 .. 0.025.
params = {
    'alpha': np.linspace(.01, .075, 14),
    'l1_ratio': np.linspace(.01, .025, 4),
}

gs = GridSearchCV(enet, params)
gs.fit(X_train_sc, y_train)

# NOTE(review): the recorded best l1_ratio sits on the upper edge of its grid;
# consider widening that range.
gs.best_params_

{'alpha': 0.074999999999999983, 'l1_ratio': 0.024999999999999998}

In [38]:
# Score the grid search's selected model on the held-out split.
gs.score(X_test_sc, y_test)

0.81447440223909962

In [28]:
# Select the hold-out columns with the same feature list used for training.
X_final_test = final_test[features]

# The training features were checked for missing values, but the hold-out file
# was not. Check it here so a gap in the data is reported by column name rather
# than from inside the scaler or the model.
missing_counts = X_final_test.isnull().sum()
if missing_counts.any():
    raise ValueError(
        'missing values in hold-out features:\n{}'.format(
            missing_counts[missing_counts > 0]
        )
    )

# Reuse the scaler fitted on the training split (transform only, no refit),
# then predict with the tuned grid-search model.
X_final_test_sc = ss.transform(X_final_test)
predictions = gs.predict(X_final_test_sc)

In [29]:
# Sanity check: number of rows in the hold-out feature matrix.
len(X_final_test)

879

In [30]:
# Sanity check: number of rows in the full hold-out frame.
len(final_test)

879

In [31]:
# Sanity check: number of predictions produced for the hold-out rows.
len(predictions)

879

In [32]:
# Attach the predictions to the hold-out frame as a new column. The name is
# mixed-case, unlike the lower-cased headers of the other columns.
# NOTE(review): presumably 'SalePrice' is the header the submission expects — confirm.
final_test['SalePrice'] = predictions

In [35]:
# List the hold-out frame's column labels.
final_test.columns

Index(['id', 'pid', 'ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area',
       'street', 'alley', 'lot_shape', 'land_contour', 'utilities',
       'lot_config', 'land_slope', 'neighborhood', 'condition_1',
       'condition_2', 'bldg_type', 'house_style', 'overall_qual',
       'overall_cond', 'year_built', 'year_remod/add', 'roof_style',
       'roof_matl', 'exterior_1st', 'exterior_2nd', 'mas_vnr_type',
       'mas_vnr_area', 'exter_qual', 'exter_cond', 'foundation', 'bsmt_qual',
       'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_sf_1',
       'bsmtfin_type_2', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf',
       'heating', 'heating_qc', 'central_air', 'electrical', '1st_flr_sf',
       '2nd_flr_sf', 'low_qual_fin_sf', 'gr_liv_area', 'bsmt_full_bath',
       'bsmt_half_bath', 'full_bath', 'half_bath', 'bedroom_abvgr',
       'kitchen_abvgr', 'kitchen_qual', 'totrms_abvgrd', 'functional',
       'fireplaces', 'fireplace_qu', 'garage_type', 'garage_yr_blt',
       'g

In [37]:
# Write the identifier and predicted price columns to CSV, without the row index.
# NOTE(review): the header is written as 'id' because the column names were
# lower-cased earlier, while 'SalePrice' is mixed-case — confirm the header
# casing the submission format requires.
final_test[['id','SalePrice']].to_csv('export_submit_2.csv', index=False)