# Project 2: Ames Housing Data and Kaggle Challenge

--- 
# Kaggle Submission
---
- 1. cleaning the test.csv dataset using the same process I used for the training.csv
- 2. use the chosen model to predict sale price with the test.csv dataset

*All libraries used should be added here*

In [1]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import make_column_selector
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, OrdinalEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn import metrics
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import RFE
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
import warnings
warnings.filterwarnings('ignore')
import os

### Data Import and Cleaning test.csv Dataset using the same process I used to clean training.csv

In [2]:
# read in the data
testcsv = pd.read_csv('datasets/test.csv')
testcsv.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [3]:
testcsv.shape

(878, 80)

In [4]:
testcsv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               878 non-null    int64  
 1   PID              878 non-null    int64  
 2   MS SubClass      878 non-null    int64  
 3   MS Zoning        878 non-null    object 
 4   Lot Frontage     718 non-null    float64
 5   Lot Area         878 non-null    int64  
 6   Street           878 non-null    object 
 7   Alley            58 non-null     object 
 8   Lot Shape        878 non-null    object 
 9   Land Contour     878 non-null    object 
 10  Utilities        878 non-null    object 
 11  Lot Config       878 non-null    object 
 12  Land Slope       878 non-null    object 
 13  Neighborhood     878 non-null    object 
 14  Condition 1      878 non-null    object 
 15  Condition 2      878 non-null    object 
 16  Bldg Type        878 non-null    object 
 17  House Style     

Exploratory

### Checking for nulls

In [5]:
#return list of columns with nulls
#housing.columns[housing.isnull().any()]
nulls = pd.DataFrame(testcsv.isnull().sum())
nulls.loc[nulls[0] != 0]

#19 cols with nulls

Unnamed: 0,0
Lot Frontage,160
Alley,820
Mas Vnr Type,1
Mas Vnr Area,1
Bsmt Qual,25
Bsmt Cond,25
Bsmt Exposure,25
BsmtFin Type 1,25
BsmtFin Type 2,25
Electrical,1


### Dropping rows & columns where it's helpful

In [6]:
#dropping rows in columns where there aren't that many nulls

testcsv.dropna(subset=['Mas Vnr Type','Mas Vnr Area','Electrical'], inplace=True)

### Engineer new columns
looking at nulls for some of these values, I decided to create new columns that might be more helpful
- does it have a garage?
- does it have a pool?
- does it have a basement?
- does it have a fence?
- does it have an alley?

In [7]:
testcsv['has_garage'] = np.where(testcsv['Garage Type'].isna(), 0, 1)
testcsv['has_pool'] = np.where(testcsv['Pool QC'].isna(), 0, 1)
testcsv['has_basement'] = np.where(testcsv['Bsmt Qual'].isna(), 0, 1)
testcsv['has_fence'] = np.where(testcsv['Fence'].isna(), 0, 1)
testcsv['has_alley'] = np.where(testcsv['Alley'].isna(), 0, 1)

In [8]:
#doublechecking that columns were mapped correctly
testcsv['has_alley'].value_counts()

0    818
1     58
Name: has_alley, dtype: int64

In [9]:
testcsv['Alley'].value_counts()

Grvl    35
Pave    23
Name: Alley, dtype: int64

### Drop columns that have a very high percentage of nulls (>than 50% nulls). The values in the newly engineered columns will be more helpful

In [10]:
#drop columns that have a very high percentage of nulls (>than 50% nulls). The values in the newly engineered columns will be more helpful

testcsv.drop(columns = ['Alley', 'Fireplace Qu', 'Pool QC', 'Fence'], inplace=True)

In [11]:
testcsv.shape

(876, 81)

### Rename columns

In [12]:
# remove spaces in column names
testcsv.columns = [col.replace(' ', '') for col in testcsv.columns]

In [13]:
# remove spaces in column names
testcsv.columns = [col.replace('/', '_') for col in testcsv.columns]

In [14]:
# rename columns to lowercase
testcsv.columns = [col.lower() for col in testcsv.columns]     #list comprehension to rename cols
testcsv.head(2)

Unnamed: 0,id,pid,mssubclass,mszoning,lotfrontage,lotarea,street,lotshape,landcontour,utilities,...,miscfeature,miscval,mosold,yrsold,saletype,has_garage,has_pool,has_basement,has_fence,has_alley
0,2658,902301120,190,RM,69.0,9142,Pave,Reg,Lvl,AllPub,...,,0,4,2006,WD,1,0,1,0,1
1,2718,905108090,90,RL,,9662,Pave,IR1,Lvl,AllPub,...,,0,8,2006,WD,1,0,1,0,0


# 3. Pre-Processing

In [15]:
nulls = pd.DataFrame(testcsv.isnull().sum())
nulls.loc[nulls[0] != 0]

Unnamed: 0,0
lotfrontage,160
bsmtqual,25
bsmtcond,25
bsmtexposure,25
bsmtfintype1,25
bsmtfintype2,25
garagetype,44
garageyrblt,45
garagefinish,45
garagequal,45


#### - create imputers to address nulls for nominal, ordinal, discrete, and continuous variables

In [16]:
#impute with median
imputer1 = SimpleImputer(strategy = 'median')

# impute with str 'NA'
imputer2 = SimpleImputer(strategy = 'constant', fill_value='NA')

# # imputer with most frequent
# imputer3 = SimpleImputer(strategy = 'most_frequent')

# making a column transformer
imputers = make_column_transformer(
                (imputer1, ['lotfrontage', 'garageyrblt']),
                (imputer2, ['bsmtqual', 'bsmtcond', 'bsmtexposure', 'bsmtfintype1', 'bsmtfintype2', 'garagetype', 'garagefinish', 'garagequal', 'garagecond', 'miscfeature']),
                # (imputer3, ['masvnrtype']),
                remainder = 'passthrough',
                verbose_feature_names_out=False)  #we need false so that the feature names come out in the form we want

In [17]:
# use the pipe above to impute the nulls
# turn the output into a dataframe with the column names
testcsv_imp = pd.DataFrame(imputers.fit_transform(testcsv), columns=imputers.get_feature_names_out())

In [18]:
nulls = pd.DataFrame(testcsv_imp.isna().sum())
nulls.loc[nulls[0] != 0]

Unnamed: 0,0


In [19]:
testcsv_imp['lotfrontage'].value_counts()

# this matches what I expected. median is 68 and 160 (nulls) + 16

68.0     176
60.0      97
80.0      43
75.0      37
70.0      36
        ... 
150.0      1
122.0      1
30.0       1
174.0      1
182.0      1
Name: lotfrontage, Length: 104, dtype: int64

#### - create ordinal feature encoder pipeline

In [20]:
# cols I dropped & don't need to ordinal impute - 'Pool QC', 'Fireplace Qu', 'Fence'


# 'Lot Shape'
lotshape=['IR3','IR2', 'IR1','Reg'] 
ord1 = OrdinalEncoder(categories=[lotshape],dtype=int)

# Utilities
util=['ELO','NoSeWa','NoSewr','AllPub']
ord2 = OrdinalEncoder(categories=[util],dtype=int)

# Land Slope
lslope=['Sev', 'Mod','Gtl']
ord3= OrdinalEncoder(categories=[lslope],dtype=int)


# # Overall Qual, Overall Cond - talked to ben and he said it might be unnecessary to encode these b/c they are already in valued order
# overall=['1', '2', '3','4','5','6','7','8','9','10']
# ord4= OrdinalEncoder(categories=[overall],dtype=int)


# Exter Qual, Exter Cond, Bsmt Cond, Heating QC, Kitchen Qual, Garage Qual, Garage Cond, Bsmt Qual
# don't include b/c dropped: Pool QC, Fireplace Qu
exter=['NA','Po','Fa','TA','Gd','Ex']
ord5= OrdinalEncoder(categories=[exter],dtype=int)


# Bsmt Exposure
bsmtexp=['NA','No','Mn','Av','Gd']
ord6= OrdinalEncoder(categories=[bsmtexp],dtype=int)


# BsmtFin Type 1, BsmtFinType 2
bsmtfin=['NA','Unf','LwQ','Rec','BLQ','ALQ','GLQ']
ord7= OrdinalEncoder(categories=[bsmtfin],dtype=int)


# Electrical
elec=['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix']
ord8= OrdinalEncoder(categories = [elec],dtype=int)


#Functional
func=['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1','Typ']
ord9= OrdinalEncoder(categories=[func],dtype=int)


#Garage Finish
gar=['NA','Unf','RFn','Fin']
ord10= OrdinalEncoder(categories=[gar],dtype=int)


#paved driveway
drive=['N','P','Y']
ord11= OrdinalEncoder(categories=[drive],dtype=int)


# Fence - I don't need this since I dropped fence col
# fence=['NA','MnWw','GdWo','MnPrv','GdPrv']
# ord12= OrdinalEncoder(categories=[[fence]])


# combine all imputers into one imputer pipe
ord_imputers = make_column_transformer(
                (ord1, ['lotshape']),
                (ord2, ['utilities']),
                (ord3, ['landslope']),
                (ord5, ['exterqual']),
                (ord5, ['extercond']),
                (ord5, ['bsmtcond']),
                (ord5, ['heatingqc']),
                (ord5, ['kitchenqual']),
                (ord5, ['garagequal']),
                (ord5, ['garagecond']),
                (ord5, ['bsmtqual']),
                (ord6, ['bsmtexposure']),
                (ord7, ['bsmtfintype1']),
                (ord7, ['bsmtfintype2']),
                (ord8, ['electrical']),
                (ord9, ['functional']),
                (ord10, ['garagefinish']),
                (ord11, ['paveddrive']),
                remainder = 'passthrough',
                verbose_feature_names_out=False)

# what I tried to do with ord5 - (ord5, ['exterqual','extercond', 'bsmtcond', 'heatingqc', 'kitchenqual', 'garagequal','garagecond', 'bsmtqual']),

In [21]:
ord_imputers.fit_transform(testcsv_imp)

array([[3, 3, 2, ..., 1, 0, 1],
       [2, 3, 2, ..., 1, 0, 0],
       [2, 3, 2, ..., 1, 0, 0],
       ...,
       [3, 3, 2, ..., 1, 0, 0],
       [3, 3, 2, ..., 1, 1, 0],
       [3, 3, 2, ..., 1, 1, 0]], dtype=object)

In [22]:
# apply the ord_imputer, convert to dataframe and assign to new object
testcsv_imp2 = pd.DataFrame(ord_imputers.fit_transform(testcsv_imp), columns = ord_imputers.get_feature_names_out())

### double-checking that features that I ordinal encoded do match up to the category list

In [23]:
testcsv_imp['extercond'].value_counts()

TA    768
Gd     84
Fa     18
Ex      5
Po      1
Name: extercond, dtype: int64

In [24]:
testcsv_imp2['extercond'].value_counts()

3    768
4     84
2     18
5      5
1      1
Name: extercond, dtype: int64

In [25]:
#everything was converted to object. changing necessary cols to int/float
cols_to_change = ['id', 'pid', 'lotfrontage', 'lotarea', 'overallqual',
       'overallcond', 'yearbuilt', 'yearremod_add', 'masvnrarea', 'bsmtfinsf1',
       'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', '1stflrsf', '2ndflrsf',
       'lowqualfinsf', 'grlivarea', 'bsmtfullbath', 'bsmthalfbath', 'fullbath',
       'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'totrmsabvgrd',
       'fireplaces', 'garageyrblt', 'garagecars', 'garagearea', 'wooddecksf',
       'openporchsf', 'enclosedporch', '3ssnporch', 'screenporch', 'poolarea',
       'miscval', 'mosold', 'yrsold', 'has_garage', 'has_pool', 'has_basement',
       'has_fence', 'has_alley','lotshape','utilities','landslope','exterqual','extercond','bsmtcond',
       'heatingqc', 'kitchenqual', 'garagequal', 'garagecond','bsmtqual','bsmtexposure','bsmtfintype1',
        'bsmtfintype2', 'electrical','functional','garagefinish','paveddrive','overallqual','overallcond']


for col in cols_to_change:
    testcsv_imp2[col] = pd.to_numeric(testcsv_imp2[col])

In [26]:
#checking to make sure these are the only list of categoricals
testcsv_imp2.select_dtypes(object).columns

Index(['garagetype', 'miscfeature', 'mssubclass', 'mszoning', 'street',
       'landcontour', 'lotconfig', 'neighborhood', 'condition1', 'condition2',
       'bldgtype', 'housestyle', 'roofstyle', 'roofmatl', 'exterior1st',
       'exterior2nd', 'masvnrtype', 'foundation', 'heating', 'centralair',
       'saletype'],
      dtype='object')

### Dummy categorical features on test.csv

In [27]:
# dummy all the cat columns - drop first
# dummy_na will separate out a column for non-value
testcsv_dummied = pd.get_dummies(data=testcsv_imp2, drop_first=True)

In [28]:
testcsv_dummied.shape

(876, 201)

In [29]:
testcsv_imp2.shape

(876, 81)

In [30]:
testcsv_dummied.head()

Unnamed: 0,lotshape,utilities,landslope,exterqual,extercond,bsmtcond,heatingqc,kitchenqual,garagequal,garagecond,...,centralair_Y,saletype_CWD,saletype_Con,saletype_ConLD,saletype_ConLI,saletype_ConLw,saletype_New,saletype_Oth,saletype_VWD,saletype_WD
0,3,3,2,3,2,3,4,2,1,1,...,0,0,0,0,0,0,0,0,0,1
1,2,3,2,3,3,3,3,3,3,3,...,1,0,0,0,0,0,0,0,0,1
2,2,3,2,4,3,4,5,4,3,3,...,1,0,0,0,0,0,1,0,0,0
3,3,3,2,4,3,3,3,3,2,3,...,1,0,0,0,0,0,0,0,0,1
4,2,3,2,3,3,3,4,3,3,3,...,1,0,0,0,0,0,0,0,0,1


Before scaling, dropping 'id' and 'pid'

In [31]:
#testcsv_dummied
testcsv_toscale = testcsv_dummied.drop(columns = ['id', 'pid'])
testcsv_id = testcsv_dummied[['id', 'pid']]

In [32]:
testcsv_toscale

Unnamed: 0,lotshape,utilities,landslope,exterqual,extercond,bsmtcond,heatingqc,kitchenqual,garagequal,garagecond,...,centralair_Y,saletype_CWD,saletype_Con,saletype_ConLD,saletype_ConLI,saletype_ConLw,saletype_New,saletype_Oth,saletype_VWD,saletype_WD
0,3,3,2,3,2,3,4,2,1,1,...,0,0,0,0,0,0,0,0,0,1
1,2,3,2,3,3,3,3,3,3,3,...,1,0,0,0,0,0,0,0,0,1
2,2,3,2,4,3,4,5,4,3,3,...,1,0,0,0,0,0,1,0,0,0
3,3,3,2,4,3,3,3,3,2,3,...,1,0,0,0,0,0,0,0,0,1
4,2,3,2,3,3,3,4,3,3,3,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,3,3,2,3,3,3,3,3,3,3,...,1,0,0,0,0,0,0,0,0,1
872,3,3,2,4,4,3,5,4,3,3,...,1,0,0,0,0,0,0,0,0,1
873,3,3,2,3,3,3,5,3,3,3,...,1,0,0,0,0,0,0,0,0,1
874,3,3,2,3,3,3,3,3,3,3,...,1,0,0,0,0,0,0,0,0,1


### Use standard scalar to fit and transform data

In [33]:
#instantiate the StandardScaler
sc = StandardScaler()

In [34]:
#transform
Z_test = pd.DataFrame(sc.fit_transform(testcsv_toscale), columns = testcsv_toscale.columns)

In [35]:
# join the ids and scaled df back together
Z_test = pd.concat([testcsv_id, Z_test], axis=1)

In [36]:
Z_test.tail()

Unnamed: 0,id,pid,lotshape,utilities,landslope,exterqual,extercond,bsmtcond,heatingqc,kitchenqual,...,centralair_Y,saletype_CWD,saletype_Con,saletype_ConLD,saletype_ConLI,saletype_ConLw,saletype_New,saletype_Oth,saletype_VWD,saletype_WD
871,1662,527377110,0.68775,0.033806,0.216901,-0.677444,-0.228478,0.160843,-1.193322,-0.753497,...,0.258827,-0.047836,-0.033806,-0.101885,-0.047836,-0.058621,-0.312641,-0.058621,-0.033806,0.404162
872,1234,535126140,0.68775,0.033806,0.216901,1.110028,2.476205,0.160843,0.924915,0.770899,...,0.258827,-0.047836,-0.033806,-0.101885,-0.047836,-0.058621,-0.312641,-0.058621,-0.033806,0.404162
873,1373,904100040,0.68775,0.033806,0.216901,-0.677444,-0.228478,0.160843,0.924915,-0.753497,...,0.258827,-0.047836,-0.033806,-0.101885,-0.047836,-0.058621,-0.312641,-0.058621,-0.033806,0.404162
874,1672,527425140,0.68775,0.033806,0.216901,-0.677444,-0.228478,0.160843,-1.193322,-0.753497,...,0.258827,-0.047836,-0.033806,-0.101885,-0.047836,-0.058621,-0.312641,-0.058621,-0.033806,0.404162
875,1939,535327160,0.68775,0.033806,0.216901,-0.677444,2.476205,0.160843,-1.193322,-0.753497,...,0.258827,-0.047836,-0.033806,-0.101885,-0.047836,-0.058621,-0.312641,-0.058621,-0.033806,0.404162


In [37]:
#keep the columns that match the training final df set used for modeling
Z_test_final = Z_test[['id','pid','overallqual', 'exterqual', 'kitchenqual', 'garagearea', 'garagefinish',
       'yearremod_add', 'fullbath', 'masvnrarea', 'garageyrblt',
       'totrmsabvgrd', 'heatingqc', 'neighborhood_NridgHt', 'fireplaces',
       'bsmtfinsf1', 'bsmtexposure', 'saletype_New', 'openporchsf',
       'bsmtfintype1', 'exterior2nd_VinylSd', 'lotfrontage',
       'masvnrtype_Stone', 'wooddecksf', 'halfbath', 'lotarea', 'paveddrive',
       'neighborhood_StoneBr', 'roofstyle_Hip', 'bsmtfullbath', 'centralair_Y',
       'garagetype_BuiltIn', 'masvnrtype_BrkFace', 'neighborhood_NoRidge',
       'bsmtcond', 'housestyle_2Story', 'landcontour_HLS', 'has_garage',
       'bsmtunfsf', 'exterior2nd_CmentBd', 'screenporch', 'bedroomabvgr',
       'lotconfig_CulDSac', 'neighborhood_Somerst', 'functional',
       'neighborhood_Timber', 'condition1_Norm', 'condition2_PosA',
       'neighborhood_MeadowV', 'overallcond', 'neighborhood_BrDale',
       'kitchenabvgr', 'condition1_Feedr', 'bldgtype_Twnhs', 'has_alley',
       'exterior2nd_MetalSd', 'neighborhood_BrkSide', 'foundation_Slab',
       'exterior2nd_HdBoard', 'neighborhood_Sawyer', 'enclosedporch',
       'has_fence', 'neighborhood_NAmes', 'exterior2nd_Wd Sdng',
       'neighborhood_IDOTRR', 'neighborhood_OldTown', 'neighborhood_Edwards',
       'exterior1st_Wd Sdng', 'saletype_WD ', 'garagetype_NA', 'electrical',
       'mszoning_RM', 'lotshape', 'foundation_CBlock', 'garagetype_Detchd']]

In [38]:
pd.set_option('display.max_rows', 200)

In [39]:
Z_test_final.shape

(876, 75)

### datasets for modeling
- df_model_train
- df_model_test
- y_train
- y_test

In [40]:
#read in datasets that I used for modelling

housing = pd.read_csv('cleaned_datasets/housing_preprocessed1_data.csv')
df_model_train = pd.read_csv('cleaned_datasets/housing_df_model_train.csv')
df_model_test = pd.read_csv('cleaned_datasets/housing_df_model_test.csv')
y_train = pd.read_csv('cleaned_datasets/housing_y-train.csv')
y_test = pd.read_csv('cleaned_datasets/housing_y-test_data.csv')

In [41]:
df_model_train_final = df_model_train.drop(columns=['pid', 'id'])
df_model_test_final = df_model_test.drop(columns=['pid', 'id'])
y_train = y_train.drop(columns=['id'])
y_test = y_test.drop(columns=['id'])

In [42]:
df_model_train_final.shape

(1370, 73)

In [43]:
df_model_test_final.shape

(676, 73)

In [44]:
Z_test_final.shape

(876, 75)

In [45]:
# baseline
y_train.mean()[0]

181205.50218978102

#### Model 1 - Linear Regression

In [46]:
#instantiate and fit OLS model
lr = LinearRegression()
lr.fit(df_model_train_final, y_train)

In [47]:
# r2 for test set from train.csv
lr.score(df_model_train_final, y_train)

0.9036883527932799

In [48]:
#calculate RMSE train
y_pred = lr.predict(df_model_train_final)
mean_squared_error(y_train, y_pred, squared=False)

24716.971636189173

In [49]:
#calculate RMSE test
y_test_pred = lr.predict(df_model_test_final)
mean_squared_error(y_test, y_test_pred, squared=False)

26742.42745018656

In [50]:
#checking the model on the test data
lr.score(df_model_test_final, y_test)

0.8839212571741166

In [51]:
lr.coef_

array([[ 1.65131274e+04,  4.95501674e+03,  6.56775827e+03,
         6.22163441e+03,  1.14557332e+03,  1.03839192e+03,
         6.93849462e+03,  7.65770700e+03, -7.27464922e+02,
         1.25913545e+04,  1.35253727e+03,  6.92751427e+03,
         4.05998588e+03,  1.61233666e+04,  5.30533140e+03,
         4.02010467e+03,  2.67868674e+03, -1.07315126e+02,
        -3.84751759e+02,  4.05037683e+03, -6.14363524e+02,
         1.87787610e+03,  4.48483914e+03,  5.98047187e+03,
         1.58524788e+03,  6.20797124e+03,  3.10034005e+03,
         3.25818644e+03, -1.42807937e+02,  2.44934571e+03,
        -4.68715899e+03,  5.37061758e+03, -2.95357180e+03,
         7.74672457e+01,  2.68138297e+03, -8.55669797e+16,
         7.85219727e+03,  2.03673104e+03,  4.73792603e+03,
        -1.56570407e+03,  1.97158102e+03,  1.04610193e+03,
         1.54639655e+03, -9.54586927e+02,  2.34693397e+03,
         1.62754114e+03, -6.91800497e+02,  5.04090410e+03,
         4.46318933e+02, -4.46217386e+03,  3.95480207e+0

In [63]:
pd.DataFrame(lr.coef_[0], df_model_train_final.columns).sort_values(by=0, ascending=False)

Unnamed: 0,0
overallqual,16513.13
bsmtfinsf1,16123.37
totrmsabvgrd,12591.35
bsmtunfsf,7852.197
masvnrarea,7657.707
fullbath,6938.495
neighborhood_NridgHt,6927.514
kitchenqual,6567.758
garagearea,6221.634
neighborhood_StoneBr,6207.971


In [52]:
print('baseline:', y_train.mean())

print("-----LR/OLS----- ")
print('rmse train:', mean_squared_error(y_train, y_pred, squared=False))
print('rmse test:', mean_squared_error(y_test, y_test_pred, squared=False))
print('r2 train:', lr.score(df_model_train_final, y_train))
print('r2 test:', lr.score(df_model_test_final, y_test))

baseline: saleprice    181205.50219
dtype: float64
-----LR/OLS----- 
rmse train: 24716.971636189173
rmse test: 26742.42745018656
r2 train: 0.9036883527932799
r2 test: 0.8839212571741166


### Run test csv data through the model built above to predict sale price

In [53]:
Z_test_final2 = Z_test_final.drop(columns=['id', 'pid'])

In [54]:
lr.predict(Z_test_final2)[:10]

array([[143014.05548929],
       [152130.17357999],
       [231431.32623981],
       [104241.35118477],
       [188958.17357999],
       [ 83269.23257242],
       [ 80581.35118477],
       [155838.29219234],
       [158342.7104856 ],
       [176413.67643813]])

In [58]:
pd.set_option('display.max_rows', 200)

In [59]:
kaggle_preds = pd.DataFrame(lr.predict(Z_test_final2))
kaggle_preds.rename(columns = {0: 'SalePrice'}, inplace=True)
kaggle_preds[:15]

Unnamed: 0,SalePrice
0,143014.055489
1,152130.17358
2,231431.32624
3,104241.351185
4,188958.17358
5,83269.232572
6,80581.351185
7,155838.292192
8,158342.710486
9,176413.676438


In [65]:
kaggle_preds.mean()

SalePrice    181174.601292
dtype: float64

In [66]:
kaggle = pd.concat([Z_test_final, kaggle_preds], axis=1)
kaggle = kaggle[['id', 'SalePrice']]
kaggle.rename(columns={'id': 'Id'}, inplace=True)
kaggle[:20]

Unnamed: 0,Id,SalePrice
0,2658,143014.055489
1,2718,152130.17358
2,2414,231431.32624
3,1989,104241.351185
4,625,188958.17358
5,333,83269.232572
6,1327,80581.351185
7,858,155838.292192
8,95,158342.710486
9,1568,176413.676438


In [68]:
#export predictions to csv
# kaggle.to_csv('DS_kaggle_submission.csv', index = False)

#commenting out so it doesn't reexport