## Predicting House Sale Prices in Iowa
This project aims to predict house sale prices in Iowa using the available Ames housing data.

In [2]:
#Importing necessary libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# Load the raw Ames housing data; the file is parsed as tab-separated text.
# NOTE(review): this is an absolute, user-specific path, so the notebook only runs on the
# original machine. Consider a path relative to the notebook or a configurable data directory.
df = pd.read_csv('C:/Users/thomath/OneDrive - TIAA/Desktop/AmesHousing.txt',delimiter='\t')

In [5]:
df.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [72]:
#Working on a copy so that the raw frame `df` is left untouched
df_towork=df.copy()

In [73]:
#Identifying columns that have some, but fewer than 5%, null values.
#The cutoff is derived from the row count instead of being hard-coded
#(int(0.05 * 2930) == 146 for the full dataset).
null_cutoff=int(len(df_towork)*0.05)
NullSeries=df_towork.isnull().sum()
NullSeries_Columns=NullSeries[(NullSeries!=0) & (NullSeries<null_cutoff)].index
NullSeries_Columns

Index(['Mas Vnr Type', 'Mas Vnr Area', 'Bsmt Qual', 'Bsmt Cond',
       'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2',
       'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Electrical',
       'Bsmt Full Bath', 'Bsmt Half Bath', 'Garage Cars', 'Garage Area'],
      dtype='object')

In [74]:
#Mean-imputing the numeric columns among the low-null columns found above.
to_fill_mean=df_towork[NullSeries_Columns].select_dtypes(include=(['float','int'])).columns
for col in to_fill_mean:
    col_mean=np.mean(df_towork[col])
    df_towork[col]=df_towork[col].fillna(col_mean)

In [75]:
#Deriving age-related features from the year columns
df_towork['years_since_remod']=df_towork['Yr Sold']-df_towork['Year Remod/Add']
df_towork['years_to_remod']=df_towork['Year Remod/Add']-df_towork['Year Built']
df_towork['years_sold']=df_towork['Yr Sold']-df_towork['Year Built']
#A negative value in any derived column means the year data is inconsistent
#(e.g. remodelled before being built). Those rows are located from the data
#itself rather than by hard-coded index labels, so the cell also works on other frames.
invalid_years=(df_towork[['years_since_remod','years_to_remod','years_sold']]<0).any(axis=1)
df_towork=df_towork[~invalid_years]



In [76]:
#Dropping columns which are not relevant, then inspecting the remaining schema
irrelevant_columns=["PID", "Order", "Mo Sold", "Sale Condition", "Sale Type", "Year Built", "Year Remod/Add"]
df_towork = df_towork.drop(columns=irrelevant_columns)
df_towork.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2926 entries, 0 to 2929
Data columns (total 78 columns):
MS SubClass          2926 non-null int64
MS Zoning            2926 non-null object
Lot Frontage         2436 non-null float64
Lot Area             2926 non-null int64
Street               2926 non-null object
Alley                198 non-null object
Lot Shape            2926 non-null object
Land Contour         2926 non-null object
Utilities            2926 non-null object
Lot Config           2926 non-null object
Land Slope           2926 non-null object
Neighborhood         2926 non-null object
Condition 1          2926 non-null object
Condition 2          2926 non-null object
Bldg Type            2926 non-null object
House Style          2926 non-null object
Overall Qual         2926 non-null int64
Overall Cond         2926 non-null int64
Roof Style           2926 non-null object
Roof Matl            2926 non-null object
Exterior 1st         2926 non-null object
Exterior 2nd   

In [77]:
#Dropping every text (object) column that contains at least one missing value
text_fields=df_towork.select_dtypes(include=(['object'])).columns
missing_text_vals=df_towork[text_fields].isna().sum()
missing_text_vals_columns=missing_text_vals.loc[missing_text_vals.gt(0)].index
df_towork=df_towork.drop(columns=missing_text_vals_columns)

In [252]:
#Counting the null values in each int64/float64 column
numeric_nulls =df_towork.select_dtypes(include=('int64','float64')).isnull().sum()

In [254]:
#Keeping only the labels of numeric columns that still contain nulls, then dropping those columns.
#NOTE(review): numeric_nulls is rebound here from a Series of counts to an Index of labels,
#so this cell is not re-runnable on its own; the cell that computes the counts must run first.
numeric_nulls=numeric_nulls[numeric_nulls>0].index
df_towork=df_towork.drop(numeric_nulls,axis=1)

In [262]:
#Combining all the steps above to encapsulate it in a single function
def transform_features(df, null_threshold=0.05):
    """Clean the raw housing frame and add age-related features.

    Steps, in order:
      1. Drop every text (object) column that has any missing value.
      2. Mean-impute numeric columns whose null count is non-zero but below
         ``int(len(df) * null_threshold)`` (146 for 2930 rows at the default).
      3. Add ``years_since_remod``, ``years_to_remod`` and ``years_sold`` and
         drop every row where any of them is negative (inconsistent years).
      4. Drop numeric columns that still contain nulls.
      5. Drop the identifier, sale-metadata and raw year columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain 'Yr Sold', 'Year Built', 'Year Remod/Add' and
        the columns removed in step 5.
    null_threshold : float, default 0.05
        Fraction of rows; numeric columns with fewer nulls than this share
        are imputed instead of dropped.

    Returns
    -------
    pandas.DataFrame
        A new, transformed frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    df_towork = df.copy()

    # 1. Text columns with any missing value are removed outright.
    text_null_counts = df_towork.select_dtypes(include=['object']).isnull().sum()
    df_towork = df_towork.drop(text_null_counts[text_null_counts > 0].index, axis=1)

    # 2. Impute numeric columns that are only sparsely missing. The cutoff is
    #    derived from the row count rather than a hard-coded absolute number.
    max_nulls = int(len(df_towork) * null_threshold)
    null_counts = df_towork.isnull().sum()
    sparse_null_cols = null_counts[(null_counts != 0) & (null_counts < max_nulls)].index
    to_fill_mean = df_towork[sparse_null_cols].select_dtypes(include='number').columns
    for col in to_fill_mean:
        df_towork[col] = df_towork[col].fillna(df_towork[col].mean())

    # 3. Derived age features.
    df_towork['years_since_remod'] = df_towork['Yr Sold'] - df_towork['Year Remod/Add']
    df_towork['years_to_remod'] = df_towork['Year Remod/Add'] - df_towork['Year Built']
    df_towork['years_sold'] = df_towork['Yr Sold'] - df_towork['Year Built']
    # Negative ages are impossible; find the offending rows from the data
    # instead of relying on fixed index labels that only exist in one file.
    year_features = ['years_since_remod', 'years_to_remod', 'years_sold']
    invalid_rows = (df_towork[year_features] < 0).any(axis=1)
    df_towork = df_towork[~invalid_rows]

    # 4. Any numeric column that still has nulls was too sparse to impute.
    numeric_null_counts = df_towork.select_dtypes(include='number').isnull().sum()
    df_towork = df_towork.drop(numeric_null_counts[numeric_null_counts > 0].index, axis=1)

    # 5. Columns that are not relevant as model inputs.
    df_towork = df_towork.drop(["PID", "Order", "Mo Sold", "Sale Condition", "Sale Type", "Year Built", "Year Remod/Add"], axis=1)

    return df_towork
    

In [255]:
#Testing above function: transform the raw frame, inspect the resulting schema,
#then delete the temporary frame.
df_test=transform_features(df)
df_test.info()
del df_test

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2926 entries, 0 to 2929
Data columns (total 62 columns):
MS SubClass          2926 non-null int64
MS Zoning            2926 non-null object
Lot Frontage         2436 non-null float64
Lot Area             2926 non-null int64
Street               2926 non-null object
Lot Shape            2926 non-null object
Land Contour         2926 non-null object
Utilities            2926 non-null object
Lot Config           2926 non-null object
Land Slope           2926 non-null object
Neighborhood         2926 non-null object
Condition 1          2926 non-null object
Condition 2          2926 non-null object
Bldg Type            2926 non-null object
House Style          2926 non-null object
Overall Qual         2926 non-null int64
Overall Cond         2926 non-null int64
Roof Style           2926 non-null object
Roof Matl            2926 non-null object
Exterior 1st         2926 non-null object
Exterior 2nd         2926 non-null object
Mas Vnr Area  

In [80]:
df_towork.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2926 entries, 0 to 2929
Data columns (total 62 columns):
MS SubClass          2926 non-null int64
MS Zoning            2926 non-null object
Lot Frontage         2436 non-null float64
Lot Area             2926 non-null int64
Street               2926 non-null object
Lot Shape            2926 non-null object
Land Contour         2926 non-null object
Utilities            2926 non-null object
Lot Config           2926 non-null object
Land Slope           2926 non-null object
Neighborhood         2926 non-null object
Condition 1          2926 non-null object
Condition 2          2926 non-null object
Bldg Type            2926 non-null object
House Style          2926 non-null object
Overall Qual         2926 non-null int64
Overall Cond         2926 non-null int64
Roof Style           2926 non-null object
Roof Matl            2926 non-null object
Exterior 1st         2926 non-null object
Exterior 2nd         2926 non-null object
Mas Vnr Area  

In [207]:
#Calculating correlation factors of all numerical (int64/float64) features
numerical_features=df_towork.select_dtypes(include=(['float64','int64'])).columns
corrmat=df_towork.select_dtypes(include=(['float64','int64'])).corr()
#Labels whose absolute correlation with SalePrice is at least 0.4.
#NOTE(review): drop(..., axis=1) removes the SalePrice column of the matrix, not its row,
#so 'SalePrice' itself is still among these labels.
strong_corr=corrmat[corrmat['SalePrice'].abs()>=.4].drop(['SalePrice'],axis=1).index
#Labels whose absolute correlation with SalePrice is below 0.4
weak_corr=corrmat[corrmat['SalePrice'].abs()<.4].index
weak_corr

Index(['MS SubClass', 'Lot Frontage', 'Lot Area', 'Overall Cond',
       'BsmtFin SF 2', 'Bsmt Unf SF', '2nd Flr SF', 'Low Qual Fin SF',
       'Bsmt Full Bath', 'Bsmt Half Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch',
       '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val', 'Yr Sold',
       'years_to_remod'],
      dtype='object')

In [190]:
#Columns treated as nominal (categorical) variables. The list is checked against the
#working frame's columns before use, since not every name is present in it.
nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood", 
                    "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st", 
                    "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type", 
                    "Misc Feature", "Sale Type", "Sale Condition"]

In [208]:
#Separate copy for feature selection so that df_towork keeps all cleaned columns
df_towork_v2=df_towork.copy()

In [209]:
#Dropping the numeric columns in weak_corr (absolute correlation with SalePrice below 0.4)
df_towork_v2=df_towork_v2.drop(weak_corr,axis=1)

In [211]:
df_towork.columns

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Area',
       'Exter Qual', 'Exter Cond', 'Foundation', 'BsmtFin SF 1',
       'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC',
       'Central Air', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
       'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath',
       'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual',
       'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Garage Yr Blt',
       'Garage Cars', 'Garage Area', 'Paved Drive', 'Wood Deck SF',
       'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch',
       'Pool Area', 'Misc Val', 'Yr Sold', 'SalePrice', 'years_since_remod',
       

In [170]:
#Keeping only the nominal features that are still present in the working frame
variables_to_categorize=[name for name in nominal_features if name in df_towork_v2.columns]

In [171]:
#Dropping nominal columns that have 10 or more distinct values, reporting each one
for name in variables_to_categorize:
    distinct_count=len(df_towork_v2[name].unique())
    if distinct_count>=10:
        print(name,distinct_count)
        df_towork_v2=df_towork_v2.drop(name,axis=1)

Neighborhood 28
Exterior 1st 16
Exterior 2nd 17


In [172]:
#Keeping only the nominal columns that survived the high-cardinality drop.
#Deriving the list from the frame's current columns (instead of removing three
#hard-coded names) makes this cell safe to re-run and independent of which
#columns were actually dropped.
variables_to_categorize=[name for name in variables_to_categorize if name in df_towork_v2.columns]

In [173]:
variables_to_categorize

['MS Zoning',
 'Street',
 'Land Contour',
 'Lot Config',
 'Condition 1',
 'Condition 2',
 'Bldg Type',
 'House Style',
 'Roof Style',
 'Roof Matl',
 'Foundation',
 'Heating',
 'Central Air']

In [174]:
#Converting every remaining text (object) column to the pandas category dtype
text_cols=df_towork_v2.select_dtypes(include=(['object'])).columns
for col in text_cols:
    df_towork_v2[col]=df_towork_v2[col].astype('category')

In [175]:
df_towork_v2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2926 entries, 0 to 2929
Data columns (total 37 columns):
MS Zoning            2926 non-null category
Street               2926 non-null category
Lot Shape            2926 non-null category
Land Contour         2926 non-null category
Utilities            2926 non-null category
Lot Config           2926 non-null category
Land Slope           2926 non-null category
Condition 1          2926 non-null category
Condition 2          2926 non-null category
Bldg Type            2926 non-null category
House Style          2926 non-null category
Overall Qual         2926 non-null int64
Roof Style           2926 non-null category
Roof Matl            2926 non-null category
Mas Vnr Area         2926 non-null float64
Exter Qual           2926 non-null category
Exter Cond           2926 non-null category
Foundation           2926 non-null category
BsmtFin SF 1         2926 non-null float64
Total Bsmt SF        2926 non-null float64
Heating            

In [176]:
#One-hot encoding the selected nominal columns: the dummy columns are placed in front of
#the existing frame and the original nominal columns are then dropped.
df_towork_v3=pd.concat([pd.get_dummies(df_towork_v2[variables_to_categorize]),df_towork_v2],axis=1).drop(variables_to_categorize,axis=1)

In [186]:
df_towork_v3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2926 entries, 0 to 2929
Data columns (total 100 columns):
MS Zoning_A (agr)     2926 non-null uint8
MS Zoning_C (all)     2926 non-null uint8
MS Zoning_FV          2926 non-null uint8
MS Zoning_I (all)     2926 non-null uint8
MS Zoning_RH          2926 non-null uint8
MS Zoning_RL          2926 non-null uint8
MS Zoning_RM          2926 non-null uint8
Street_Grvl           2926 non-null uint8
Street_Pave           2926 non-null uint8
Land Contour_Bnk      2926 non-null uint8
Land Contour_HLS      2926 non-null uint8
Land Contour_Low      2926 non-null uint8
Land Contour_Lvl      2926 non-null uint8
Lot Config_Corner     2926 non-null uint8
Lot Config_CulDSac    2926 non-null uint8
Lot Config_FR2        2926 non-null uint8
Lot Config_FR3        2926 non-null uint8
Lot Config_Inside     2926 non-null uint8
Condition 1_Artery    2926 non-null uint8
Condition 1_Feedr     2926 non-null uint8
Condition 1_Norm      2926 non-null uint8
Condition 

In [223]:
#Combining all the steps above to encapsulate it in a single function
def select_features(df, corr_threshold=0.4, max_categories=10):
    """Select model features from a cleaned housing frame.

    Steps, in order:
      1. Drop int64/float64 columns whose absolute correlation with
         'SalePrice' is below ``corr_threshold``.
      2. Among the known nominal columns still present, drop those with
         ``max_categories`` or more distinct values; the rest are the
         nominal columns to encode.
      3. Convert every remaining text (object) column to category dtype.
      4. One-hot encode all category columns, prepend the dummy columns,
         and drop the original nominal columns from step 2.

    Note: step 4 encodes every category column but removes only the nominal
    originals, so non-nominal text columns remain in the result (as category
    dtype) alongside their dummy columns. This mirrors the original behavior.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data containing a numeric 'SalePrice' column.
    corr_threshold : float, default 0.4
        Minimum absolute correlation with 'SalePrice' for a numeric column
        to be kept.
    max_categories : int, default 10
        Nominal columns with at least this many distinct values are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If 'SalePrice' is not among the int64/float64 columns.
    """
    df_towork = df.copy()

    # 1. Correlation filter on the numeric columns.
    corrmat = df_towork.select_dtypes(include=['float64', 'int64']).corr()
    weak_corr = corrmat[corrmat['SalePrice'].abs() < corr_threshold].index
    df_towork = df_towork.drop(weak_corr, axis=1)

    nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood",
                        "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st",
                        "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type",
                        "Misc Feature", "Sale Condition"]

    # 2. Split the nominal columns that are present into "too many levels"
    #    (dropped) and "to encode". Tracking this in one pass replaces the
    #    hard-coded removal of three column names, which raised ValueError
    #    whenever one of them was not in the frame.
    variables_to_categorize = []
    for col in nominal_features:
        if col not in df_towork.columns:
            continue
        # dropna=False counts a missing value as its own level, like len(unique()).
        if df_towork[col].nunique(dropna=False) >= max_categories:
            df_towork = df_towork.drop(col, axis=1)
        else:
            variables_to_categorize.append(col)

    # 3. Text columns become categoricals so get_dummies can pick them up.
    text_cols = df_towork.select_dtypes(include=['object']).columns
    for col in text_cols:
        df_towork[col] = df_towork[col].astype('category')

    # 4. One-hot encode and remove the now-redundant nominal originals.
    dummies = pd.get_dummies(df_towork.select_dtypes(include=['category']))
    df_towork = pd.concat([dummies, df_towork], axis=1).drop(variables_to_categorize, axis=1)

    return df_towork

In [263]:
#Testing above function: run both pipeline stages on the raw frame and inspect the result.
#df_test is kept (not deleted) because the modelling cells below use it.
df_test=transform_features(df)
df_test=select_features(df_test)
df_test.info()
#del df_test


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2926 entries, 0 to 2929
Columns: 139 entries, MS Zoning_A (agr) to years_sold
dtypes: category(9), float64(5), int64(9), uint8(116)
memory usage: 701.7 KB


In [264]:
#Simple holdout validation: the first 1460 rows are used for training, the rest for testing.
train=df_test[:1460]
test=df_test[1460:]
lr = linear_model.LinearRegression()
#Only int64/float64 columns (excluding the SalePrice target) are used as predictors
features=df_test.select_dtypes(include=('int64','float64')).drop(['SalePrice'],axis=1).columns
features
lr.fit(train[features],train['SalePrice'])
predict=lr.predict(test[features])
#Root mean squared error of the predictions on the held-out rows
mse=mean_squared_error(predict,test['SalePrice'])
rmse=np.sqrt(mse)
print(rmse)


36653.18910448307


In [272]:
#Combining steps above to encapsulate it in a single function
def _fit_and_score(train, test, features, target='SalePrice'):
    """Fit a LinearRegression on `train` and return its RMSE on `test`."""
    lr = linear_model.LinearRegression()
    lr.fit(train[features], train[target])
    predictions = lr.predict(test[features])
    # mean_squared_error takes (y_true, y_pred); the value is the same either
    # way for MSE, but the documented order is used here.
    return np.sqrt(mean_squared_error(test[target], predictions))


def train_and_test(df, k=0, random_state=None, train_rows=1460):
    """Train a linear regression on `df` and return its (average) RMSE.

    Validation scheme by ``k``:
      * ``k == 0`` - holdout: the first ``train_rows`` rows train the model,
        the remaining rows test it.
      * ``k == 1`` - the rows are shuffled and split at ``train_rows``; the
        model is trained on each part and tested on the other, and the two
        RMSE values are averaged.
      * otherwise  - shuffled ``k``-fold cross-validation; the mean RMSE over
        the folds is returned.

    NOTE: only int64/float64 columns (other than 'SalePrice') are used as
    predictors. One-hot columns produced by select_features (uint8/bool
    dtype) and category columns are therefore not part of the model.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing a 'SalePrice' column.
    k : int, default 0
        Validation scheme selector / number of folds (see above).
    random_state : int or None, default None
        Seed for the shuffles used when ``k >= 1``. Pass an integer to get
        reproducible scores; None keeps the original unseeded behavior.
    train_rows : int, default 1460
        Split position for ``k == 0`` and ``k == 1``. If ``df`` does not have
        more rows than this, the frame is split in half instead so that the
        test part is never empty.

    Returns
    -------
    float
        RMSE (k == 0) or average RMSE (k >= 1).

    Raises
    ------
    ValueError
        Raised by KFold for fold counts it does not accept.
    """
    features = df.select_dtypes(include=('int64', 'float64')).drop(['SalePrice'], axis=1).columns

    # Guard against frames too small for the fixed split position.
    split_at = train_rows if train_rows < len(df) else len(df) // 2

    if k == 0:
        return _fit_and_score(df[:split_at], df[split_at:], features)

    if k == 1:
        df_shuffle = df.sample(frac=1, random_state=random_state)
        fold_one = df_shuffle[:split_at]
        fold_two = df_shuffle[split_at:]
        rmse_one = _fit_and_score(fold_one, fold_two, features)
        rmse_two = _fit_and_score(fold_two, fold_one, features)
        return np.mean([rmse_one, rmse_two])

    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    rmse_vals = [
        _fit_and_score(df.iloc[train_index], df.iloc[test_index], features)
        for train_index, test_index in kf.split(df)
    ]
    return np.mean(rmse_vals)

In [277]:
train_and_test(df_test,6)

33130.39929603716

In [278]:
#Testing Model for various values of K on a freshly transformed frame
df_test=select_features(transform_features(df))

k=[0,1,2,4,5,8,10,15,20]
for k_value in k:
    rmse = train_and_test(df_test,k_value)
    print("Avg RMSE for this model with k as {} is {:.2f}".format(k_value,rmse))

Avg RMSE for this model with k as 0 is 36653.19
Avg RMSE for this model with k as 1 is 33028.85
Avg RMSE for this model with k as 2 is 33292.77
Avg RMSE for this model with k as 4 is 33589.47
Avg RMSE for this model with k as 5 is 32738.60
Avg RMSE for this model with k as 8 is 33137.76
Avg RMSE for this model with k as 10 is 32718.58
Avg RMSE for this model with k as 15 is 32483.13
Avg RMSE for this model with k as 20 is 32451.01
