In [3]:
import pip
# pip.main(['install', '{insert_pckg_here}'])
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
import warnings
warnings.filterwarnings('ignore')

#Exploratory Data Analysis
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

## Loading Training/Testing Data:

In [5]:
train_url = 'https://raw.githubusercontent.com/mturner49/pylovers-final-project/dev/data/train.csv'
test_url = 'https://raw.githubusercontent.com/mturner49/pylovers-final-project/dev/data/test.csv'

train_df = pd.read_csv(train_url, low_memory=False, error_bad_lines=False, index_col='Id')
test_df = pd.read_csv(test_url, low_memory=False, error_bad_lines=False, index_col='Id')

train_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# This is for checking datatypes
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

## Data Preprocessing:

In [7]:
# create df that consists of columns and the number of missing values for each 
data = train_df.isnull().sum().sort_values(ascending=False)
missing_df = pd.DataFrame(data=data, columns=['missing_cnt'])

# add column and fill it with the percentage of those missing values
missing_df['percent_missing'] = missing_df.missing_cnt.apply(lambda x : '{:.2f}'.format(x/train_df.shape[0] * 100)) 
missing_df = missing_df[missing_df.missing_cnt > 0]
missing_df

Unnamed: 0,missing_cnt,percent_missing
PoolQC,1453,99.52
MiscFeature,1406,96.3
Alley,1369,93.77
Fence,1179,80.75
FireplaceQu,690,47.26
LotFrontage,259,17.74
GarageType,81,5.55
GarageCond,81,5.55
GarageFinish,81,5.55
GarageQual,81,5.55


In [8]:
# drop the columns where majority of their values are missing
train_df = train_df.drop(['PoolQC', 'MiscFeature', 'Fence', 'FireplaceQu'], axis = 1)
train_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [9]:
# Vida
# I think we should drop 'Alley' as well! with Just 91 non-null values, like 93.77% null!
train_df = train_df.drop(['Alley'], axis = 1)
train_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [10]:
# All 'GarageType','GarageYrBlt','GarageFinish','GarageCars','GarageQual','GarageCond' columns has 1379 not null values!
# So, I tested the theory that exactly same rows has null for these columns!
# All 81 do not have Garage! So, we can put zero instead!
train_df.loc[:,['GarageCars','GarageArea','GarageType','GarageYrBlt','GarageFinish','GarageCars','GarageQual','GarageCond']][train_df['GarageType'].isnull()]


Unnamed: 0_level_0,GarageCars,GarageArea,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageQual,GarageCond
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
40,0,0,,,,0,,
49,0,0,,,,0,,
79,0,0,,,,0,,
89,0,0,,,,0,,
90,0,0,,,,0,,
...,...,...,...,...,...,...,...,...
1350,0,0,,,,0,,
1408,0,0,,,,0,,
1450,0,0,,,,0,,
1451,0,0,,,,0,,


In [11]:
# All 81 do not have Garage! So, we can put zero instead!
train_df.update(train_df[['GarageType','GarageYrBlt','GarageFinish','GarageCars','GarageQual','GarageCond']].fillna(0))

In [12]:
# Same for Basement: 'BsmtFinType2','BsmtExposure','BsmtQual','BsmtCond','BsmtFinType1'
train_df.loc[:,['TotalBsmtSF','BsmtUnfSF','BsmtFinType2','BsmtExposure','BsmtQual','BsmtCond','BsmtFinType1']][train_df['BsmtCond'].isnull()]

Unnamed: 0_level_0,TotalBsmtSF,BsmtUnfSF,BsmtFinType2,BsmtExposure,BsmtQual,BsmtCond,BsmtFinType1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
18,0,0,,,,,
40,0,0,,,,,
91,0,0,,,,,
103,0,0,,,,,
157,0,0,,,,,
183,0,0,,,,,
260,0,0,,,,,
343,0,0,,,,,
363,0,0,,,,,
372,0,0,,,,,


In [13]:
# All 37 do not have Garage! So, we can put zero instead!
train_df.update(train_df[['BsmtQual','BsmtCond','BsmtFinType1']].fillna(0))

In [14]:
# Same for Basement: 'BsmtFinType2','BsmtExposure'
train_df.loc[:,['TotalBsmtSF','BsmtUnfSF','BsmtFinType2','BsmtExposure','BsmtQual','BsmtCond','BsmtFinType1']][train_df['BsmtExposure'].isnull()]

Unnamed: 0_level_0,TotalBsmtSF,BsmtUnfSF,BsmtFinType2,BsmtExposure,BsmtQual,BsmtCond,BsmtFinType1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
18,0,0,,,0,0,0
40,0,0,,,0,0,0
91,0,0,,,0,0,0
103,0,0,,,0,0,0
157,0,0,,,0,0,0
183,0,0,,,0,0,0
260,0,0,,,0,0,0
343,0,0,,,0,0,0
363,0,0,,,0,0,0
372,0,0,,,0,0,0


In [15]:
train_df.at[949,'BsmtExposure']='No'

In [16]:
train_df.loc[:,['TotalBsmtSF','BsmtUnfSF','BsmtFinSF2','BsmtFinType2','BsmtExposure','BsmtQual','BsmtCond','BsmtFinType1']][train_df['BsmtFinType2'].isnull()]

Unnamed: 0_level_0,TotalBsmtSF,BsmtUnfSF,BsmtFinSF2,BsmtFinType2,BsmtExposure,BsmtQual,BsmtCond,BsmtFinType1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
18,0,0,0,,,0,0,0
40,0,0,0,,,0,0,0
91,0,0,0,,,0,0,0
103,0,0,0,,,0,0,0
157,0,0,0,,,0,0,0
183,0,0,0,,,0,0,0
260,0,0,0,,,0,0,0
333,3206,1603,479,,No,Gd,TA,GLQ
343,0,0,0,,,0,0,0
363,0,0,0,,,0,0,0


In [17]:
# I'm going with 'Average Living Quarters'
train_df.at[333,'BsmtFinType2']='ALQ'

In [18]:
train_df.update(train_df[['BsmtFinType2','BsmtExposure']].fillna(0))

In [19]:
# dummy=pd.get_dummies(data=train_df, columns=[  ...  ])
# train_df_withDummy=pd.concat([train_df,dummy], axis=1)
# #df_withDummy.shape
# train_df_withDummy.drop(columns=[  ...  ],inplace=True)
# train_df_withDummy.head()

In [20]:

# I'm not sure about this trick! but I saw someone metioned the Mean Absolute Percentage Error (MAPE)
# can be good accuracy for just not negative and non zero features!!
# So, to stay in safe side I suggest we start from 1!
quality_ratings = {
 'NA':1,
 'Po':2, 
 'Fa':3, 
 'TA':4, 
 'Gd':5, 
 'Ex':6
}

In [21]:
# Converting category labels to numerical values for ExterQual column
train_df['ExterQual_Num'] = train_df.ExterQual.map(quality_ratings)

In [22]:
# Converting category labels to numerical values for BsmtQual column
train_df['BsmtQual_Num'] = train_df.BsmtQual.map(quality_ratings)

In [23]:
# Converting category labels to numerical values for HeatingQC column
train_df['HeatingQC_Num'] = train_df.HeatingQC.map(quality_ratings)

In [24]:
# Converting category labels to numerical values for KitchenQual column
train_df['KitchenQual_Num'] = train_df.KitchenQual.map(quality_ratings)

In [25]:
# Converting category labels to numerical values for GarageQual column
train_df['GarageQual_Num'] = train_df.GarageQual.map(quality_ratings)

In [26]:
# drop the columns where we replaced with numbers
train_df = train_df.drop(['ExterQual', 'BsmtQual', 'HeatingQC', 'KitchenQual','GarageQual'], axis = 1)
train_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,MoSold,YrSold,SaleType,SaleCondition,SalePrice,ExterQual_Num,BsmtQual_Num,HeatingQC_Num,KitchenQual_Num,GarageQual_Num
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,2,2008,WD,Normal,208500,5,5.0,6,5,4.0
2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,5,2007,WD,Normal,181500,4,5.0,6,4,4.0
3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,9,2008,WD,Normal,223500,5,5.0,6,5,4.0
4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,2,2006,WD,Abnorml,140000,4,4.0,5,5,4.0
5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,12,2008,WD,Normal,250000,5,5.0,6,5,4.0


In [27]:
# LotShape
quality_ratings1 = {
    "IR3" : 1,
	"IR2" : 2,
	"IR1" : 3,
	"Reg" : 4
}
train_df["LotShape_Num"] = train_df.LotShape.map(quality_ratings1)

In [28]:
	# LandContour
quality_ratings2 = {
	"Low" : 1,
	"HLS" : 2,
	"Bnk" : 3,
	"Lvl" : 4
}
train_df["LandContour_Num"] = train_df.LandContour.map(quality_ratings2)

In [29]:
	# LandSlope
quality_ratings3 = {
	"Sev" : 1,
	"Mod" : 2,
	"Gtl" : 3
}
train_df["LandSlope_Num"] = train_df.LandSlope.map(quality_ratings3)

In [30]:
	# ExterQual, ExterCond, HeatingQC, KitchenQual
quality_ratings4 = {
	"Po" : 1,
	"Fa" : 2,
	"TA" : 3,
	"Gd" : 4,
	"Ex" : 5
}
# train_df["ExterQual_Num"] = train_df.ExterQual.map(quality_ratings4)
train_df["ExterCond_Num"] = train_df.ExterCond.map(quality_ratings4)
# train_df["HeatingQC_Num"] = train_df.HeatingQC.map(quality_ratings4)
# train_df["KitchenQual_Num"] = train_df.KitchenQual.map(quality_ratings4)

In [31]:
	# BsmtQual, BsmtCond, GarageQual, GarageCond, FireplaceQu
quality_ratings5 = {
	"NA" : 1,
	"Po" : 2,
	"Fa" : 3,
	"TA" : 4,
	"Gd" : 5,
	"Ex" : 6
}
# train_df["BsmtQual_Num"] = train_df.BsmtQual.map(quality_ratings5)
train_df["BsmtCond_Num"] = train_df.BsmtCond.map(quality_ratings5)
# train_df["GarageQual_Num"] = train_df.GarageQual.map(quality_ratings5)
train_df["GarageCond_Num"] = train_df.GarageCond.map(quality_ratings5)
# train_df["FireplaceQu_Num"] = train_df.FireplaceQu.map(quality_ratings5)

In [32]:
	# BsmtExposure
quality_ratings7 = {
	"NA" : 1,
	"No" : 2,
	"Mn" : 3,
	"Av" : 4,
	"Gd" : 5
}
train_df["BsmtExposure_Num"] = train_df.BsmtExposure.map(quality_ratings7)

In [33]:
	# BsmtFinType1, BsmtFinType2
quality_ratings8 = {
	"NA" : 1,
	"Unf" : 2,
	"LwQ" : 3,
	"Rec" : 4,
	"BLQ" : 5,
	"ALQ" : 6,
	"GLQ" : 7
}
train_df["BsmtFinType1_Num"] = train_df.BsmtFinType1.map(quality_ratings8)
train_df["BsmtFinType2_Num"] = train_df.BsmtFinType2.map(quality_ratings8)

In [34]:
	# Functional
quality_ratings9 = {
	"Sal" : 1,
	"Sev" : 2,
	"Maj2" : 3,
	"Maj1" : 4,
	"Mod" : 5,
	"Min2" : 6,
	"Min1" : 7,
	"Typ" : 8
}
train_df["Functional_Num"] = train_df.Functional.map(quality_ratings9)

In [35]:
	# GarageFinish
quality_ratings10 = {
	"NA" : 1,
	"Unf" : 2,
	"RFn" : 3,
	"Fin" : 4
}
train_df["GarageFinish_Num"] = train_df.GarageFinish.map(quality_ratings10)

In [36]:
	# CentralAir
quality_ratings11 = {
	"N" : 1,
	"Y" : 2
}
train_df["CentralAir_Num"] = train_df.CentralAir.map(quality_ratings11)

In [37]:
	# PavedDrive
quality_ratings12 = {
	"N" : 1,
	"P" : 2,
	"Y" : 3
}
train_df["PavedDrive_Num"] = train_df.PavedDrive.map(quality_ratings12)

In [38]:
	# MSSubClass
train_df.replace({'MSSubClass': {20:"St46nNAS", 30:"St45nOl", 40:"StwFinAAA", 45:"HalfStUnfinAAA", 50:"HalfStFinAA",
	60:"St46nN", 70:"St45nOl", 75:"HalfStAA", 80:"SplorML", 85:"SplFoy", 90:"DupASnA", 120:"StPud46nN", 150:"HalfStPudAA", 
	160:"StPud46nNN", 180:"PudML", 190:"FamConv"}})

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,ExterCond_Num,BsmtCond_Num,GarageCond_Num,BsmtExposure_Num,BsmtFinType1_Num,BsmtFinType2_Num,Functional_Num,GarageFinish_Num,CentralAir_Num,PavedDrive_Num
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,St46nN,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,3,4.0,4.0,2.0,7.0,2.0,8,3.0,2,3
2,St46nNAS,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,3,4.0,4.0,5.0,6.0,2.0,8,3.0,2,3
3,St46nN,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,3,4.0,4.0,3.0,7.0,2.0,8,3.0,2,3
4,St45nOl,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,3,5.0,4.0,2.0,6.0,2.0,8,2.0,2,3
5,St46nN,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,3,4.0,4.0,4.0,7.0,2.0,8,3.0,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,St46nN,RL,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,3,4.0,4.0,2.0,2.0,2.0,8,3.0,2,3
1457,St46nNAS,RL,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,3,4.0,4.0,2.0,6.0,4.0,7,2.0,2,3
1458,St45nOl,RL,66.0,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,4,5.0,4.0,2.0,7.0,2.0,8,3.0,2,3
1459,St46nNAS,RL,68.0,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,3,4.0,4.0,3.0,7.0,4.0,8,2.0,2,3


In [39]:
## For variables with naming values
dummy = pd.get_dummies(data = train_df, columns = ["SaleCondition", "MSSubClass", "MSZoning", "Street", "Utilities", "LotConfig",
 "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", 
 "Exterior2nd", "MasVnrType", "Foundation", "Heating", "Electrical", "GarageType", "SaleType"])
train_df_with_dummy = pd.concat([train_df, dummy], axis=1)

In [41]:
# df_withDummy.shape
train_df_with_dummy.drop(columns=["SaleCondition", "MSSubClass", "MSZoning", "Street", "Utilities", "LotConfig",
 "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", 
 "Exterior2nd", "MasVnrType", "Foundation", "Heating", "Electrical", "GarageType", "SaleType"], inplace=True)
train_df_with_dummy.head()

Unnamed: 0_level_0,LotFrontage,LotArea,LotShape,LandContour,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,...,GarageType_Detchd,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,65.0,8450,Reg,Lvl,Gtl,7,5,2003,2003,196.0,...,0,0,0,0,0,0,0,0,0,1
2,80.0,9600,Reg,Lvl,Gtl,6,8,1976,1976,0.0,...,0,0,0,0,0,0,0,0,0,1
3,68.0,11250,IR1,Lvl,Gtl,7,5,2001,2002,162.0,...,0,0,0,0,0,0,0,0,0,1
4,60.0,9550,IR1,Lvl,Gtl,7,5,1915,1970,0.0,...,1,0,0,0,0,0,0,0,0,1
5,84.0,14260,IR1,Lvl,Gtl,8,5,2000,2000,350.0,...,0,0,0,0,0,0,0,0,0,1


## Missing Data Imputation

The rest of missing data is numeric. So, I prefer to replace by mean of same column!

In [42]:
# create df that consists of columns and the number of missing values for each 
data = train_df.isnull().sum().sort_values(ascending=False)
missing_df = pd.DataFrame(data=data, columns=['missing_cnt'])

# add column and fill it with the percentage of those missing values
missing_df['percent_missing'] = missing_df.missing_cnt.apply(lambda x : '{:.2f}'.format(x/train_df.shape[0] * 100)) 
missing_df = missing_df[missing_df.missing_cnt > 0]
missing_df

Unnamed: 0,missing_cnt,percent_missing
LotFrontage,259,17.74
GarageFinish_Num,81,5.55
GarageCond_Num,81,5.55
GarageQual_Num,81,5.55
BsmtFinType2_Num,37,2.53
BsmtFinType1_Num,37,2.53
BsmtExposure_Num,37,2.53
BsmtCond_Num,37,2.53
BsmtQual_Num,37,2.53
MasVnrType,8,0.55


In [43]:
train_df.update(train_df[['Electrical']].fillna('SBrkr'))

In [44]:
#I'm not professional but I think there is no Masonry veneer type for the Vinyl and Cement!
train_df.at[530,'MasVnrType']='Stone'

# train_df['Set_of_Numbers'] = train_df['Set_of_Numbers'].fillna(0)
train_df.update(train_df[['MasVnrArea']].fillna(0))
train_df.update(train_df[['MasVnrType']].fillna('None'))

In [45]:
train_df['LotFrontage'] = train_df['LotFrontage'].fillna((train_df['LotFrontage'].mean()))

In [46]:
train_df['BsmtQual_Num'] = train_df['BsmtQual_Num'].fillna((train_df['BsmtQual_Num'].mean()))

In [47]:
train_df['GarageQual_Num'] = train_df['GarageQual_Num'].fillna((train_df['GarageQual_Num'].mean()))

In [48]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 88 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   MSSubClass        1460 non-null   int64  
 1   MSZoning          1460 non-null   object 
 2   LotFrontage       1460 non-null   float64
 3   LotArea           1460 non-null   int64  
 4   Street            1460 non-null   object 
 5   LotShape          1460 non-null   object 
 6   LandContour       1460 non-null   object 
 7   Utilities         1460 non-null   object 
 8   LotConfig         1460 non-null   object 
 9   LandSlope         1460 non-null   object 
 10  Neighborhood      1460 non-null   object 
 11  Condition1        1460 non-null   object 
 12  Condition2        1460 non-null   object 
 13  BldgType          1460 non-null   object 
 14  HouseStyle        1460 non-null   object 
 15  OverallQual       1460 non-null   int64  
 16  OverallCond       1460 non-null   int64  


In [None]:
# Vida: let's speak about this part in our meeting! 
# Vida: I handled all null values one by one!

# NA is a valid value and is some cases not equivalent to NaN and should not be converted to 0
# replace null values in df w/ values that had most counts for each column
# train_df1 = train_df1.apply(lambda x: x.fillna(0) if x.dtype.kind in 'biufc' \
#                           else x.fillna(train_df.columns.value_counts().idxmax()))
# test_df = test_df.apply(lambda x: x.fillna(0) if x.dtype.kind in 'biufc' \
#                           else x.fillna(train_df.columns.value_counts().idxmax()))

In [None]:
# Vida: I couldn't run this part!

# encode object (categorical) columns in df
# enc_df = train_df1.select_dtypes(include=['object']).apply(LabelEncoder().fit_transform)

# # add encoded columns back into original train df
# train_df1[enc_df.columns] = enc_df

# train_df1.head()

## Exploratory Data Analysis (EDA)

- Correlation 

In [49]:
corr_matrix = train_df.corr()
corr_matrix['SalePrice'] = round(corr_matrix['SalePrice'],4)
corr_matrix['SalePrice'].sort_values(ascending=False)

SalePrice           1.0000
OverallQual         0.7910
GrLivArea           0.7086
ExterQual_Num       0.6826
KitchenQual_Num     0.6596
GarageCars          0.6404
BsmtQual_Num        0.6354
GarageArea          0.6234
TotalBsmtSF         0.6136
1stFlrSF            0.6059
FullBath            0.5607
TotRmsAbvGrd        0.5337
YearBuilt           0.5229
GarageFinish_Num    0.5131
YearRemodAdd        0.5071
MasVnrArea          0.4726
Fireplaces          0.4669
HeatingQC_Num       0.4276
BsmtFinSF1          0.3864
BsmtExposure_Num    0.3529
LotFrontage         0.3349
WoodDeckSF          0.3244
2ndFlrSF            0.3193
OpenPorchSF         0.3159
HalfBath            0.2841
BsmtFinType1_Num    0.2774
LotArea             0.2638
GarageYrBlt         0.2614
CentralAir_Num      0.2513
PavedDrive_Num      0.2314
BsmtFullBath        0.2271
BsmtUnfSF           0.2145
BedroomAbvGr        0.1682
BsmtCond_Num        0.1607
GarageQual_Num      0.1515
GarageCond_Num      0.1250
ScreenPorch         0.1114
F

In [50]:
train_df['BathPerRoom_Factor'] = train_df['FullBath'] / (train_df['TotRmsAbvGrd']+0.01)
train_df['BsmtFin2TotBsmtSF_Ratio'] = (train_df['BsmtFinSF1']+train_df['BsmtFinSF2']) / (train_df['TotalBsmtSF']+0.01)
train_df['GarageCars2Area_Ratio'] = train_df['GarageCars'] / (train_df['GarageArea']+0.01)
# train_df['PoolFactor'] = train_df['PoolQC'] * train_df['PoolArea']
train_df['Overall_Factor'] = train_df['OverallQual'] * train_df['OverallCond'] * train_df['YearBuilt']
train_df['Yr_Factor'] = train_df['YrSold'] - np.maximum(train_df['YearBuilt'],train_df['YearRemodAdd'])
train_df['Porch_Factor'] = (train_df['OpenPorchSF'] + train_df['EnclosedPorch']+ train_df['3SsnPorch']+ train_df['ScreenPorch']+0.01)
train_df['OpenPorch_Factor'] = train_df['OpenPorchSF']/(train_df['3SsnPorch']+0.01)

## Not high corr
train_df['TotGrLiv2Lot_Ratio'] = train_df['GrLivArea'] / (train_df['LotArea']+0.01)
train_df['TotSF2Lot_Ratio'] = (train_df['1stFlrSF']+train_df['2ndFlrSF']+train_df['TotalBsmtSF']) / (train_df['LotArea']+0.01)
train_df['TotBsmtBath2BsmtSF_Ratio'] = (train_df['BsmtFullBath'] + train_df['BsmtHalfBath'])/(train_df['TotalBsmtSF']+0.01)
train_df['ClosePorch_Factor'] = train_df['ScreenPorch'] + train_df['EnclosedPorch']
## So so
train_df['LowQualFin2TotSF_Ratio'] = train_df['LowQualFinSF'] / (train_df['1stFlrSF']+train_df['2ndFlrSF']+train_df['TotalBsmtSF']+0.01)
## same as kitchen
train_df['Kitchen2Flr_Ratio'] = train_df['KitchenAbvGr'] / ((1 if (train_df['1stFlrSF']>0).bool else 0) + (1 if (train_df['2ndFlrSF']>0).bool else 0) + (1 if (train_df['TotalBsmtSF']>0).bool else 0))

In [51]:
corr_matrix = train_df.corr()
corr_matrix['SalePrice'] = round(corr_matrix['SalePrice'],4)
corr_matrix['SalePrice'].sort_values(ascending=False)

SalePrice            1.0000
OverallQual          0.7910
GrLivArea            0.7086
ExterQual_Num        0.6826
KitchenQual_Num      0.6596
                      ...  
EnclosedPorch       -0.1286
KitchenAbvGr        -0.1359
Kitchen2Flr_Ratio   -0.1359
LotShape_Num        -0.2678
Yr_Factor           -0.5091
Name: SalePrice, Length: 68, dtype: float64

- Variance Inflation Factor

In [None]:
# not finished yet!!
# X = add_constant(train_df)
# pd.Series([variance_inflation_factor(X.values, i)
#           for i in range(X.shape[1])], index=X.columns)

## Data Exploration:

In [None]:
# check distribution of sales price
train_df1.hist(column='SalePrice')

In [None]:
# normalize sale price so that it can be evenly distributed
train_df1['LogPrice'] = np.log(train_df1.SalePrice)
train_df1.hist(column='LogPrice')

In [None]:
# check for more skewed columns
train_df1.skew().sort_values(ascending=False).head(30)

In [None]:
# looking at correlation of numeric features to SalePrice column. 
# this will suggest which columns have a greater relationship with the SalePrice column 
corr = train_df1.corr().abs().unstack().sort_values(ascending=False)['LogPrice']
corr = corr.iloc[1:]
corr = pd.DataFrame(corr, columns = ['Correlation'])

corr = corr[corr.Correlation > 0.50]
corr

In [None]:
cols = []
for ind in corr.index:
    cols.append(ind)

# create pair plot between columns that have correlation 50% and above
sns.pairplot(train_df1[cols])

## Data Visualization (CAN BE DELETED):

In [None]:
#Count the number of houses sold in a year
#year_sold_pivot = train_df.pivot_table(index='YrSold', values='SalePrice', aggfunc='count')
#print(year_sold_pivot)

# Plotting the sum of sales per year
sns.set_context('talk', font_scale=1) 
plt.figure(figsize=(10,5))
sns.barplot(x='YrSold', y='SalePrice', data=train_df[['SalePrice', 'YrSold']], estimator=sum)
plt.xlabel('Year Sold')
plt.ylabel('Number of Sales')
plt.show()

In [None]:
# Plotting the median sale price for each year

#Checking the median price
#year_sold_pivot = train_df.pivot_table(index='YrSold', values='SalePrice', aggfunc=np.median)
#print(year_sold_pivot)

sns.set_context('talk', font_scale=1) 
plt.figure(figsize=(10,5))
sns.barplot(x='YrSold', y='SalePrice', data=train_df[['SalePrice', 'YrSold']], estimator=np.median)
plt.xlabel('Year Sold')
plt.ylabel('Median Sale Price')
plt.show()

In [None]:
#Displaying the median price for each year using box plot.  
sns.set_context('talk', font_scale=1) 
plt.figure(figsize=(10,5))
sns.boxplot(x='YrSold', y='SalePrice', data=train_df[['SalePrice', 'YrSold']])
plt.xlabel('Year Sold')
plt.ylabel('Median Sale Price')
plt.show()

In [None]:
# Displaying the median price based on overall quality of the house using box plot.
sns.set_context('talk', font_scale=1) 
plt.figure(figsize=(12,10))
sns.boxplot(x='OverallQual', y='SalePrice', data=train_df[['SalePrice', 'OverallQual']])
plt.xlabel('Overall Quality')
plt.ylabel('Median Sale Price')
plt.show()

In [None]:
sns.set_context('talk', font_scale=1) 
sns.set_style('dark')
plt.figure(figsize=(15,8))

# Plot GarageArea vs sale price of house considering the number of cars that can
# fit in the garage
sns.scatterplot(x='GarageArea', y='SalePrice', hue=train_df.GarageCars.tolist(),
            palette='Set2', data=train_df[['SalePrice', 'GarageArea','GarageCars']])
plt.show()

In [None]:
# Plotting the median sale price based on external quality
sns.set_context('talk', font_scale=1) 
plt.figure(figsize=(10,5))
sns.barplot(x='ExterQual', y='SalePrice', data=train_df[['SalePrice', 'ExterQual']], estimator=np.median)
plt.xlabel('External Quality')
plt.ylabel('Median Sale Price')
plt.show()

In [None]:
# Plotting the median sale price based on basement quality
sns.set_context('talk', font_scale=1) 
plt.figure(figsize=(10,5))
sns.barplot(x='BsmtQual', y='SalePrice', data=train_df[['SalePrice', 'BsmtQual']], estimator=np.median)
plt.xlabel('Basement Quality')
plt.ylabel('Median Sale Price')
plt.show()

In [None]:
# Plotting the median sale price based on heating and air conditioning quality
sns.set_context('talk', font_scale=1) 
plt.figure(figsize=(10,5))
sns.barplot(x='HeatingQC', y='SalePrice', data=train_df[['SalePrice', 'HeatingQC']], estimator=np.median)
plt.xlabel('Heating Quality')
plt.ylabel('Median Sale Price')
plt.show()

In [None]:
# Plotting the median sale price based on kitchen quality
sns.set_context('talk', font_scale=1) 
plt.figure(figsize=(10,5))
sns.barplot(x='KitchenQual', y='SalePrice', data=train_df[['SalePrice', 'KitchenQual']], estimator=np.median)
plt.xlabel('Kitchen Quality')
plt.ylabel('Median Sale Price')
plt.show()

In [None]:
# Plotting the median sale price based on fireplace quality
sns.set_context('talk', font_scale=1) 
plt.figure(figsize=(10,5))
sns.barplot(x='FireplaceQu', y='SalePrice', data=train_df[['SalePrice', 'FireplaceQu']], estimator=np.median)
plt.xlabel('Fireplace Quality')
plt.ylabel('Median Sale Price')
plt.show()

In [None]:
# Plotting the median sale price based on garage quality
sns.set_context('talk', font_scale=1) 
plt.figure(figsize=(10,5))
sns.barplot(x='GarageQual', y='SalePrice', data=train_df[['SalePrice', 'GarageQual']], estimator=np.median)
plt.xlabel('Garage Quality')
plt.ylabel('Median Sale Price')
plt.show()

In [None]:
# Plotting the median sale price based on pool quality
# sns.set_context('talk', font_scale=1) 
# plt.figure(figsize=(10,5))
# sns.barplot(x='PoolQC', y='SalePrice', data=train_df[['SalePrice', 'PoolQC']], estimator=np.median)
# plt.xlabel('Pool Quality')
# plt.ylabel('Median Sale Price')
# plt.show()

## Feature Engineering:

In [None]:
# sum of quality points.
train_df['SumQuality'] = train_df.ExterQual_Num + train_df.BsmtQual_Num + train_df.HeatingQC_Num + train_df.KitchenQual_Num + train_df.GarageQual_Num + train_df.FireplaceQu_Num + train_df.GarageQual_Num + train_df.PoolQC_Num 
            
#print(train_df['SumQuality'])

# sum of quality points, removing less correlated features (< 0.5).
train_df['SumMIQ'] = train_df.ExterQual_Num + train_df.BsmtQual_Num + train_df.KitchenQual_Num + train_df.FireplaceQu_Num + train_df.GarageQual_Num


In [None]:
# Garage area per car
# May not need this one, the correlation to SalePrice is only slightly higher than GarageArea
train_df['GarageAreaPerCar'] = train_df.GarageArea + train_df.GarageCars 

#print(train_df['GarageAreaPerCar'])

In [None]:
corr = train_df.corr().abs().unstack().sort_values(ascending=False)['SalePrice']
corr.head(40)

In [None]:
################## DISREGARD LOGIC BELOW ######################

In [None]:
# encode object columns
# enc_df = train_df.select_dtypes(include=['object']).apply(LabelEncoder().fit_transform)
# enc_df

In [None]:
# add encoded columns back into train df
# train_df[enc_df.columns] = enc_df
# train_df

In [None]:
# normalize df except for price column
# norm_df = (train_df - train_df.mean()) / (train_df.max() - train_df.min())