In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Getting to know the data and processing it

In [2]:
housing_data = pd.read_csv("home-data-for-ml-course/train.csv")
test_data = pd.read_csv("home-data-for-ml-course/test.csv")
sample_sub = pd.read_csv("home-data-for-ml-course/sample_submission.csv")
print(housing_data.shape)
print(test_data.shape)
print(sample_sub.shape)

(1460, 81)
(1459, 80)
(1459, 2)


In [3]:
# training data columns & data
print("Training data columns:", housing_data.columns.tolist())
housing_data.head()

Training data columns: ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# sample submission data columns & data
print("Sample submission columns:", sample_sub.columns.tolist())
sample_sub.head()

Sample submission columns: ['Id', 'SalePrice']


Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


### We can conclude that our submission should be a csv of 2 columns (id + saleprice) the saleprice is also a float .6f

In [5]:
# data types + missing values
print(housing_data.dtypes)
housing_data.head(4)

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000


In [6]:
na_counts = housing_data.isna().sum()
columnsWithNa = na_counts[na_counts > 0]
print(columnsWithNa.sort_values(ascending=False))

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtFinType2      38
BsmtExposure      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
Electrical         1
dtype: int64


In [7]:
# dropping columns with too many missing values
housing_data = housing_data.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu'], axis=1)

In [8]:
#checking for values before imputing
housing_data.head(4)
housing_data['GarageType']

0       Attchd
1       Attchd
2       Attchd
3       Detchd
4       Attchd
         ...  
1455    Attchd
1456    Attchd
1457    Attchd
1458    Attchd
1459    Attchd
Name: GarageType, Length: 1460, dtype: object

In [9]:
# imputing missing values
housing_data['LotFrontage'] = housing_data['LotFrontage'].fillna(housing_data['LotFrontage'].median())


In [10]:
# GarageType
housing_data['GarageType'].nunique() # 6
housing_data['GarageType'].unique() # Attchd, Detchd, BuiltIn, Basment, CarPort, 2Types, NA
housing_data['GarageType'].value_counts()
mode_value = housing_data['GarageType'].mode()[0]
housing_data['GarageType'] = housing_data['GarageType'].fillna(mode_value)
housing_data['GarageType'] = housing_data['GarageType'].map({'Attchd':1, 'Detchd':2, 'BuiltIn':3, 'Basment':4, 'CarPort':5, '2Types':6})


In [11]:
# GarageYrBlt
housing_data['GarageYrBlt'].nunique() # 97
housing_data['GarageYrBlt'].unique()
housing_data['GarageYrBlt'].value_counts()
housing_data['GarageYrBlt'] = housing_data['GarageYrBlt'].fillna(housing_data['GarageYrBlt'].median())

In [12]:
# GarageFinish
housing_data['GarageFinish'].nunique() # 3
housing_data['GarageFinish'].unique()
housing_data['GarageFinish'].value_counts()
housing_data['GarageFinish'] = housing_data['GarageFinish'].fillna(housing_data['GarageFinish'].mode()[0])
housing_data['GarageFinish'] = housing_data['GarageFinish'].map({'Unf':1, 'RFn':2, 'Fin':3})


In [17]:
# GarageQual and GarageCond
quality_map = {'NA': 0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
housing_data['GarageScore'] = (0.6*housing_data['GarageQual'].map(quality_map) + 0.4*housing_data['GarageCond'].map(quality_map))
housing_data['GarageScore'] = housing_data['GarageScore'].fillna(0)
housing_data = housing_data.drop(['GarageQual', 'GarageCond'], axis=1)

In [18]:
na_counts = housing_data.isna().sum()
columnsWithNa = na_counts[na_counts > 0]
print(columnsWithNa.sort_values(ascending=False))

BsmtExposure    38
BsmtFinType2    38
BsmtQual        37
BsmtCond        37
BsmtFinType1    37
MasVnrArea       8
Electrical       1
dtype: int64


In [24]:
housing_data['Electrical'].value_counts()
housing_data['Electrical'] = housing_data['Electrical'].fillna(housing_data['Electrical'].mode()[0])
mapping = {'SBrkr':1, 'FuseA':2, 'FuseF':3, 'FuseP':4, 'Mix':5}
housing_data['Electrical'] = housing_data['Electrical'].map(mapping)
housing_data["Electrical"]

0       1
1       1
2       1
3       1
4       1
       ..
1455    1
1456    1
1457    1
1458    2
1459    1
Name: Electrical, Length: 1460, dtype: int64

In [25]:
na_counts = housing_data.isna().sum()
columnsWithNa = na_counts[na_counts > 0]
print(columnsWithNa.sort_values(ascending=False))

BsmtExposure    38
BsmtFinType2    38
BsmtQual        37
BsmtCond        37
BsmtFinType1    37
MasVnrArea       8
dtype: int64


In [26]:
housing_data['MasVnrArea'] = housing_data['MasVnrArea'].fillna(housing_data['MasVnrArea'].median())


In [27]:
# mapping of basement features
bsmt_qual_map = {
    'Ex': 5,  
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1
}
bsmt_cond_map = {
    'Ex': 5,  
    'Gd': 4,  
    'TA': 3,  
    'Fa': 2,  
    'Po': 1   
}
bsmt_exposure_map = {
    'Gd': 4,
    'Av': 3,
    'Mn': 2,
    'No': 1
}
bsmt_fintype_map = {
    'GLQ': 6,  
    'ALQ': 5,
    'BLQ': 4,
    'Rec': 3,
    'LwQ': 2,
    'Unf': 1
}
housing_data["BsmtQual"] = housing_data["BsmtQual"].map(bsmt_qual_map)
housing_data["BsmtCond"] = housing_data["BsmtCond"].map(bsmt_cond_map)
housing_data["BsmtExposure"] = housing_data["BsmtExposure"].map(bsmt_exposure_map)
housing_data["BsmtFinType1"] = housing_data["BsmtFinType1"].map(bsmt_fintype_map)
housing_data["BsmtFinType2"] = housing_data["BsmtFinType2"].map(bsmt_fintype_map)
housing_data['BsmtScore'] = (
    housing_data['BsmtQual']*0.3 + housing_data['BsmtCond']*0.25 + housing_data['BsmtExposure']*0.1 +
    housing_data['BsmtFinType1']*0.3 + housing_data['BsmtFinType2']*0.05
)

In [28]:
na_counts = housing_data.isna().sum()
columnsWithNa = na_counts[na_counts > 0]
print(columnsWithNa.sort_values(ascending=False))

BsmtScore       39
BsmtExposure    38
BsmtFinType2    38
BsmtQual        37
BsmtCond        37
BsmtFinType1    37
dtype: int64


In [29]:
housing_data['BsmtScore'] = housing_data['BsmtScore'].fillna(0)
housing_data = housing_data.drop(['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'], axis=1)

In [30]:
na_counts = housing_data.isna().sum()
columnsWithNa = na_counts[na_counts > 0]
print(columnsWithNa.sort_values(ascending=False))

Series([], dtype: int64)


In [31]:
housing_data.head(4)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,GarageScore,BsmtScore
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,2,2008,WD,Normal,208500,3.0,3.9
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,5,2007,WD,Normal,181500,3.0,3.9
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,9,2008,WD,Normal,223500,3.0,4.0
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,2,2006,WD,Abnorml,140000,3.0,3.55


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
