In [22]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder

In [12]:
training_set_path = '../dataset/train.csv'
training_set = pd.read_csv(training_set_path)
training_set.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

Create Target Object and Features used in model


In [13]:
y = training_set.SalePrice
features = ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition']
X = training_set[features]

Clean the data:

1. Check For Missing Value

In [14]:
missing_values = training_set.isnull().sum()

missing_values_df = pd.DataFrame(missing_values).reset_index()
missing_values_df.columns = ['Column', 'Missing Values']
print(missing_values_df)

           Column  Missing Values
0              Id               0
1      MSSubClass               0
2        MSZoning               0
3     LotFrontage             259
4         LotArea               0
..            ...             ...
76         MoSold               0
77         YrSold               0
78       SaleType               0
79  SaleCondition               0
80      SalePrice               0

[81 rows x 2 columns]


#### Dropping outlier rows

Observed in the above cell output, columns *MasVnrArea* and *Electrical* have very low numbers of missing values within.

It is safe to drop the corresponding rows without having a significant impact to the model.

In [15]:
clean_set = training_set
clean_set.dropna(axis=0, subset=['MasVnrArea', 'Electrical'], inplace=True)

#### Dropping Columns

Observed in the dataset, columns *Neighborhood*, *Condition1*, *Condition2*, *RoofMatl*, *Exterior1st* and *Exterior2nd* have a wide variety of responses, but have a relatively low impact of on the prediction result.

In [None]:
clean_set.drop(['Neighborhood', 'Condition1', 'Condition2', 'RoofMatl', 'Exterior1st', 'Exterior2nd'], axis = 1, inplace=True)

Handle Missing Value

Since the missing valuse in this dataset is meaningful (i.e. Missing value means the house does not have a specific feature.), we cannot just drop all the missing values. 
    Therefore, I will handle the missing values by filling 0, so that the meaning implied by the missing values will not be missed out.

In [16]:
# Replace missing values with 0
clean_set = clean_set.fillna(0)

Clean Data!!

In [17]:
#Binary variable transformation!
#Convert the columns 'CentralAir' and 'PavedDrive' from (Y/N) to (1/0) and (Y/P/N) to (1/1/0)
clean_set['CentralAir'] = clean_set['CentralAir'].replace({'Y': 1, 'N': 0})
clean_set['PavedDrive'] = clean_set['PavedDrive'].replace({'Y': 1, 'P': 1, 'N': 0})

#Convert the data in the column 'Street' to binary, where 'Pave' to 1 and 'Grvl' to 0.
clean_set['Street'] = clean_set['Street'].replace({'Pave': 1, 'Grvl': 0})

#Convert the data in the column 'Alley' to binary.
#The way of doing it is to group 'Pave' and 'Grvl'
#If the property has an alley access, then 1; 
#Otherwise 0.
clean_set['Alley'] = clean_set['Alley'].replace({'Pave': 1, 'Grvl': 1})

#Convert the data in the column 'LotShape' to binary, where 'Reg' to 1 and grouping 'IR1', 'IR2' and 'IR3' to 0.
#If the shape of the property is regular, then 1;
#Otherwise 0.
clean_set['LotShape'] = clean_set['LotShape'].replace({'Reg': 1, 'IR1': 0, 'IR2': 0, 'IR3': 0})

#Convert the data in the column 'LandContour' to binary, where 'Lvl' to 1 and grouping 'Bnk', 'HLS' and 'Low' to 0.
#If the property is near flat/level, then 1;
#Otherwise 0.
clean_set['LandContour'] = clean_set['LandContour'].replace({'Lvl': 1, 'Bnk': 0, 'HLS': 0, 'Low': 0})

#Convert the data in the column 'Utilities' to binary, where 'AllPub' to 1 and grouping 'NoSewr', 'NoSeWa' and 'ELO' to 0.
#If all type of utilities is available, then 1;
#Otherwise 0.
clean_set['Utilities'] = clean_set['Utilities'].replace({'AllPub': 1, 'NoSewr': 0, 'NoSeWa': 0, 'ELO': 0})




  clean_set['CentralAir'] = clean_set['CentralAir'].replace({'Y': 1, 'N': 0})
  clean_set['PavedDrive'] = clean_set['PavedDrive'].replace({'Y': 1, 'P': 1, 'N': 0})
  clean_set['Street'] = clean_set['Street'].replace({'Pave': 1, 'Grvl': 0})
  clean_set['Alley'] = clean_set['Alley'].replace({'Pave': 1, 'Grvl': 1})
  clean_set['LotShape'] = clean_set['LotShape'].replace({'Reg': 1, 'IR1': 0, 'IR2': 0, 'IR3': 0})
  clean_set['LandContour'] = clean_set['LandContour'].replace({'Lvl': 1, 'Bnk': 0, 'HLS': 0, 'Low': 0})
  clean_set['Utilities'] = clean_set['Utilities'].replace({'AllPub': 1, 'NoSewr': 0, 'NoSeWa': 0, 'ELO': 0})


In [18]:
clean_set.select_dtypes(include=['object']).columns.tolist()

['MSZoning',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

Handling Catagorical Columns which are suitable to use One-hot Encoding method.
Those columns include "MSZoning", "BldgType", "MasVnrType"

In [19]:
clean_set['MSZoning'].value_counts().reset_index()

Unnamed: 0,MSZoning,count
0,RL,1145
1,RM,218
2,FV,62
3,RH,16
4,C (all),10


In [20]:
clean_set['MSZoning'].unique()

array(['RL', 'RM', 'C (all)', 'FV', 'RH'], dtype=object)

We will use one-hot encoding method to handle this column.

In [None]:
column_to_transform = 'MSZoning'

# Create an instance of OneHotEncoder
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Apply the encoder to the specified column
encoded_column = pd.DataFrame(OH_encoder.fit_transform(clean_set[[column_to_transform]]))

# Get the column names for the one-hot encoded columns
encoded_column.columns = OH_encoder.get_feature_names_out([column_to_transform])
