In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [2]:
training_set_path = '../dataset/train.csv'
training_set = pd.read_csv(training_set_path)
training_set.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

Create Target Object and Features used in model


Clean the data:

1. Check For Missing Value

In [3]:
missing_values = training_set.isnull().sum()

missing_values_df = pd.DataFrame(missing_values).reset_index()
missing_values_df.columns = ['Column', 'Missing Values']
print(missing_values_df)

           Column  Missing Values
0              Id               0
1      MSSubClass               0
2        MSZoning               0
3     LotFrontage             259
4         LotArea               0
..            ...             ...
76         MoSold               0
77         YrSold               0
78       SaleType               0
79  SaleCondition               0
80      SalePrice               0

[81 rows x 2 columns]


#### Dropping outlier rows

Observed in the above cell output, columns *MasVnrArea* and *Electrical* have very low numbers of missing values within.

It is safe to drop the corresponding rows without having a significant impact to the model.

In [4]:
clean_set = training_set
clean_set.dropna(axis=0, subset=['MasVnrArea', 'Electrical'], inplace=True)

#### Dropping Columns

Observed in the dataset, columns *Neighborhood*, *Condition1*, *Condition2*, *RoofMatl*, *Exterior1st* and *Exterior2nd* have a wide variety of responses, but have a relatively low impact of on the prediction result.

In [5]:
clean_set.drop(['Neighborhood', 'Condition1', 'Condition2', 'Heating', 'Electrical', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MiscFeature', 'SaleType'], axis = 1, inplace=True)

Handle Missing Value

Since the missing valuse in this dataset is meaningful (i.e. Missing value means the house does not have a specific feature.), we cannot just drop all the missing values. 
    Therefore, I will handle the missing values by filling 0, so that the meaning implied by the missing values will not be missed out.

In [6]:
# Replace missing values with 0
clean_set = clean_set.fillna(0)

#Reset the index
clean_set.reset_index(drop=True, inplace=True)

Clean Data!!

In [7]:
#Binary variable transformation!
#Convert the columns 'CentralAir' from (Y/N) to (1/0)
clean_set['CentralAir'] = clean_set['CentralAir'].replace({'Y': 1, 'N': 0})

#Convert the data in the column 'Street' to binary, where 'Pave' to 1 and 'Grvl' to 0.
clean_set['Street'] = clean_set['Street'].replace({'Pave': 1, 'Grvl': 0})

#Convert the data in the column 'Alley' to binary.
#The way of doing it is to group 'Pave' and 'Grvl'
#If the property has an alley access, then 1; 
#Otherwise 0.
clean_set['Alley'] = clean_set['Alley'].replace({'Pave': 1, 'Grvl': 1})

#Convert the data in the column 'LotShape' to binary, where 'Reg' to 1 and grouping 'IR1', 'IR2' and 'IR3' to 0.
#If the shape of the property is regular, then 1;
#Otherwise 0.
clean_set['LotShape'] = clean_set['LotShape'].replace({'Reg': 1, 'IR1': 0, 'IR2': 0, 'IR3': 0})

#Convert the data in the column 'LandContour' to binary, where 'Lvl' to 1 and grouping 'Bnk', 'HLS' and 'Low' to 0.
#If the property is near flat/level, then 1;
#Otherwise 0.
clean_set['LandContour'] = clean_set['LandContour'].replace({'Lvl': 1, 'Bnk': 0, 'HLS': 0, 'Low': 0})

#Convert the data in the column 'Utilities' to binary, where 'AllPub' to 1 and grouping 'NoSewr', 'NoSeWa' and 'ELO' to 0.
#If all type of utilities is available, then 1;
#Otherwise 0.
clean_set['Utilities'] = clean_set['Utilities'].replace({'AllPub': 1, 'NoSewr': 0, 'NoSeWa': 0, 'ELO': 0})



  clean_set['CentralAir'] = clean_set['CentralAir'].replace({'Y': 1, 'N': 0})
  clean_set['Street'] = clean_set['Street'].replace({'Pave': 1, 'Grvl': 0})
  clean_set['Alley'] = clean_set['Alley'].replace({'Pave': 1, 'Grvl': 1})
  clean_set['LotShape'] = clean_set['LotShape'].replace({'Reg': 1, 'IR1': 0, 'IR2': 0, 'IR3': 0})
  clean_set['LandContour'] = clean_set['LandContour'].replace({'Lvl': 1, 'Bnk': 0, 'HLS': 0, 'Low': 0})
  clean_set['Utilities'] = clean_set['Utilities'].replace({'AllPub': 1, 'NoSewr': 0, 'NoSeWa': 0, 'ELO': 0})


In [8]:
clean_set.select_dtypes(include=['object']).columns.tolist()

['MSZoning',
 'LotConfig',
 'LandSlope',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'HeatingQC',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'SaleCondition']

Handling Catagorical Columns which are suitable to use One-hot Encoding method.
Those columns include "MSZoning", "BldgType", "MasVnrType"

In [9]:
clean_set['MSZoning'].value_counts().reset_index()

Unnamed: 0,MSZoning,count
0,RL,1145
1,RM,218
2,FV,62
3,RH,16
4,C (all),10


In [10]:
clean_set['MSZoning'].unique()

array(['RL', 'RM', 'C (all)', 'FV', 'RH'], dtype=object)

We will use one-hot encoding method to handle this column.

In [11]:
# Specify the columns to transform
columns_to_transform_OH = ['MSZoning', 'LotConfig', 'BldgType', 'HouseStyle', 'Foundation', 'GarageType', 'SaleCondition', 'RoofStyle']

# Convert specified columns to strings
clean_set[columns_to_transform_OH] = clean_set[columns_to_transform_OH].astype(str)

# Create an instance of OneHotEncoder
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Apply the encoder to the specified columns
encoded_columns = pd.DataFrame(OH_encoder.fit_transform(clean_set[columns_to_transform_OH]))

# Get the column names for the one-hot encoded columns
encoded_columns.columns = OH_encoder.get_feature_names_out(columns_to_transform_OH)

# Concatenate the original DataFrame (excluding the transformed columns) with the new encoded columns
clean_set = pd.concat([clean_set.drop(columns_to_transform_OH, axis=1), encoded_columns], axis=1)


Ordinal Encoding

In [12]:
#Uniform the data type by transforming 0 to str
clean_set['BsmtQual'] = clean_set['BsmtQual'].replace({0: 'NoBsmt'})
clean_set['BsmtCond'] = clean_set['BsmtCond'].replace({0: 'NoBsmt'})
clean_set['BsmtExposure'] = clean_set['BsmtExposure'].replace({0: 'NoBsmt'})
clean_set['BsmtFinType1'] = clean_set['BsmtFinType1'].replace({0: 'NoBsmt'})
clean_set['BsmtFinType2'] = clean_set['BsmtFinType2'].replace({0: 'NoBsmt'})
clean_set['FireplaceQu'] = clean_set['FireplaceQu'].replace({0: 'NoFireplace'})
clean_set['GarageFinish'] = clean_set['GarageFinish'].replace({0: 'NoGarage'})
clean_set['GarageQual'] = clean_set['GarageQual'].replace({0: 'NoGarage'})
clean_set['GarageCond'] = clean_set['GarageCond'].replace({0: 'NoGarage'})
clean_set['PoolQC'] = clean_set['PoolQC'].replace({0: 'NoPool'})
clean_set['Fence'] = clean_set['Fence'].replace({0: 'NoFence'})

In [13]:
#Define the desired order for each categorical feature
LandSlope_order = ['Gtl', 'Mod', 'Sev']
ExterQual_order = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
ExterCond_order = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
BsmtQual_order = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NoBsmt']  
BsmtCond_order = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NoBsmt']  
BsmtExposure_order = ['Gd', 'Av', 'Mn', 'No', 'NoBsmt']    
BsmtFinType1_order = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NoBsmt']  
BsmtFinType2_order = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NoBsmt']  
HeatingQC_order = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
KitchenQual_order = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
Functional_order = ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal']
FireplaceQu_order = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NoFireplace']  
GarageFinish_order = ['Fin', 'RFn', 'Unf', 'NoGarage']          
GarageQual_order = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NoGarage']   
GarageCond_order = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NoGarage'] 
PavedDrive_order = ['Y', 'P', 'N']
PoolQC_order = ['Ex', 'Gd', 'TA', 'Fa', 'NoPool']              
Fence_order = ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'NoFence']  

# List of columns to transform
columns_to_transform_OE = [
    'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
    'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence'
]

# Convert specified columns to strings
clean_set[columns_to_transform_OE] = clean_set[columns_to_transform_OE].astype(str)


# Create an instance of OrdinalEncoder with specified categories
ordinal_encoder = OrdinalEncoder(categories=[
    LandSlope_order,
    ExterQual_order,
    ExterCond_order,
    BsmtQual_order,
    BsmtCond_order,
    BsmtExposure_order,
    BsmtFinType1_order,
    BsmtFinType2_order,
    HeatingQC_order,
    KitchenQual_order,
    Functional_order,
    FireplaceQu_order,
    GarageFinish_order,
    GarageQual_order,
    GarageCond_order,
    PavedDrive_order,
    PoolQC_order,
    Fence_order
])

# Apply the encoder to the specified columns
encoded_columns = ordinal_encoder.fit_transform(clean_set[columns_to_transform_OE])

# Create a new DataFrame with encoded values
encoded_df = pd.DataFrame(encoded_columns, columns=columns_to_transform_OE)


In [14]:
clean_set_dropped = clean_set.drop(columns=columns_to_transform_OE)

final_df = pd.concat([clean_set_dropped, encoded_df], axis=1)