# House Prices Prediction

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

### Read Data

In [2]:
training_set_path = '../dataset/train.csv'
train0 = pd.read_csv(training_set_path)

testing_set_path = '../dataset/test.csv'
test0 = pd.read_csv(testing_set_path)

## Cleaning

### Drop Outlier

In [3]:
train1 = train0[train0.GrLivArea < 4500]
train1.dropna(axis=0, subset=['MasVnrArea', 'Electrical'], inplace=True)
train1.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train1.dropna(axis=0, subset=['MasVnrArea', 'Electrical'], inplace=True)


### Concrat train and test set

In [4]:
y = train1['SalePrice']
test_ids = test0['Id']

train1.drop(['Id', 'SalePrice'], axis=1, inplace=True)
test0.drop(['Id'], axis=1, inplace=True)

data1 = pd.concat([train1, test0], axis=0).reset_index(drop=True)
data1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train1.drop(['Id', 'SalePrice'], axis=1, inplace=True)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2903,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
2904,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
2905,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
2906,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


### Fill Missing Value (Catagorical)

In [5]:
data2 = data1.copy()

In [6]:
#For these columns, missing value means the feature is not available. So 'None' is filled, which is easier for future features engineering.
for column in [
    'Alley',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature'
]:
    data2[column] = data2[column].fillna("None")

#In the data description document, these columns are not supposed to contain missing value.
#Therefore, we are filling the mode of the column as the missing value
for column in [
    'MSZoning',
    'Utilities',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'Electrical',
    'KitchenQual',
    'Functional',
    'SaleType'
]:
    data2[column] = data2[column].fillna(data2[column].mode()[0])

### Fill Missing Value (Numeric)

In [7]:
for column in [
    'LotFrontage',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageYrBlt',
    'GarageCars',
    'GarageArea'
]:
    data2[column] = data2[column].fillna(0)

## Features Engineering

### Binary Transformation

In [8]:
data3 = data2.copy()

In [9]:
#Convert the columns 'CentralAir' from (Y/N) to (1/0)
data3['CentralAir'] = data3['CentralAir'].replace({'Y': 1, 'N': 0})

#Convert the data in the column 'Street' to binary, where 'Pave' to 1 and 'Grvl' to 0.
data3['Street'] = data3['Street'].replace({'Pave': 1, 'Grvl': 0})

#Convert the data in the column 'Alley' to binary.
#The way of doing it is to group 'Pave' and 'Grvl'
#If the property has an alley access, then 1; 
#Otherwise 0.
data3['Alley'] = data3['Alley'].replace({'Pave': 1, 'Grvl': 1, 'None': 0})

#Convert the data in the column 'LotShape' to binary, where 'Reg' to 1 and grouping 'IR1', 'IR2' and 'IR3' to 0.
#If the shape of the property is regular, then 1;
#Otherwise 0.
data3['LotShape'] = data3['LotShape'].replace({'Reg': 1, 'IR1': 0, 'IR2': 0, 'IR3': 0})

#Convert the data in the column 'LandContour' to binary, where 'Lvl' to 1 and grouping 'Bnk', 'HLS' and 'Low' to 0.
#If the property is near flat/level, then 1;
#Otherwise 0.
data3['LandContour'] = data3['LandContour'].replace({'Lvl': 1, 'Bnk': 0, 'HLS': 0, 'Low': 0})

#Convert the data in the column 'Utilities' to binary, where 'AllPub' to 1 and grouping 'NoSewr', 'NoSeWa' and 'ELO' to 0.
#If all type of utilities is available, then 1;
#Otherwise 0.
data3['Utilities'] = data3['Utilities'].replace({'AllPub': 1, 'NoSewr': 0, 'NoSeWa': 0, 'ELO': 0})

  data3['CentralAir'] = data3['CentralAir'].replace({'Y': 1, 'N': 0})
  data3['Street'] = data3['Street'].replace({'Pave': 1, 'Grvl': 0})
  data3['Alley'] = data3['Alley'].replace({'Pave': 1, 'Grvl': 1, 'None': 0})
  data3['LotShape'] = data3['LotShape'].replace({'Reg': 1, 'IR1': 0, 'IR2': 0, 'IR3': 0})
  data3['LandContour'] = data3['LandContour'].replace({'Lvl': 1, 'Bnk': 0, 'HLS': 0, 'Low': 0})
  data3['Utilities'] = data3['Utilities'].replace({'AllPub': 1, 'NoSewr': 0, 'NoSeWa': 0, 'ELO': 0})


### Ordinal Encoding

In [10]:
data4 = data3.copy()

In [11]:
#Define the desired order for each categorical feature
LandSlope_order = ['Sev', 'Mod', 'Gtl']
ExterQual_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
ExterCond_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
BsmtQual_order = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
BsmtCond_order = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
BsmtExposure_order = ['None', 'No', 'Mn', 'Av', 'Gd']
BsmtFinType1_order = ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
BsmtFinType2_order = ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
HeatingQC_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
KitchenQual_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
Functional_order = ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ']
FireplaceQu_order = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
GarageFinish_order = ['None', 'Unf', 'RFn', 'Fin']
GarageQual_order = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
GarageCond_order = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
PavedDrive_order = ['N', 'P', 'Y']
PoolQC_order = ['None', 'Fa', 'TA', 'Gd', 'Ex']
Fence_order = ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

# List of columns to transform
columns_to_transform_OE = [
    'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
    'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence'
]

# Convert specified columns to strings
data4[columns_to_transform_OE] = data4[columns_to_transform_OE].astype(str)

# Create an instance of OrdinalEncoder with specified categories
ordinal_encoder = OrdinalEncoder(categories=[
    LandSlope_order,
    ExterQual_order,
    ExterCond_order,
    BsmtQual_order,
    BsmtCond_order,
    BsmtExposure_order,
    BsmtFinType1_order,
    BsmtFinType2_order,
    HeatingQC_order,
    KitchenQual_order,
    Functional_order,
    FireplaceQu_order,
    GarageFinish_order,
    GarageQual_order,
    GarageCond_order,
    PavedDrive_order,
    PoolQC_order,
    Fence_order
])

# Apply the encoder to the specified columns
encoded_columns_OE = ordinal_encoder.fit_transform(data4[columns_to_transform_OE])

# Create a new DataFrame with encoded values
OE_df = pd.DataFrame(encoded_columns_OE, columns=columns_to_transform_OE)

clean_set_dropped = data4.drop(columns=columns_to_transform_OE)

data4 = pd.concat([clean_set_dropped, OE_df], axis=1)

In [None]:
train.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)