# House price prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

np.random.seed(12345)

In this notebook there are a `Pipeline` and a `ColumnTransformer` functionalities used in order to tidy up the code a bit. Aside of that, this is mostly the same as the previous one.

#### Preparing the data

In [2]:
train_set = pd.read_csv('data/train.csv', index_col='Id')
test_set = pd.read_csv('data/test.csv', index_col='Id')

In [3]:
train_set.dropna(axis=0, subset=['SalePrice'], inplace=True)

X = train_set.drop(['SalePrice'], axis=1)
y = train_set['SalePrice']

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=1
)

In [4]:
numeric_features = X.select_dtypes(exclude='object').columns
categorical_features = X.select_dtypes(include='object').columns

#### One-Hot encoding

In [5]:
low_cardinality_cols = [col for col in categorical_features if X[col].nunique() < 10]
high_cardinality_cols = list(set(categorical_features) - set(low_cardinality_cols))

In [6]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [7]:
column_transformer = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, numeric_features),
    ('categorical_low_card', categorical_transformer, low_cardinality_cols),
    ('drop_unused', 'drop', categorical_features)
])

In [8]:
column_transformer.fit(X)

OH_X_train = pd.DataFrame(column_transformer.transform(X_train))
OH_X_train.columns = (
    pd.Index(numeric_features.to_list() + categorical_features.to_list() +
             np.arange(OH_X_train.columns.size - X_train.columns.size).tolist())
)

OH_X_valid = pd.DataFrame(column_transformer.transform(X_valid))
OH_X_valid.columns = (
    pd.Index(numeric_features.to_list() + categorical_features.to_list() +
             np.arange(OH_X_valid.columns.size - X_valid.columns.size).tolist())
)

In [9]:
model = RandomForestRegressor(criterion='mse', n_estimators=160, random_state=1)
model.fit(OH_X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=160,
                      n_jobs=None, oob_score=False, random_state=1, verbose=0,
                      warm_start=False)

In [10]:
predictions = model.predict(OH_X_valid)

In [11]:
mae = mean_absolute_error(predictions, y_valid)
print("MAE:", mae)

MAE: 16567.48028681507


#### Label encoding

In [12]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

In [13]:
column_transformer = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features)
])

In [14]:
column_transformer.fit(X)

label_X_train = pd.DataFrame(column_transformer.transform(X_train))
label_X_train.columns = (
    pd.Index(numeric_features.to_list() + categorical_features.to_list() +
             np.arange(label_X_train.columns.size - X_train.columns.size).tolist())
)

label_X_valid = pd.DataFrame(column_transformer.transform(X_valid))
label_X_valid.columns = (
    pd.Index(numeric_features.to_list() + categorical_features.to_list() +
             np.arange(label_X_valid.columns.size - X_valid.columns.size).tolist())
)

In [15]:
label_encoder = LabelEncoder()

for col in categorical_features:
    label_encoder.fit(pd.concat([label_X_train[col], label_X_valid[col]], axis=0))
    label_X_train[col] = label_encoder.transform(label_X_train[col])
    label_X_valid[col] = label_encoder.transform(label_X_valid[col])

In [16]:
model = RandomForestRegressor(criterion='mse', n_estimators=160, random_state=1)
model.fit(label_X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=160,
                      n_jobs=None, oob_score=False, random_state=1, verbose=0,
                      warm_start=False)

In [17]:
predictions = model.predict(label_X_valid)

In [18]:
mae = mean_absolute_error(predictions, y_valid)
print("MAE:", mae)

MAE: 16333.587435787671


#### One-Hot + Label encoding

In [19]:
low_cardinality_cols = [col for col in categorical_features if X[col].nunique() < 10]
high_cardinality_cols = list(set(categorical_features) - set(low_cardinality_cols))

In [20]:
numeric_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])
categorical_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])
categorical_onehot = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [21]:
column_imputer = ColumnTransformer(transformers=[
    ('numeric', numeric_imputer, numeric_features),
    ('categorical_impute', categorical_imputer, categorical_features)
])

In [22]:
column_imputer.fit(X)

imputed_X_train = pd.DataFrame(column_imputer.transform(X_train))
imputed_X_train.columns = (
    pd.Index(numeric_features.to_list() + categorical_features.to_list() +
             np.arange(imputed_X_train.columns.size - X_train.columns.size).tolist())
)

imputed_X_valid = pd.DataFrame(column_imputer.transform(X_valid))
imputed_X_valid.columns = (
    pd.Index(numeric_features.to_list() + categorical_features.to_list() +
             np.arange(imputed_X_valid.columns.size - X_valid.columns.size).tolist())
)

In [23]:
column_onehot = ColumnTransformer(transformers=[
    ('numeric', 'passthrough', numeric_features),
    ('categorical_low_card', categorical_onehot, low_cardinality_cols),
    ('drop_encoded', 'drop', low_cardinality_cols),
    ('label_enc', 'passthrough', high_cardinality_cols)
])

In [24]:
column_onehot.fit(pd.concat([imputed_X_train, imputed_X_valid], axis=0))

column_onehot.transform(imputed_X_train)
OH_label_X_train = pd.DataFrame(column_onehot.transform(imputed_X_train))
OH_label_X_train.columns = (
    pd.Index(numeric_features.to_list() +
             np.arange(OH_label_X_train.columns.size - numeric_features.size -
                       len(high_cardinality_cols)).tolist() +
             high_cardinality_cols)
)

OH_label_X_valid = pd.DataFrame(column_onehot.transform(imputed_X_valid))
OH_label_X_valid.columns = (
    pd.Index(numeric_features.to_list() +
             np.arange(OH_label_X_train.columns.size - numeric_features.size -
                       len(high_cardinality_cols)).tolist() +
             high_cardinality_cols)
)

In [25]:
label_encoder = LabelEncoder()

for col in high_cardinality_cols:
    label_encoder.fit(pd.concat([OH_label_X_train[col], OH_label_X_valid[col]], axis=0))
    OH_label_X_train[col] = label_encoder.transform(OH_label_X_train[col])
    OH_label_X_valid[col] = label_encoder.transform(OH_label_X_valid[col])

In [26]:
model = RandomForestRegressor(criterion='mse', n_estimators=160, random_state=1)
model.fit(OH_label_X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=160,
                      n_jobs=None, oob_score=False, random_state=1, verbose=0,
                      warm_start=False)

In [27]:
predictions = model.predict(OH_label_X_valid)

In [28]:
mae = mean_absolute_error(predictions, y_valid)
print("MAE:", mae)

MAE: 16395.526177226027
