# House price prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

np.random.seed(12345)

This notebook presents some twiddling with the encoding of categorical features in order to make use of those non-numeric columns. Specifically there are two methods mentioned - one-hot encoding and label encoding. Firstly, those two are compared in terms of their efficacy and I work around a bit just to get accustomed to them. Finally, is use both of them simultaneously to get best possible results.

#### Preparing the data

In [2]:
train_set = pd.read_csv('data/train.csv', index_col='Id')
test_set = pd.read_csv('data/test.csv', index_col='Id')

In [3]:
train_set.dropna(axis=0, subset=['SalePrice'], inplace=True)

X = train_set.drop(['SalePrice'], axis=1)
y = train_set['SalePrice']

In [4]:
cols_with_missing = [col for col in X.columns if X[col].isnull().any()] 
X.drop(cols_with_missing, axis=1, inplace=True)
X_test = test_set.drop(cols_with_missing, axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=1
)

#### One-Hot encoding

In [5]:
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
high_cardinality_cols = list(set(object_cols) - set(low_cardinality_cols))

In [6]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))

OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

In [7]:
model = RandomForestRegressor(criterion='mse', n_estimators=160, random_state=1)
model.fit(OH_X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=160,
                      n_jobs=None, oob_score=False, random_state=1, verbose=0,
                      warm_start=False)

In [8]:
predictions = model.predict(OH_X_valid)

In [9]:
mae = mean_absolute_error(predictions, y_valid)
print("MAE:", mae)

MAE: 16066.349315068494


#### Label encoding

In [10]:
good_label_cols = [col for col in object_cols if set(X_train[col]) == set(X_valid[col])]
bad_label_cols = list(set(object_cols) - set(good_label_cols))

In [11]:
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

label_encoder = LabelEncoder()
for col in good_label_cols:
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])

In [12]:
model = RandomForestRegressor(criterion='mse', n_estimators=160, random_state=1)
model.fit(label_X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=160,
                      n_jobs=None, oob_score=False, random_state=1, verbose=0,
                      warm_start=False)

In [13]:
predictions = model.predict(label_X_valid)

In [14]:
mae = mean_absolute_error(predictions, y_valid)
print("MAE:", mae)

MAE: 16210.193664383563


#### Final (One-Hot encoding)

In [15]:
imputer = SimpleImputer(strategy='most_frequent')
imputed_X_test = pd.DataFrame(imputer.fit_transform(X_test))
imputed_X_test.columns = X_test.columns

In [16]:
full_X_train = pd.concat([X_train, X_valid], axis=0)
full_y_train = pd.concat([y_train, y_valid], axis=0)

In [17]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_encoder.fit(pd.concat([full_X_train[low_cardinality_cols],
                          imputed_X_test[low_cardinality_cols]],
                         axis=0))

OH_cols_train = pd.DataFrame(OH_encoder.transform(full_X_train[low_cardinality_cols]))
OH_cols_train.index = full_X_train.index
numeric_X_train = full_X_train.drop(object_cols, axis=1)
OH_X_train = pd.concat([numeric_X_train, OH_cols_train], axis=1)

OH_cols_test = pd.DataFrame(OH_encoder.transform(imputed_X_test[low_cardinality_cols]))
OH_cols_test.index = imputed_X_test.index
numeric_X_test = imputed_X_test.drop(object_cols, axis=1)
OH_X_test = pd.concat([numeric_X_test, OH_cols_test], axis=1)

In [18]:
model = RandomForestRegressor(criterion='mse', n_estimators=160, random_state=1)
model.fit(OH_X_train, full_y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=160,
                      n_jobs=None, oob_score=False, random_state=1, verbose=0,
                      warm_start=False)

In [19]:
preds_final = model.predict(OH_X_test)

#### One-Hot + Label encoding

Inferring from the past results, the best results can be obtained primarily using one-hot encoding for columns with rather low cardinality, and then label-encoding the remaining ones.

In [20]:
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))

remaining_cols = high_cardinality_cols

In [21]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_encoder.fit(pd.concat([X_train[low_cardinality_cols],
                          X_valid[low_cardinality_cols]],
                         axis=0))

OH_cols_train = pd.DataFrame(OH_encoder.transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

numeric_X_train = X_train.drop(low_cardinality_cols, axis=1)
numeric_X_valid = X_valid.drop(low_cardinality_cols, axis=1)

OH_label_X_train = pd.concat([numeric_X_train, OH_cols_train], axis=1)
OH_label_X_valid = pd.concat([numeric_X_valid, OH_cols_valid], axis=1)

In [22]:
label_encoder = LabelEncoder()

for col in remaining_cols:
    label_encoder.fit(pd.concat([X_train[col], X_valid[col]], axis=0))
    OH_label_X_train[col] = label_encoder.transform(X_train[col])
    OH_label_X_valid[col] = label_encoder.transform(X_valid[col])

In [23]:
model = RandomForestRegressor(criterion='mse', n_estimators=160, random_state=1)
model.fit(OH_label_X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=160,
                      n_jobs=None, oob_score=False, random_state=1, verbose=0,
                      warm_start=False)

In [24]:
predictions = model.predict(OH_label_X_valid)

In [25]:
mae = mean_absolute_error(predictions, y_valid)
print("MAE:", mae)

MAE: 15967.157127568491


#### Final (One-Hot + Label encoding)

In [26]:
imputer = SimpleImputer(strategy='most_frequent')
imputed_X_test = pd.DataFrame(imputer.fit_transform(X_test))
imputed_X_test.columns = X_test.columns

In [27]:
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))

remaining_cols = high_cardinality_cols

In [28]:
full_X_train = pd.concat([X_train, X_valid], axis=0)
full_y_train = pd.concat([y_train, y_valid], axis=0)

In [29]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_encoder.fit(pd.concat([full_X_train[low_cardinality_cols], 
                          imputed_X_test[low_cardinality_cols]],
                         axis=0))

OH_cols_train = pd.DataFrame(OH_encoder.transform(full_X_train[low_cardinality_cols]))
OH_cols_train.index = full_X_train.index
numeric_X_train = full_X_train.drop(low_cardinality_cols, axis=1)
OH_label_X_train = pd.concat([numeric_X_train, OH_cols_train], axis=1)

OH_cols_test = pd.DataFrame(OH_encoder.transform(imputed_X_test[low_cardinality_cols]))
OH_cols_test.index = imputed_X_test.index
numeric_X_test = imputed_X_test.drop(low_cardinality_cols, axis=1)
OH_label_X_test = pd.concat([numeric_X_test, OH_cols_test], axis=1)

In [30]:
label_encoder = LabelEncoder()

for col in remaining_cols:
    label_encoder.fit(pd.concat([full_X_train[col], imputed_X_test[col]], axis=0))
    OH_label_X_train[col] = label_encoder.transform(full_X_train[col])
    OH_label_X_test[col] = label_encoder.transform(imputed_X_test[col])

In [31]:
model = RandomForestRegressor(criterion='mse', n_estimators=160, random_state=1)
model.fit(OH_label_X_train, full_y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=160,
                      n_jobs=None, oob_score=False, random_state=1, verbose=0,
                      warm_start=False)

In [32]:
preds_final = model.predict(OH_label_X_test)

In [33]:
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_final})
output.to_csv('submission.csv', index=False)