In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [63]:
# Load the data twice: X is the modelling frame, X_test is the frame that is
# scored for the submission further down. Rows without a target are discarded.
# NOTE(review): X_test is read from 'train.csv', the same file as X, so the
# "test" predictions are made on rows the model was trained/validated on.
# Presumably a separate test file was intended -- confirm before submitting.
X = pd.read_csv('train.csv').dropna(axis=0, subset=['SalePrice'])
X_test = pd.read_csv('train.csv').dropna(axis=0, subset=['SalePrice'])

# Split the target off from the predictors.
y = X.SalePrice
X = X.drop(columns=['SalePrice'])

# Simplest missing-value strategy: drop every predictor column that contains
# at least one null in X, and drop the same columns from X_test.
cols_with_missing_values = X.columns[X.isnull().any()].tolist()
X = X.drop(columns=cols_with_missing_values)
X_test = X_test.drop(columns=cols_with_missing_values)

In [64]:
# Hold out 25% of the rows for validation; the fixed random_state makes the
# split (and therefore every MAE reported below) reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    random_state=0,
)

In [65]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_datasets(X_train, X_val, y_train, y_val):
    """Fit a random forest on the training split and score it on validation.

    Parameters
    ----------
    X_train, y_train : features and target used to fit the model.
    X_val, y_val : features and target used to evaluate the model.

    Returns
    -------
    The mean absolute error between ``y_val`` and the predictions for
    ``X_val``.
    """
    # Fixed n_estimators / random_state so the different encoding approaches
    # are compared with an identical model configuration.
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_val, forest.predict(X_val))

In [66]:
# Approach 1: discard every object-dtype (categorical) column and keep the rest.

drop_X_train, drop_X_val = (
    frame.select_dtypes(exclude=['object']) for frame in (X_train, X_val)
)

In [67]:
# Header and score on separate lines, emitted by a single print call.
print("MAE from Approach 1: ",
      score_datasets(drop_X_train, drop_X_val, y_train, y_val),
      sep="\n")

MAE from Approach 1: 
17734.536520547947


In [68]:
# Names of all object-dtype (categorical) columns in the training split.
object_cols = [name for name, kind in X_train.dtypes.items() if kind == 'object']

# A column is "good" for label encoding only when the training and validation
# splits contain exactly the same set of values; everything else is "bad".
good_label_cols = list(filter(
    lambda name: set(X_train[name]) == set(X_val[name]),
    object_cols,
))
bad_label_cols = list(set(object_cols) - set(good_label_cols))

In [69]:
#Approach 2: Label encoding

from sklearn.preprocessing import LabelEncoder

# Remove the categorical columns that cannot be label-encoded consistently.
label_X_train, label_X_val = (
    frame.drop(columns=bad_label_cols) for frame in (X_train, X_val)
)

# One encoder object is reused; it is re-fitted on the training values of each
# column before that column is transformed in both splits.
label_encoder = LabelEncoder()
for col in good_label_cols:
    label_encoder.fit(X_train[col])
    label_X_train[col] = label_encoder.transform(X_train[col])
    label_X_val[col] = label_encoder.transform(X_val[col])

In [70]:
# Header and score on separate lines, emitted by a single print call.
print("MAE from Approach 2: ",
      score_datasets(label_X_train, label_X_val, y_train, y_val),
      sep="\n")

MAE from Approach 2: 
17259.89101369863


In [71]:
# Number of distinct values in each categorical column of the training split.
object_nunique = [X_train[name].nunique() for name in object_cols]
d = dict(zip(object_cols, object_nunique))

# Display (column, cardinality) pairs from lowest to highest cardinality.
sorted(d.items(), key=lambda pair: pair[1])

[('Street', 2),
 ('Utilities', 2),
 ('CentralAir', 2),
 ('LandSlope', 3),
 ('PavedDrive', 3),
 ('LotShape', 4),
 ('LandContour', 4),
 ('ExterQual', 4),
 ('KitchenQual', 4),
 ('MSZoning', 5),
 ('LotConfig', 5),
 ('Condition2', 5),
 ('BldgType', 5),
 ('ExterCond', 5),
 ('HeatingQC', 5),
 ('RoofStyle', 6),
 ('Foundation', 6),
 ('Heating', 6),
 ('Functional', 6),
 ('SaleCondition', 6),
 ('RoofMatl', 7),
 ('HouseStyle', 8),
 ('Condition1', 9),
 ('SaleType', 9),
 ('Exterior1st', 15),
 ('Exterior2nd', 16),
 ('Neighborhood', 25)]

In [72]:
# Categorical columns with fewer than 10 distinct training values are the ones
# that get one-hot encoded; the remaining categorical columns are dropped.
low_cardinalty_cols = list(filter(
    lambda name: X_train[name].nunique() < 10,
    object_cols,
))
high_cardinalty_cols = list(set(object_cols) - set(low_cardinalty_cols))

In [73]:
#Approach 3: One Hot Encoding

from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the training split only; categories that appear solely in
# other data are ignored (encoded as all zeros) rather than raising.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# The encoder returns plain arrays, so the original row index is re-attached
# to keep the encoded columns aligned with the numeric columns in the concat.
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[low_cardinalty_cols]),
    index=X_train.index,
)
OH_cols_val = pd.DataFrame(
    OH_encoder.transform(X_val[low_cardinalty_cols]),
    index=X_val.index,
)

# All categorical columns are removed; they are replaced by the encoded ones.
num_X_train = X_train.drop(columns=object_cols)
num_X_val = X_val.drop(columns=object_cols)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_val = pd.concat([num_X_val, OH_cols_val], axis=1)

In [74]:
# Header and score on separate lines, emitted by a single print call.
print("MAE from Approach 3: ",
      score_datasets(OH_X_train, OH_X_val, y_train, y_val),
      sep="\n")

MAE from Approach 3: 
17368.00608219178


In [75]:
# Categorical columns of the frame that will be scored for the submission.
object_cols = [col for col in X_test.columns if X_test[col].dtype == 'object']

# OH_encoder was fitted on the low-cardinality columns chosen from X_train, and
# its transform() expects exactly those columns. Selecting by X_test's own
# cardinalities could yield a different list whenever X_test holds values that
# X_train does not, so the threshold is evaluated on X_train here to reproduce
# the fitted column set.
low_cardinalty_cols = [col for col in object_cols if X_train[col].nunique() < 10]
high_cardinalty_cols = list(set(object_cols) - set(low_cardinalty_cols))

In [76]:
# Encode the test frame with the already-fitted encoder (transform only) and
# re-attach the row index so the concat below aligns row by row.
OH_cols_test = pd.DataFrame(
    OH_encoder.transform(X_test[low_cardinalty_cols]),
    index=X_test.index,
)
num_X_test = X_test.drop(columns=object_cols)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

In [77]:
# Final model: same configuration as score_datasets, fitted on the one-hot
# encoded training split. The fit call is the last expression so the fitted
# estimator is displayed as the cell output.
my_model = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)
my_model.fit(X=OH_X_train, y=y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [78]:
# Remove the target column so the test frame has the same features the model
# was fitted on. errors='ignore' makes the cell safe to re-run: the previous
# in-place drop raised KeyError once 'SalePrice' had already been removed.
OH_X_test = OH_X_test.drop(columns=['SalePrice'], errors='ignore')

In [79]:
# Sanity check: both splits must end up with the same number of columns.
print(*(frame.shape for frame in (OH_X_val, OH_X_train)))

(365, 155) (1095, 155)


In [80]:
# Predict sale prices for every row of the encoded test frame.
test_preds = my_model.predict(X=OH_X_test)

In [81]:
# Build the submission table. The data is read without index_col, so the frame
# index is only a positional row label; prefer the real 'Id' column when the
# frame has one and fall back to the index otherwise.
# NOTE(review): assumes the source CSV carries an 'Id' column -- confirm.
if 'Id' in OH_X_test.columns:
    submission_ids = OH_X_test['Id'].values
else:
    submission_ids = OH_X_test.index

output = pd.DataFrame({'Id': submission_ids,
                       'SalePrice': test_preds})

# index=False: without it pandas writes the row index as an extra, unnamed
# first column, so the file would not be a plain two-column Id/SalePrice table.
output.to_csv('Categorical values - submission.csv', index=False)