# Categorical Variables

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Load the housing data; the path is relative to the notebook's working directory.
data = pd.read_csv("../data/melb_data.csv")

# Separate the prediction target from the predictors.
y = data.Price
X = data.drop(columns="Price")

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, random_state=1, train_size=0.8, test_size=0.2
)

# Columns with at least one missing value in the training split. The list is
# derived from the training rows only and then removed from both splits so
# that they keep identical columns.
columns_with_missing = [
    col for col in X_train_full.columns if X_train_full[col].isnull().any()
]
# Reassign instead of drop(..., inplace=True): the frames handed back by
# train_test_split are left untouched and each name is bound to a new frame.
X_train_full = X_train_full.drop(columns=columns_with_missing)
X_test_full = X_test_full.drop(columns=columns_with_missing)

# Categorical (object-dtype) columns with fewer than 10 distinct values.
# The cheap dtype check runs first so nunique() is only computed where needed.
low_cardinality_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype == "object" and X_train_full[cname].nunique() < 10
]

# Columns stored as int64 or float64.
numerical_columns = [
    col
    for col in X_train_full.columns
    if X_train_full[col].dtype in ["int64", "float64"]
]

# Categorical columns first, then numerical ones (same order as before).
my_cols = low_cardinality_cols + numerical_columns

# Independent copies so later cells can modify them without touching the *_full frames.
X_train = X_train_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [81]:
# Boolean mask indexed by column name: True where the column's dtype is "object".
s = X_train.dtypes == "object"
# Keep only the column names whose mask entry is True.
object_cols = [column for column, is_object in s.items() if is_object]
object_cols


['Type', 'Method', 'Regionname']

In [76]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train , X_test , y_train , y_test):
    """Fit a random forest on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Features and target passed to ``fit``.
    X_test, y_test
        Features passed to ``predict`` and the true targets the predictions
        are compared against.

    Returns
    -------
    The result of ``mean_absolute_error(y_test, predictions)``.
    """
    # Fixed seed and tree count so repeated calls on the same data agree.
    forest = RandomForestRegressor(n_estimators=100, random_state=1)
    predictions = forest.fit(X_train, y_train).predict(X_test)
    return mean_absolute_error(y_test, predictions)


### Approach one - Drop columns

In [77]:
# Approach one: keep only the non-object columns of each split.
drop_X_train, drop_X_test = (
    frame.select_dtypes(exclude="object") for frame in (X_train, X_test)
)

# Last expression of the cell, so the resulting MAE is displayed.
score_dataset(drop_X_train, drop_X_test, y_train, y_test)

176556.1092096132

### Approach two - Ordinal Encoding

In [91]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()

# Work on copies so X_train / X_test keep their original string columns.
# The encoder is fitted on the training split only.
label_X_train = X_train.copy()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])

# The test split reuses the mapping learned above (transform, not fit_transform).
label_X_test = X_test.copy()
label_X_test[object_cols] = ordinal_encoder.transform(X_test[object_cols])

# Last expression of the cell, so the resulting MAE is displayed.
score_dataset(label_X_train, label_X_test, y_train, y_test)


165018.58378760784

### Approach three - One-hot Encoding

In [146]:
from sklearn.preprocessing import OneHotEncoder

# The dense-output switch was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old keyword was later removed. Try the current
# spelling first and fall back to the old one on installations that predate it.
try:
    oh_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    oh_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Fit on the training split only, then reuse the fitted encoder for the test
# split. With handle_unknown="ignore", a test category not seen during fit is
# encoded as all zeros instead of raising.
oh_train_values = oh_encoder.fit_transform(X_train[object_cols])
oh_test_values = oh_encoder.transform(X_test[object_cols])

# One output column per learned category, flattened in the encoder's order.
# The names are the bare category values, so they would collide if two
# different features shared a label.
oh_column_names = [item for array in oh_encoder.categories_ for item in array]

# The encoder returns plain arrays; restore the original row index so the
# concat below aligns rows with the numerical columns.
oh_cols_train = pd.DataFrame(oh_train_values, columns=oh_column_names, index=X_train.index)
oh_cols_test = pd.DataFrame(oh_test_values, columns=oh_column_names, index=X_test.index)

# Remove the original categorical columns; they are replaced by the one-hot columns.
num_X_train = X_train.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

# Numerical columns first, one-hot columns appended to the right.
OH_X_train = pd.concat([num_X_train, oh_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test, oh_cols_test], axis=1)

# Last expression of the cell, so the resulting MAE is displayed.
score_dataset(OH_X_train, OH_X_test, y_train, y_test)

162363.08246142787