In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split
# Read the data
X = pd.read_csv('train.csv', index_col='Id')
X_test = pd.read_csv('test.csv', index_col='Id')
# Remove rows with missing target, separate target from predictors
X.dropna(axis=0 , subset = ['SalePrice'], inplace = True)
y = X.SalePrice
X.drop(['SalePrice'],axis = 1, inplace = True)
# To keep things simple, we'll use only numerical predictors
cols_missing = [i for i in X.columns if X[i].isnull().sum() > 0]
X.drop(cols_missing,axis =1, inplace=True)
X_test.drop(cols_missing,axis =1, inplace=True)
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X,y,train_size =0.8,test_size = 0.2,random_state= 0)

In [3]:
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_absolute_error 
# function for comparing different approaches
def score_dataset(X_train,X_valid,y_train,y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train,y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

### Drop Categorical Variables¶
The easiest approach to dealing with categorical variables is to simply  <span style="background-color: #FFFF00">remove them from the dataset</span>. This approach will only  <span style="background-color: #FFFF00">work well if the columns did not contain useful information</span>.

In [5]:
# Fill in the lines below: drop columns in training and validation data
drop_X_train = X_train.select_dtypes(exclude = ['object'])
drop_X_valid = X_valid.select_dtypes(exclude = ['object'])

In [6]:
print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop categorical variables):
17837.82570776256


### Ordinal Encoding¶
<span style="background-color: #FFFF00">Ordinal encoding assigns each unique value to a different integer</span>.

This approach assumes an ordering of the categories: "Never" (0) < "Rarely" (1) < "Most days" (2) < "Every day" (3).

This assumption makes sense in this example, because there is an <span style="background-color: #FFFF00">indisputable ranking to the categories</span>. <span style="background-color: #FFFF00">Not all categorical variables have a clear ordering in the values</span>, but we refer to those that do as <span style="background-color: #FFFF00">ordinal variables</span>. For tree-based models (like decision trees and random forests), you can expect ordinal encoding to work well with <span style="background-color: #FFFF00">ordinal variables</span>.

<span style="background-color: #FFFF00">ranking as nominal variables.</span>

In [8]:
# Categorical columns in the training data
object_cols = [i for i in X_train.columns if X_train[i].dtype == 'object']
# Columns that can be safely ordinal encoded
good_label_cols = [col for col in object_cols if 
                   set(X_valid[col]).issubset(set(X_train[col]))]
bad_label_cols = list(set(object_cols)-set(good_label_cols))
# Problematic columns that will be dropped from the dataset
print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be ordinal encoded: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'PavedDrive', 'SaleType', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Condition2', 'RoofMatl', 'Functional']


In [9]:
from sklearn.preprocessing import OrdinalEncoder 
# Drop categorical columns that will not be encoded
label_X_train = X_train.drop(bad_label_cols, axis = 1)
label_X_valid = X_valid.drop(bad_label_cols, axis = 1)
# Apply ordinal encoder 
od = OrdinalEncoder()
label_X_train[good_label_cols] = od.fit_transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = od.transform(X_valid[good_label_cols])


In [10]:
print("MAE from Approach 2 (Ordinal Encoding):") 
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from Approach 2 (Ordinal Encoding):
17098.01649543379


### One-Hot Encoding¶
One-hot encoding <span style="background-color: #FFFF00">creates new columns indicating the presence (or absence) of each possible value in the original data</span>. 
In the original dataset, "Color" is a categorical variable with three categories: "Red", "Yellow", and "Green". The corresponding one-hot encoding contains <span style="background-color: #FFFF00">one column for each possible value so in this ex we have new 3 columns, and one row for each row in the original dataset</span>. Wherever the original value was <span style="background-color: #FFFF00">"Red", we put a 1 in the "Red" column; if the original value was "Yellow", we put a 1 in the "Yellow" column, and so on</span>.

<span style="background-color: #FFFF00">not assume an ordering of the categories</span>

In [12]:
# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))
# Print number of unique entries by column, in ascending order
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
sorted(d.items(), key=lambda x: x[1])

[('Street', 2),
 ('Utilities', 2),
 ('CentralAir', 2),
 ('LandSlope', 3),
 ('PavedDrive', 3),
 ('LotShape', 4),
 ('LandContour', 4),
 ('ExterQual', 4),
 ('KitchenQual', 4),
 ('MSZoning', 5),
 ('LotConfig', 5),
 ('BldgType', 5),
 ('ExterCond', 5),
 ('HeatingQC', 5),
 ('Condition2', 6),
 ('RoofStyle', 6),
 ('Foundation', 6),
 ('Heating', 6),
 ('Functional', 6),
 ('SaleCondition', 6),
 ('RoofMatl', 7),
 ('HouseStyle', 8),
 ('Condition1', 9),
 ('SaleType', 9),
 ('Exterior1st', 15),
 ('Exterior2nd', 16),
 ('Neighborhood', 25)]

Co the su  dung " Nếu số lượng giá trị duy nhất nhỏ và có trật tự rõ ràng → Có thể là Ordinal.>10
Nếu số lượng giá trị lớn, không có trật tự → Có thể là Nominal.< 10

In [14]:
# Columns that will be one-hot encoded
low_cardinality_cols = [ col for col in X_train.columns if X_train[col].nunique() < 10]
# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))

In [15]:
from sklearn.preprocessing import OneHotEncoder 
oh = OneHotEncoder(handle_unknown = 'ignore')
OH_X_train = pd.DataFrame(oh.fit_transform(X_train[low_cardinality_cols]))
OH_X_valid = pd.DataFrame(oh.transform(X_valid[low_cardinality_cols]))
#return index
OH_X_train.index = X_train.index 
OH_X_valid.index = X_valid.index 
#drop object_cols 
num_x_train = X_train.drop(object_cols, axis = 1)
num_x_valid = X_valid.drop(object_cols, axis = 1)

OH_X_train = pd.concat([OH_X_train,num_x_train],axis = 1)
OH_X_valid = pd.concat([OH_X_valid,num_x_valid],axis = 1)

In [16]:
from sklearn.preprocessing import OneHotEncoder

# Use as many lines of code as you need!
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)



In [17]:
print("MAE from Approach 3 (One-Hot Encoding):") 
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

MAE from Approach 3 (One-Hot Encoding):


TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.