In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Raw CSV locations in the GitHub repository.
# NOTE(review): the path mentions "melbourne-housing-snapshot", but later cells
# rely on a 'SalePrice' column — confirm these files are the intended dataset.
train_url = "https://github.com/neurone0/explore_data/raw/main/melbourne-housing-snapshot/Data/train.csv"
test_url = "https://github.com/neurone0/explore_data/raw/main/melbourne-housing-snapshot/Data/test.csv"

# Load both files, using the "Id" column as the row index.
X, X_test = (pd.read_csv(url, index_col="Id") for url in (train_url, test_url))

In [3]:
# Rows without a target value cannot be used for training, so discard them.
X = X.dropna(axis=0, subset=['SalePrice'])
# Split the target off; the remaining columns are the predictors.
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

In [4]:
# To keep things simple, discard every column that has a missing value in the
# training predictors, and remove the same columns from the test set.
# NOTE(review): only X is inspected here, so X_test may still contain NaNs in
# other columns — verify before predicting on it.
cols_missing = X.columns[X.isnull().any()].tolist()
X = X.drop(columns=cols_missing)
X_test = X_test.drop(columns=cols_missing)

In [5]:
# Hold out 20% of the training rows as a validation set (seeded, so the split is repeatable).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [6]:
#create a function for test scores
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def scores(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, X_valid
        Feature tables used for fitting and for evaluation, respectively.
    y_train, y_valid
        Target values matching X_train and X_valid.
    n_estimators : int, optional
        Number of trees passed to RandomForestRegressor. Defaults to 100,
        the value that was previously hard-coded.
    random_state : int, optional
        Seed passed to RandomForestRegressor. Defaults to 0, the value that
        was previously hard-coded.

    Returns
    -------
    The mean absolute error between y_valid and the predictions for X_valid.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    return mean_absolute_error(y_valid, predictions)

In [10]:
# Approach 1: keep only the numeric columns of the training and validation splits.
numeric_kinds = ['number', 'float']
drop_X_train = X_train.select_dtypes(include=numeric_kinds)
drop_X_valid = X_valid.select_dtypes(include=numeric_kinds)

# Show both reduced frames (training first, then validation).
for numeric_frame in (drop_X_train, drop_X_valid):
    print(numeric_frame)

      MSSubClass  LotArea  OverallQual  OverallCond  YearBuilt  YearRemodAdd  \
Id                                                                             
619           20    11694            9            5       2007          2007   
871           20     6600            5            5       1962          1962   
93            30    13360            5            7       1921          2006   
818           20    13265            8            5       2002          2002   
303           20    13704            7            5       2001          2002   
...          ...      ...          ...          ...        ...           ...   
764           60     9430            8            5       1999          1999   
836           20     9600            4            7       1950          1995   
1217          90     8930            6            5       1978          1978   
560          120     3196            7            5       2003          2004   
685           60    16770            7  

In [11]:
# Score the numeric-only feature set; the bare name on the last line displays the value.
approach_1_mae = scores(drop_X_train, drop_X_valid, y_train, y_valid)
print("MAE from Approach 1 (Drop categorical variables):")
approach_1_mae

MAE from Approach 1 (Drop categorical variables):


17837.82570776256

In [12]:
# Compare the categories of 'Condition2' that occur in each split.
train_condition2_values = X_train['Condition2'].unique()
valid_condition2_values = X_valid['Condition2'].unique()
print("Unique values in 'Condition2' column in training data:", train_condition2_values)
print("\nUnique values in 'Condition2' column in validation data:", valid_condition2_values)

Unique values in 'Condition2' column in training data: ['Norm' 'PosA' 'Feedr' 'PosN' 'Artery' 'RRAe']

Unique values in 'Condition2' column in validation data: ['Norm' 'RRAn' 'RRNn' 'Artery' 'Feedr' 'PosN']


In [13]:
# OrdinalEncoder would raise an error on 'Condition2': the validation data contains categories (e.g. 'RRAn', 'RRNn') that never appear in the training data.


In [None]:
# Categorical columns in the training data (those stored with the 'object' dtype).
object_cols = [name for name, dtype in X_train.dtypes.items() if dtype == 'object']
object_cols


In [None]:
# A column can be safely ordinal encoded only when every value seen in the
# validation split also occurs in the training split.
good_cols = [
    name for name in object_cols
    if set(X_valid[name]) <= set(X_train[name])
]
good_cols


In [17]:
# Problematic columns that will be dropped from the dataset: the categorical
# columns that did not qualify for good_cols.
# Built in object_cols order rather than via a set difference, so the list is
# identical on every run instead of depending on string hash ordering.
bad_label_cols = [col for col in object_cols if col not in good_cols]
bad_label_cols

['RoofMatl', 'Condition2', 'Functional']

In [None]:
# Report which categorical columns are encoded and which are discarded.
for heading, column_names in (
    ('Categorical columns that will be ordinal encoded:', good_cols),
    ('\nCategorical columns that will be dropped from the dataset:', bad_label_cols),
):
    print(heading, column_names)

In [19]:
from sklearn.preprocessing import OrdinalEncoder
# Remove the categorical columns that cannot be encoded safely; the source
# frames X_train / X_valid are left unchanged.
label_X_train = X_train.drop(columns=bad_label_cols)
label_X_valid = X_valid.drop(columns=bad_label_cols)

In [21]:
# Apply the ordinal encoder: learn the category mapping from the training
# split only, then reuse that mapping for the validation split.
myencoder = OrdinalEncoder()
encoded_train = myencoder.fit_transform(X_train[good_cols])
encoded_valid = myencoder.transform(X_valid[good_cols])
label_X_train[good_cols] = encoded_train
label_X_valid[good_cols] = encoded_valid

In [23]:
# Score the ordinal-encoded feature set.
approach_2_mae = scores(label_X_train, label_X_valid, y_train, y_valid)
print("MAE from Approach 2 (Ordinal Encoding):")
print(approach_2_mae)

MAE from Approach 2 (Ordinal Encoding):
17098.01649543379


In [29]:
# Count the distinct entries of every categorical column in the training data,
# then pair each column name with its count.
object_nunique = [X_train[col].nunique() for col in object_cols]
d = dict(zip(object_cols, object_nunique))

In [30]:
# Show (column, unique-count) pairs ordered from fewest to most unique entries.
sorted(d.items(), key=lambda name_and_count: name_and_count[1])


[('Street', 2),
 ('Utilities', 2),
 ('CentralAir', 2),
 ('LandSlope', 3),
 ('PavedDrive', 3),
 ('LotShape', 4),
 ('LandContour', 4),
 ('ExterQual', 4),
 ('KitchenQual', 4),
 ('MSZoning', 5),
 ('LotConfig', 5),
 ('BldgType', 5),
 ('ExterCond', 5),
 ('HeatingQC', 5),
 ('Condition2', 6),
 ('RoofStyle', 6),
 ('Foundation', 6),
 ('Heating', 6),
 ('Functional', 6),
 ('SaleCondition', 6),
 ('RoofMatl', 7),
 ('HouseStyle', 8),
 ('Condition1', 9),
 ('SaleType', 9),
 ('Exterior1st', 15),
 ('Exterior2nd', 16),
 ('Neighborhood', 25)]

In [31]:
# Columns that will be one-hot encoded: categorical columns with fewer than
# 10 distinct values in the training data.
low_cardinality_cols = [
    name
    for name, n_unique in X_train[object_cols].nunique().items()
    if n_unique < 10
]

In [32]:
# Columns that will be dropped from the dataset: the categorical columns that
# are not in low_cardinality_cols.
# Built in object_cols order rather than via a set difference, so the list is
# identical on every run instead of depending on string hash ordering.
high_cardinality_cols = [col for col in object_cols if col not in low_cardinality_cols]

In [33]:
# Report which categorical columns are one-hot encoded and which are discarded.
for heading, column_names in (
    ('Categorical columns that will be one-hot encoded:', low_cardinality_cols),
    ('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols),
):
    print(heading, column_names)

Categorical columns that will be one-hot encoded: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Exterior1st', 'Exterior2nd', 'Neighborhood']


In [35]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encoder configured to ignore categories it did not see while fitting
# (handle_unknown='ignore') and to return a dense array (sparse_output=False).
myhotencoder = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
)

In [36]:
# Fit the encoder on the training categories only, then reuse it for validation.
train_onehot = myhotencoder.fit_transform(X_train[low_cardinality_cols])
valid_onehot = myhotencoder.transform(X_valid[low_cardinality_cols])
OH_cols_train = pd.DataFrame(train_onehot)
OH_cols_valid = pd.DataFrame(valid_onehot)

In [38]:
# The frames built from the encoder output have a fresh default index; copy the
# original row labels back so they line up with X_train / X_valid.
OH_cols_train.index, OH_cols_valid.index = X_train.index, X_valid.index

In [39]:
# Sanity check: display the index of the one-hot frame, which was just set to X_train's index.
OH_cols_train.index

Index([ 619,  871,   93,  818,  303, 1455,   41,  960,   76, 1390,
       ...
       1095,  600,  278, 1034, 1384,  764,  836, 1217,  560,  685],
      dtype='int64', name='Id', length=1168)

In [41]:
# Strip every categorical column; the one-hot columns take their place later.
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)

In [42]:
# Place the one-hot encoded columns to the right of the numerical features.
OH_X_train = pd.concat((num_X_train, OH_cols_train), axis='columns')
OH_X_valid = pd.concat((num_X_valid, OH_cols_valid), axis='columns')

In [43]:
# Ensure all column labels are strings: the one-hot frames were built without
# column names, so the concatenated frames mix those labels with the original
# string labels of the numerical features.
for combined_frame in (OH_X_train, OH_X_valid):
    combined_frame.columns = combined_frame.columns.astype(str)

In [44]:
# Score the one-hot encoded feature set.
approach_3_mae = scores(OH_X_train, OH_X_valid, y_train, y_valid)
print("MAE from Approach 3 (One-Hot Encoding):")
print(approach_3_mae)

MAE from Approach 3 (One-Hot Encoding):
17525.345719178084
