In [34]:
#Categorical encoding

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

In [38]:
# Read the housing classification dataset; the path is relative to the
# notebook's working directory.
path = "housing-classification-iter3.csv"
data = pd.read_csv(filepath_or_buffer=path)

In [40]:
# X and y creation
# Select the target and build the feature matrix WITHOUT mutating `data`:
# popping the column in place made this cell fail with a KeyError when it was
# re-run and destroyed the raw frame. `drop` returns a new DataFrame, so the
# cell is idempotent and `data` stays intact.
y = data['Expensive']
X = data.drop(columns='Expensive')

In [None]:
# Data splitting: hold out 20% of the rows for testing. The fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

2. Categorical encoding - "MANUAL" approach (Without Pipelines)
To encode the categorical columns numerically, we follow these steps:

Select the categorical columns.
Fit a OneHotEncoder to them.
Transform the categorical columns with the encoder.
Convert the sparse matrix into a DataFrame.
Recover the names of the columns.
Concatenate the one-hot columns with the numerical columns.

In [None]:
# Separate non-numerical from numerical columns, for both train and test.
# Each generator applies the same dtype filter to (train, test) in that order.
X_train_cat, X_test_cat = (
    split.select_dtypes(exclude="number") for split in (X_train, X_test)
)
X_train_num, X_test_num = (
    split.select_dtypes(include="number") for split in (X_train, X_test)
)

In [None]:
# TREATMENT OF THE NUMERICAL FEATURES

# Imputer that replaces NaNs with the column mean; set_output makes it return
# DataFrames instead of NumPy arrays.
num_imputer = SimpleImputer(strategy="mean")
num_imputer.set_output(transform="pandas")

# Fit on the training split only, then apply the fitted imputer to both splits
# so the test split is filled with statistics learned from training data.
X_train_imputed_num = num_imputer.fit_transform(X_train_num)
X_test_imputed_num = num_imputer.transform(X_test_num)

In [None]:
# TREATMENT OF THE CATEGORICAL FEATURES

# NaNs are replaced with the constant label "unknown"; output is a DataFrame.
cat_imputer = (
    SimpleImputer(strategy="constant", fill_value="unknown")
    .set_output(transform="pandas")
)

In [None]:
# Fit the categorical imputer on the training split and transform it, then
# apply the same fitted imputer to the test split.
X_train_imputed_cat = cat_imputer.fit_transform(
    X_train_cat
)
X_test_imputed_cat = cat_imputer.transform(
    X_test_cat
)


In [None]:
#One-hot encoding
#import
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Initialize the one-hot encoder: drop the first category of every feature,
# produce dense output, and return a DataFrame.
my_onehot = (
    OneHotEncoder(drop="first", sparse_output=False)
    .set_output(transform="pandas")
)

In [None]:
# Fit: learn the categories of each column from the imputed training data.
my_onehot.fit(X=X_train_imputed_cat)

In [None]:
# Transform: one-hot encode the imputed training categoricals with the fitted
# encoder.
X_cat_imputed_onehot_train = my_onehot.transform(
    X_train_imputed_cat
)

In [None]:
# Concatenate the one-hot columns with the imputed numerical columns,
# column-wise, into a single training frame.
X_imputed_train = pd.concat(
    [X_cat_imputed_onehot_train, X_train_imputed_num],
    axis="columns",
)
X_imputed_train.head()

In [None]:
###3. Categorical encoding - "Automated" approach (Using Pipelines)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Inputs for the "numeric pipe" and the "categoric pipe".

# Column names routed to each branch: non-numeric columns go to the
# categorical branch, numeric columns to the numerical branch.
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

In [None]:
# Numerical pipeline: a single SimpleImputer(strategy="mean") step.
# make_pipeline names the step "simpleimputer", which the grid-search
# parameter keys rely on.
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
)


In [None]:
# Categorical pipeline: fill NaNs with the constant "N_A", then one-hot encode.
# handle_unknown="ignore" keeps transform from raising on categories that were
# not present when the encoder was fitted.
categoric_pipe = make_pipeline(
    SimpleImputer(fill_value="N_A", strategy="constant"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
)

In [None]:
# Preprocessor: a ColumnTransformer with two branches. Each entry is
# (branch name, pipeline, columns the pipeline is applied to).
preprocessor = ColumnTransformer(
    [
        ("num_pipe", numeric_pipe, X_num_columns),
        ("cat_pipe", categoric_pipe, X_cat_columns),
    ],
)


In [None]:
# Full pipeline: preprocessor followed by a Decision Tree.
# random_state pins the tree's internal randomness, so the grid-search
# results are reproducible across runs. 123 matches the seed used for the
# train/test split.
full_pipeline = make_pipeline(
    preprocessor,
    DecisionTreeClassifier(random_state=123),
).set_output(transform='pandas')

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


In [None]:
# Train a DecisionTree with GridSearch cross validation.

# Parameter grid. Keys use the "<step>__<parameter>" addressing scheme, with
# step names generated by make_pipeline and branch names taken from the
# ColumnTransformer.
param_grid = {
    # preprocessing: how the numeric imputer fills NaNs
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    # tree size / regularisation
    "decisiontreeclassifier__max_depth": range(2, 12),
    "decisiontreeclassifier__min_samples_leaf": range(3, 10, 2),
    "decisiontreeclassifier__min_samples_split": range(3, 40, 5),
    # split quality measure
    "decisiontreeclassifier__criterion": ["gini", "entropy"],
}


In [None]:
# Define the cross validation: an exhaustive search over param_grid, scoring
# every candidate with 5-fold cross validation on the full pipeline.
search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=0,
)


In [None]:
# Fit the grid search on the raw training split; imputation and encoding run
# inside the pipeline.
search.fit(X_train, y_train)

# Report the winning parameter combination and its best_score_, separated by
# an empty line.
print(
    f"The best parameters are {search.best_params_}",
    "",
    f"The average accuracy is {search.best_score_}",
    sep="\n",
)

In [None]:
# Training accuracy.
# X_train is passed directly (not a manually imputed frame) because the
# imputation is built into the pipeline.
train_predictions = search.predict(X_train)
print(f"The training accuracy is {accuracy_score(y_train, train_predictions)}")

In [None]:
# Testing accuracy: score the fitted search object on the held-out test split.
test_predictions = search.predict(X_test)
print(f"The testing accuracy is {accuracy_score(y_test, test_predictions)}")

The best parameters are {'columntransformer__num_pipe__simpleimputer__strategy': 'mean', 'decisiontreeclassifier__criterion': 'entropy', 'decisiontreeclassifier__max_depth': 6, 'decisiontreeclassifier__min_samples_leaf': 3, 'decisiontreeclassifier__min_samples_split': 38}

The average accuracy is 0.9212317963390925
The training accuracy is 0.9409246575342466
The testing accuracy is 0.9075342465753424