In [20]:
from dec_tree import ClassifierTree, NUM_KIND, CAT_KIND
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier

Using the WeatherAUS dataset for comparison

In [11]:
# Read the WeatherAUS CSV and keep only rows without any missing value.
df = (
    pd.read_csv("data/weatherAUS.csv")
    .dropna()
)
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
5939,2009-01-01,Cobar,17.9,35.2,0.0,12.0,12.3,SSW,48.0,ENE,...,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,No,0.0,No
5940,2009-01-02,Cobar,18.4,28.9,0.0,14.8,13.0,S,37.0,SSE,...,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,No,0.0,No
5942,2009-01-04,Cobar,19.4,37.6,0.0,10.8,10.6,NNE,46.0,NNE,...,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,No,0.0,No
5943,2009-01-05,Cobar,21.9,38.4,0.0,11.4,12.2,WNW,31.0,WNW,...,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,No,0.0,No
5944,2009-01-06,Cobar,24.2,41.0,0.0,11.2,8.4,WNW,35.0,NW,...,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,No,0.0,No


In [12]:
# dropping leaky var and Date
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution is a no-op rather than a KeyError
# on the already-removed columns.
df = df.drop(columns=['RISK_MM', 'Date'], errors='ignore')

In [13]:
# encode binary Yes/No vars as 0/1 integers
# The dtype guard makes the cell safe to re-run: once a column is numeric,
# comparing it to 'Yes' again would silently turn every value into 0.
for col in ('RainToday', 'RainTomorrow'):
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = (df[col] == 'Yes') * 1

In [14]:
# train-test split: 75% of the rows for training, the remainder for testing.
# The fixed seed is reused wherever this notebook needs randomness.
RNG_SEED = 12345
TGT_VAR = 'RainTomorrow'

# Labels as a NumPy array; features are every column except the target.
y_full = df[TGT_VAR].to_numpy()
X_full = df.drop(columns=TGT_VAR)

X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y_full,
    train_size=0.75,
    random_state=RNG_SEED,
)

In [15]:
# peek at train set
# Displays the first rows of the training features as a quick sanity check
# that the target column is gone and the remaining columns look as expected.
X_train.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
61393,Sale,6.5,18.7,0.0,5.2,5.5,W,57.0,SW,W,...,30.0,63.0,87.0,1000.8,1001.5,0.0,8.0,15.5,10.8,0
34006,SydneyAirport,17.9,21.6,11.2,5.0,0.0,SSW,54.0,S,S,...,31.0,94.0,79.0,1023.0,1020.5,8.0,8.0,18.5,19.8,1
91593,Townsville,22.3,30.4,40.0,5.8,5.7,ENE,39.0,E,ENE,...,28.0,78.0,78.0,1001.9,999.2,6.0,5.0,27.3,28.2,1
100491,Nuriootpa,4.9,14.5,0.0,2.4,3.1,SE,17.0,NE,WSW,...,15.0,96.0,80.0,1019.9,1017.6,7.0,7.0,9.2,13.2,0
34819,SydneyAirport,12.1,17.7,6.8,3.2,2.3,SSW,48.0,SSW,S,...,31.0,89.0,75.0,1029.8,1028.4,7.0,7.0,12.8,15.3,1


In [16]:
# discern between cat and num features
# select_dtypes matches whole dtype families, so the split does not depend on
# the platform's default integer width the way `dtypes == 'int'` does.
cat_vars = X_train.select_dtypes(include='object').columns.to_list()
num_vars = X_train.select_dtypes(include='number').columns.to_list()
# Every column must land in exactly one group; a failure here means some dtype
# (e.g. bool or datetime) needs explicit handling before encoding.
assert len(cat_vars) + len(num_vars) == len(X_train.columns)
len(cat_vars), len(num_vars)

(4, 17)

## SciKit Learn Baseline

* using one-hot encoded categorical vars

In [19]:
# One-hot design matrix for the scikit-learn baseline: numeric columns pass
# through unchanged, categorical columns are expanded into 0/1 indicators.
# `sparse_threshold=0` makes the ColumnTransformer always return a dense array,
# so the encoder no longer needs the `sparse=False` keyword (renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4). This form works on
# both old and new scikit-learn releases.
preprocessor_onehot = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_vars),
        ('cat', OneHotEncoder(dtype=int), cat_vars),
    ],
    sparse_threshold=0,
)

# Fit the encoder on the training split only, then reuse it for the test split.
X_train_one_hot = preprocessor_onehot.fit_transform(X_train)
X_test_one_hot = preprocessor_onehot.transform(X_test)

In [22]:
%%time
# SK - init
# scikit-learn baseline tree with default hyperparameters, seeded with RNG_SEED
# (the same seed used for the train/test split).
dtc = DecisionTreeClassifier(random_state=RNG_SEED)

CPU times: total: 0 ns
Wall time: 0 ns


In [23]:
%%time
# SK - fit
# Train the baseline on the one-hot encoded training matrix; %%time records the
# cost so it can be compared with the custom tree's fit further down.
dtc.fit(X_train_one_hot, y_train)

CPU times: total: 1.5 s
Wall time: 1.52 s


DecisionTreeClassifier(random_state=12345)

In [24]:
%%time
# SK - predict
# Class predictions of the baseline on the one-hot encoded test matrix; these
# are later scored against y_test.
y_hat_dtc = dtc.predict(X_test_one_hot)

CPU times: total: 15.6 ms
Wall time: 26.4 ms


## Custom Tree Classifier

* using ordinal-encoded categorical vars (they only need to be represented as numbers; the numeric values themselves are ignored)

In [25]:
# Ordinal design matrix for the custom tree: numeric columns first (unchanged),
# then the categorical columns replaced by integer codes.
preprocessor_ordinal = ColumnTransformer(
    [
        ('num', 'passthrough', num_vars),
        ('cat', OrdinalEncoder(dtype=int), cat_vars),
    ]
)

# Category-to-code mapping is learned from the training split and reused on test.
X_train_ordinal = preprocessor_ordinal.fit_transform(X_train)
X_test_ordinal = preprocessor_ordinal.transform(X_test)

In [26]:
%%time
# own - init
# Custom tree from the local dec_tree module, using the Gini criterion.
# effective_features is set to the column count of the ordinal-encoded matrix.
# NOTE(review): presumably this makes every feature a split candidate (no
# feature subsampling) and min_samples bounds node size — confirm against
# dec_tree.ClassifierTree.
ct = ClassifierTree(min_samples=3, criterion="gini", effective_features=X_train_ordinal.shape[1])

CPU times: total: 0 ns
Wall time: 0 ns


In [27]:
%%time
# own - fit
# kinds are all num, then all cat because of ColumnTransformer order
kinds = [NUM_KIND] * len(num_vars) + [CAT_KIND] * len(cat_vars)

ct.fit(X_train_ordinal, y_train, kinds=kinds)

CPU times: total: 18.1 s
Wall time: 18.5 s


<dec_tree.ClassifierTree at 0x183f35353a0>

In [28]:
%%time
# own - predict
# Predictions of the custom tree on the ordinal-encoded test matrix; these are
# later scored against y_test alongside the scikit-learn baseline.
y_hat_ct = ct.predict(X_test_ordinal)

CPU times: total: 953 ms
Wall time: 931 ms


## Model comparison

Using:

* F1-score
* ROC-AUC

These metrics are used because this is a binary classification problem (the custom model also supports multiclass classification, though).

In [32]:
# Fitted estimators, their display labels, and their test-set predictions,
# kept in matching order.
models = [dtc, ct]
names = ["SK-Learn", "Own Tree"]
preds = [y_hat_dtc, y_hat_ct]

# NOTE(review): both metrics are computed from the outputs of .predict().
# For scikit-learn these are hard class labels (presumably the same for the
# custom tree), so the ROC AUC here reflects a single operating point rather
# than a ranking over probabilities — consider scoring with predicted
# probabilities if the custom tree exposes them.
print("Name\t\t\tF1-score\tROC AUC score")
for model, label, y_hat in zip(models, names, preds):
    f1_val = f1_score(y_test, y_hat)
    auc_val = roc_auc_score(y_test, y_hat)
    print(f"{label}\t\t{f1_val:.4f}\t\t{auc_val:.4f}")

Name			F1-score	ROC AUC score
SK-Learn		0.5433		0.7074
Own Tree		0.5378		0.7037


Some insights on the custom model:
* Performs almost exactly the same as the mainstream library implementation
* Can easily be modified (through `effective_features` and `min_samples`) for Forests and Boosting configurations
* The custom Classification model is indeed slower due to several reasons:
    * It uses exact matching for categorical features whenever cardinality drops below a certain threshold (which is better than One-Hot encoding)
    * It's not really optimized and only some parts are Numba-compiled vs. a highly-optimized, Cython-compiled implementation
* Has out-of-the-box support for categorical features and multi-class classification problems; categorical features only need to be ordinal-encoded so that they fit in a NumPy array
* It even has multiple strategies for categorical features, and can easily be expanded with others