# Decision Tree Classifier

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# --- Configuration ----------------------------------------------------------
# Seed shared by the stochastic steps of this notebook.
SEED = 42

# Label column to predict.
TARGET_COLUMN = "Personality"

# Model inputs. Kept as a list (not a tuple) because it is used directly for
# DataFrame column selection.
FEATURE_COLUMNS = [
    "Time_spent_Alone",
    "Social_event_attendance",
    "Going_outside",
    "Friends_circle_size",
    "Post_frequency",
]

# --- Data -------------------------------------------------------------------
# Prototype splits plus the statistics table that accompanies the train split.
# NOTE(review): `stats` is later indexed as stats[<feature>]["mean"], so it
# presumably holds one column per feature and a row labelled "mean" -- confirm.
train = pd.read_parquet("../data/prototype/train.parquet")
stats = pd.read_parquet("../data/prototype/train_statistics.parquet")
validation = pd.read_parquet("../data/prototype/validation.parquet")

In [3]:
# Default NaN strategy: replace missing feature values in the validation split
# with the per-feature "mean" entry read from `stats`.
# NOTE(review): only `validation` is imputed here; `train` is used as loaded,
# so it is presumably NaN-free already -- confirm upstream.
mean_by_feature = {name: stats[name]["mean"] for name in FEATURE_COLUMNS}
for col, fill_value in mean_by_feature.items():
    validation[col] = validation[col].fillna(value=fill_value)

In [4]:
# Turn the pandas selections into plain NumPy arrays for scikit-learn.
# `.to_numpy()` is the accessor the pandas documentation recommends over
# `.values`: it always yields an ndarray, whereas `Series.values` can hand back
# a pandas extension array depending on the column dtype.
# NOTE(review): the "test" arrays are built from the `validation` frame; the
# X_test / y_test names are kept because the following cell refers to them.
X_train = train[FEATURE_COLUMNS].to_numpy()
y_train = train[TARGET_COLUMN].to_numpy()
X_test = validation[FEATURE_COLUMNS].to_numpy()
y_test = validation[TARGET_COLUMN].to_numpy()

In [5]:
# Exhaustive sweep over split criterion x max_features x splitter.
# The grid is flattened up front in the same nesting order as a triple loop
# (criterion outermost, splitter innermost), so each row index in the final
# table identifies one fixed parameter combination.
param_grid = [
    (crit, n_feats, split_mode)
    for crit in ["gini", "entropy", "log_loss"]
    for n_feats in ["sqrt", "log2", len(FEATURE_COLUMNS), 3, 1]
    for split_mode in ["best", "random"]
]

records = []
for criterion, max_features, splitter in param_grid:
    # Every tree gets the same seed, so runs differ only in the swept settings.
    model = DecisionTreeClassifier(
        random_state=SEED,
        splitter=splitter,
        max_features=max_features,
        criterion=criterion,
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # One row per combination; accuracy is measured on the X_test / y_test arrays.
    row = {
        "Criterion": criterion,
        "Max_Features": max_features,
        "Splitter": splitter,
        "Accuracy": accuracy_score(y_test, preds),
    }
    records.append(row)

# Best-scoring configurations first.
results = pd.DataFrame(records).sort_values("Accuracy", ascending=False)
results

Unnamed: 0,Criterion,Max_Features,Splitter,Accuracy
0,gini,sqrt,best,0.94143
2,gini,log2,best,0.94143
10,entropy,sqrt,best,0.940891
12,entropy,log2,best,0.940891
22,log_loss,log2,best,0.940891
20,log_loss,sqrt,best,0.940891
19,entropy,1,random,0.939271
29,log_loss,1,random,0.939271
9,gini,1,random,0.939271
7,gini,3,random,0.939001
