# Random Forest Classifier

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
train = pd.read_parquet("../data/prototype/train.parquet")
stats = pd.read_parquet("../data/prototype/train_statistics.parquet")
validation = pd.read_parquet("../data/prototype/validation.parquet")

FEATURE_COLUMNS = [
    "Time_spent_Alone",
    "Social_event_attendance",
    "Going_outside",
    "Friends_circle_size",
    "Post_frequency",
]
TARGET_COLUMN = "Personality"
SEED = 42

In [3]:
# Default strategy for NaNs
for col in FEATURE_COLUMNS:
    validation[col] = validation[col].fillna(value=stats[col]["mean"])

In [4]:
X_train = train[FEATURE_COLUMNS].values
y_train = train[TARGET_COLUMN].values
X_test = validation[FEATURE_COLUMNS].values
y_test = validation[TARGET_COLUMN].values

In [5]:
results = []

for n_estimators in [50, 100, 500, 1000]:
    for criterion in ["gini", "entropy", "log_loss"]:
        for max_features in ["sqrt", "log2", len(FEATURE_COLUMNS), 3, 1]:
            model = RandomForestClassifier(
                n_estimators=n_estimators,
                criterion=criterion,
                max_features=max_features,
                random_state=SEED,
            )
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            results.append(
                {
                    "Estimators": n_estimators,
                    "Criterion": criterion,
                    "Max_Features": max_features,
                    "Accuracy": accuracy_score(y_test, preds),
                }
            )
results = pd.DataFrame(results).sort_values("Accuracy", ascending=False)
results

Unnamed: 0,Estimators,Criterion,Max_Features,Accuracy
59,1000,log_loss,1,0.966802
54,1000,entropy,1,0.966802
19,100,gini,1,0.966532
34,500,gini,1,0.966532
49,1000,gini,1,0.966532
44,500,log_loss,1,0.966262
39,500,entropy,1,0.966262
29,100,log_loss,1,0.965992
24,100,entropy,1,0.965992
4,50,gini,1,0.965722
