In [1]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv("../data/train_data_preprocessed.csv")
X = data.drop(['smoking'], axis=1)
y = data['smoking']

In [3]:
def objective(trial):
    
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    max_depth = trial.suggest_int("max_depth", 10, 50)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 8)
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
    
    rf = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth,
        min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
        criterion=criterion, random_state=42
    )
    
    
    scores = cross_val_score(rf, X, y, cv=3, scoring='accuracy')
    accuracy = scores.mean()
    
    return accuracy

In [4]:
import optuna
import logging
optuna.logging.get_logger("optuna").setLevel(logging.INFO)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)  


[I 2024-06-10 12:00:16,996] A new study created in memory with name: no-name-a83a49dd-0f75-4224-be0a-04c0eb7c6b41
[I 2024-06-10 12:06:55,565] Trial 0 finished with value: 0.7744602502017757 and parameters: {'n_estimators': 380, 'max_depth': 46, 'min_samples_split': 8, 'min_samples_leaf': 1, 'criterion': 'entropy'}. Best is trial 0 with value: 0.7744602502017757.
[I 2024-06-10 12:08:19,384] Trial 1 finished with value: 0.7721045197740114 and parameters: {'n_estimators': 125, 'max_depth': 15, 'min_samples_split': 9, 'min_samples_leaf': 5, 'criterion': 'gini'}. Best is trial 0 with value: 0.7744602502017757.
[I 2024-06-10 12:17:20,350] Trial 2 finished with value: 0.7739810330912027 and parameters: {'n_estimators': 683, 'max_depth': 31, 'min_samples_split': 10, 'min_samples_leaf': 8, 'criterion': 'gini'}. Best is trial 0 with value: 0.7744602502017757.
[I 2024-06-10 12:25:26,532] Trial 3 finished with value: 0.7743492736077483 and parameters: {'n_estimators': 445, 'max_depth': 33, 'min_sa

KeyboardInterrupt: 

In [None]:


best_rf_params = study.best_params
print("Best hyperparameters: ", best_rf_params)


best_rf = RandomForestClassifier(**best_rf_params, random_state=42)
best_rf.fit(X, y)


test = pd.read_csv("../data/test_data_preprocessed.csv")
test_with_id = pd.read_csv("../data/test.csv")

In [None]:

y_prob = best_rf.predict_proba(test)
class_index = list(best_rf.classes_).index(1)
smoking_probabilities = y_prob[:, class_index]

In [None]:
submission = pd.DataFrame({
    'id': test_with_id['id'],
    'smoking': smoking_probabilities
})

submission.to_csv('submission1.csv', index=False)