In [2]:
import numpy as np
import pandas as pd

from agent import *
from env import Env_Classifier_CrossEntropy
from utils import Experiment_Classiflier

import os
import warnings
from sklearn.neighbors import KNeighborsClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
from sklearn.linear_model import LogisticRegression  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
from sklearn.svm import SVC  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.ensemble import RandomForestClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.tree import DecisionTreeClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.ensemble import AdaBoostClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

# Load the raw table, then separate the label column ("NObeyesdad") from the
# remaining columns, which are used as features.
dataset_ = pd.read_csv("./Obesity.csv")
target = dataset_["NObeyesdad"].values
data = dataset_.drop(columns=["NObeyesdad"]).values

# Bundle both arrays under the "data" / "target" keys read by GetBestArm.
dataset = {"data": data, "target": target}

# Sanity check: (rows, columns) of the raw table, label column included.
print(dataset_.shape)

(2111, 21)


In [4]:
# Cell for quickly benchmarking the candidate models
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import accuracy_score, log_loss
from sklearn.base import clone

# Silence every warning for the rest of the session (no category filter is given).
warnings.filterwarnings("ignore")

# Candidate models ("arms"). The order matters: GetBestArm numbers the arms
# 1..len(model_list) by position in this list.
model_list = (
    # k-nearest neighbours, k = 5, 15, ..., 75
    [KNeighborsClassifier(n_neighbors=k) for k in range(5, 76, 10)]
    # logistic regression: penalty x fit_intercept x C
    # NOTE(review): scikit-learn documents C as ignored when no penalty is
    # applied, so the C=1 / C=2 variants with penalty="none" are presumably
    # duplicate arms -- confirm before interpreting their scores.
    + [
        LogisticRegression(penalty=penalty, fit_intercept=intercept, random_state=0, max_iter=100, C=c)
        for penalty in ("l2", "none")
        for intercept in (True, False)
        for c in (1, 2)
    ]
    # shallow random forests: split criterion x number of trees
    + [
        RandomForestClassifier(n_estimators=n_trees, criterion=criterion, max_depth=5, random_state=0)
        for criterion in ("gini", "entropy")
        for n_trees in (10, 20, 30, 50)
    ]
    # AdaBoost: learning rate x number of estimators
    + [
        AdaBoostClassifier(n_estimators=n_est, learning_rate=rate, random_state=0)
        for rate in (1.0, 0.1)
        for n_est in (10, 20, 30, 40)
    ]
)

def GetBestArm(model_list, n_ground_truth, dataset, test_size=0.3, random_state=None):
    """Find the model ("arm") with the lowest mean test cross-entropy.

    Every model in ``model_list`` is evaluated ``n_ground_truth`` times. Each
    evaluation draws a new random train/test split of ``dataset``, fits a
    clone of the model on the training part, scores the predicted class
    probabilities on the held-out part with ``log_loss`` and records the
    elapsed time of ``fit`` + ``predict_proba``.

    Parameters
    ----------
    model_list : sequence of estimators
        Classifiers exposing ``fit``, ``predict_proba`` and, once fitted,
        ``classes_``. Each one is cloned before fitting.
    n_ground_truth : int
        Number of random splits evaluated per model; must be at least 1.
    dataset : mapping
        Provides the features under ``"data"`` and the labels under ``"target"``.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int, numpy.random.RandomState or None, default None
        Source of the per-split seeds. ``None`` draws them from NumPy's global
        random state (not reproducible unless that state was seeded by the
        caller); an int or ``RandomState`` makes the whole run repeatable.

    Returns
    -------
    best_arm : integer
        1-based index of the model with the highest mean negated log-loss.
    Match_Index_to_Model : dict
        Maps each 1-based arm index to the corresponding entry of ``model_list``.
    cross_entropy_ : numpy.ndarray, shape (len(model_list), n_ground_truth)
        Negated log-loss of every run (higher is better).
    running_time_ : numpy.ndarray, shape (len(model_list), n_ground_truth)
        Seconds spent in ``fit`` + ``predict_proba`` for every run.

    Raises
    ------
    ValueError
        If ``model_list`` is empty or ``n_ground_truth`` is smaller than 1.
    """
    if len(model_list) == 0:
        raise ValueError("model_list must contain at least one model")
    if n_ground_truth < 1:
        raise ValueError("n_ground_truth must be at least 1")

    # Pick where the per-split seeds come from. The ``np.random`` module itself
    # is used when no seed is supplied, which keeps the historical behaviour.
    if random_state is None:
        seed_source = np.random
    elif isinstance(random_state, np.random.RandomState):
        seed_source = random_state
    else:
        seed_source = np.random.RandomState(random_state)

    # Arms are numbered from 1, in the order the models were supplied.
    Match_Index_to_Model = {ii + 1: model for ii, model in enumerate(model_list)}
    n_arms = len(Match_Index_to_Model)
    cross_entropy_ = np.zeros((n_arms, n_ground_truth))
    running_time_ = np.zeros((n_arms, n_ground_truth))

    for arm_index in range(1, n_arms + 1):
        for exp_index in tqdm(range(n_ground_truth)):
            # split the dataset with a different random seed on every run
            new_random_state = seed_source.randint(0, 2**31 - 1)
            X_train, X_test, Y_train, Y_test = train_test_split(
                dataset["data"], dataset["target"], test_size=test_size, random_state=new_random_state
            )

            # perf_counter is monotonic, so the measured interval cannot be
            # distorted by wall-clock adjustments.
            t1 = time.perf_counter()
            model = clone(Match_Index_to_Model[arm_index])
            model.fit(X_train, Y_train)
            y_test_predict_proba = model.predict_proba(X_test)
            t2 = time.perf_counter()

            # Passing the fitted class order keeps the probability columns
            # aligned with the labels even when a class is absent from Y_test.
            cross_entropy_[arm_index - 1, exp_index] = -log_loss(
                Y_test, y_test_predict_proba, labels=model.classes_
            )
            running_time_[arm_index - 1, exp_index] = t2 - t1

    cross_entropy_mean_ = np.mean(cross_entropy_, axis=1)
    running_time_mean_ = np.mean(running_time_, axis=1)
    # The scores are negated losses, so the best arm is the arg-max.
    best_arm = np.argmax(cross_entropy_mean_) + 1
    print(f"best arm is {best_arm}, best model is {model_list[best_arm-1].__str__()}")
    for ii, model in enumerate(model_list):
        print(f"{model.__str__()}, entropy {-cross_entropy_mean_[ii]}, running time {running_time_mean_[ii]}")
    return best_arm, Match_Index_to_Model, cross_entropy_, running_time_

# Benchmark every candidate on 500 random splits each.
best_arm, Match_Index_to_Model, cross_entropy_, running_time_ = GetBestArm(
    model_list=model_list, n_ground_truth=500, dataset=dataset
)

# Average each arm's runtime over its runs, then add the averages up across arms.
print(running_time_.mean(axis=1).sum())

"""
"""

100%|██████████| 500/500 [00:11<00:00, 43.61it/s]
100%|██████████| 500/500 [00:11<00:00, 42.60it/s]
100%|██████████| 500/500 [00:12<00:00, 40.65it/s]
100%|██████████| 500/500 [00:12<00:00, 39.01it/s]
100%|██████████| 500/500 [00:13<00:00, 37.36it/s]
100%|██████████| 500/500 [00:14<00:00, 35.15it/s]
100%|██████████| 500/500 [00:14<00:00, 35.17it/s]
100%|██████████| 500/500 [00:15<00:00, 33.15it/s]
100%|██████████| 500/500 [00:40<00:00, 12.33it/s]
100%|██████████| 500/500 [00:40<00:00, 12.26it/s]
100%|██████████| 500/500 [00:37<00:00, 13.40it/s]
100%|██████████| 500/500 [00:37<00:00, 13.45it/s]
100%|██████████| 500/500 [00:40<00:00, 12.31it/s]
100%|██████████| 500/500 [00:40<00:00, 12.26it/s]
100%|██████████| 500/500 [00:37<00:00, 13.35it/s]
100%|██████████| 500/500 [00:37<00:00, 13.42it/s]
100%|██████████| 500/500 [00:12<00:00, 40.83it/s]
100%|██████████| 500/500 [00:23<00:00, 21.74it/s]
100%|██████████| 500/500 [00:33<00:00, 14.81it/s]
100%|██████████| 500/500 [00:54<00:00,  9.10it/s]


best arm is 4, best model is KNeighborsClassifier(n_neighbors=35)
KNeighborsClassifier(), entropy 1.4282596677462256, running time 0.020610714435577392
KNeighborsClassifier(n_neighbors=15), entropy 0.8076407710572835, running time 0.021264644622802736
KNeighborsClassifier(n_neighbors=25), entropy 0.7334286480191543, running time 0.021996756076812743
KNeighborsClassifier(n_neighbors=35), entropy 0.7181393793749339, running time 0.023150825023651123
KNeighborsClassifier(n_neighbors=45), entropy 0.7345972331280052, running time 0.024248215675354005
KNeighborsClassifier(n_neighbors=55), entropy 0.7546210076750651, running time 0.02574489164352417
KNeighborsClassifier(n_neighbors=65), entropy 0.7758838082309837, running time 0.026076392650604248
KNeighborsClassifier(n_neighbors=75), entropy 0.7916558549203006, running time 0.027808285236358643
LogisticRegression(C=1, random_state=0), entropy 0.8616643147096255, running time 0.07862513828277588
LogisticRegression(C=2, random_state=0), entrop




'\n'