In [2]:
import numpy as np
import pandas as pd

from agent import *
from env import Env_Classifier_CrossEntropy
from utils import Experiment_Classiflier

import os
import warnings
from sklearn.neighbors import KNeighborsClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
from sklearn.linear_model import LogisticRegression  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
from sklearn.svm import SVC  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.ensemble import RandomForestClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.tree import DecisionTreeClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.ensemble import AdaBoostClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
from sklearn.neural_network import MLPClassifier # https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier
from sklearn.naive_bayes import GaussianNB # https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB
from xgboost import XGBClassifier

# Read the optdigits training file; header=None keeps the first row as data.
uci_hand_written_dataset = pd.read_csv("optdigits.tra", header=None)

# Every column except the last goes into `data`; the last column goes into `target`.
feature_columns = uci_hand_written_dataset.columns[:-1]
label_column = uci_hand_written_dataset.columns[-1]
data = uci_hand_written_dataset[feature_columns].values
target = uci_hand_written_dataset[label_column].values

# Bundle both arrays in one dict under the "data" / "target" keys.
dataset = {"data": data, "target": target}

# index = dataset["target"]==7
# dataset["target"][index] = 0
# print(f"number of label 7 is {np.sum(index)}")
# index = dataset["target"]==9
# dataset["target"][index] = 1
# print(f"number of label 9 is {np.sum(index)}")

In [5]:
# cell for quickly testing model
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import accuracy_score, log_loss
from sklearn.base import clone

# Install a blanket "ignore" filter for all warnings raised after this point.
warnings.filterwarnings("ignore")

# Candidate models ("arms"). A model's position in this list defines its arm
# order, so the four families are concatenated in a fixed sequence and each
# comprehension enumerates its grid in the order the arms are expected.
# NOTE(review): penalty="none" is the legacy string spelling; newer scikit-learn
# releases expect penalty=None instead -- confirm against the installed version.
# NOTE(review): C presumably has no effect when no penalty is applied, which
# would make each unpenalized C=1 / C=2 pair equivalent -- verify.
model_list = (
    # k-nearest neighbours: k = 5, 15, ..., 75
    [KNeighborsClassifier(n_neighbors=k) for k in range(5, 76, 10)]
    # logistic regression: penalty x fit_intercept x C
    + [
        LogisticRegression(penalty=penalty, fit_intercept=with_intercept, random_state=0, max_iter=100, C=c_value)
        for penalty in ("l2", "none")
        for with_intercept in (True, False)
        for c_value in (1, 2)
    ]
    # random forest: split criterion x number of trees, depth capped at 5
    + [
        RandomForestClassifier(n_estimators=n_trees, criterion=criterion, max_depth=5, random_state=0)
        for criterion in ("gini", "entropy")
        for n_trees in (10, 20, 30, 50)
    ]
    # AdaBoost: learning rate x number of estimators
    + [
        AdaBoostClassifier(n_estimators=n_rounds, learning_rate=rate, random_state=0)
        for rate in (1.0, 0.1)
        for n_rounds in (10, 20, 30, 40)
    ]
)

def GetBestArm(model_list, n_ground_truth, dataset, random_state=None):
    """Score every candidate model on repeated random splits and pick the best arm.

    Each model in ``model_list`` is treated as an arm (1-indexed). For every arm,
    ``n_ground_truth`` random 70/30 train/test splits are drawn; a fresh clone of
    the model is fitted on the training part and its predicted probabilities on
    the test part are scored with ``log_loss``. The stored score is the NEGATIVE
    log loss, so larger is better and the best arm is the argmax of the per-arm
    mean.

    Parameters
    ----------
    model_list : sequence of estimators
        Candidate models; each must support ``fit`` and ``predict_proba`` and be
        accepted by ``clone``.
    n_ground_truth : int
        Number of random splits evaluated per arm (must be >= 1).
    dataset : mapping
        Must provide ``dataset["data"]`` (features) and ``dataset["target"]``
        (labels).
    random_state : int or None, optional
        Seed for the generator that produces the per-split seeds. ``None``
        (default) keeps the previous behavior of drawing the split seeds from
        NumPy's global random state.

    Returns
    -------
    best_arm : int
        1-based index of the arm with the highest mean negative log loss.
    Match_Index_to_Model : dict
        Maps each 1-based arm index to its model from ``model_list``.
    cross_entropy_ : numpy.ndarray, shape (n_arms, n_ground_truth)
        Negative log loss of every arm on every split.
    running_time_ : numpy.ndarray, shape (n_arms, n_ground_truth)
        Seconds spent on clone + fit + predict_proba for every arm and split.

    Raises
    ------
    ValueError
        If ``model_list`` is empty or ``n_ground_truth`` is smaller than 1.
    """
    n_arms = len(model_list)
    if n_arms == 0:
        raise ValueError("model_list must contain at least one model")
    if n_ground_truth < 1:
        raise ValueError("n_ground_truth must be at least 1")

    # Arms are numbered from 1; row (arm_index - 1) of the result arrays belongs to that arm.
    Match_Index_to_Model = {ii + 1: model for ii, model in enumerate(model_list)}
    cross_entropy_ = np.zeros((n_arms, n_ground_truth))
    running_time_ = np.zeros((n_arms, n_ground_truth))

    # With no seed, fall back to the global NumPy RNG exactly as before;
    # with a seed, the whole sequence of split seeds becomes reproducible.
    seed_source = np.random if random_state is None else np.random.RandomState(random_state)

    for arm_index, arm_model in Match_Index_to_Model.items():
        for exp_index in tqdm(range(n_ground_truth), desc=f"arm {arm_index}/{n_arms}"):
            # split the dataset with a different random seed for every experiment
            new_random_state = seed_source.randint(0, 2**31 - 1)
            X_train, X_test, Y_train, Y_test = train_test_split(
                dataset["data"], dataset["target"], test_size=0.3, random_state=new_random_state
            )

            # perf_counter is monotonic, so the measured interval cannot be
            # distorted by system clock adjustments.
            t1 = time.perf_counter()
            model = clone(arm_model)
            model.fit(X_train, Y_train)
            y_test_predict_proba = model.predict_proba(X_test)
            t2 = time.perf_counter()

            # Pass the classes the model was fitted on so the probability columns
            # stay aligned even when a test split happens to miss a class.
            cross_entropy_[arm_index - 1, exp_index] = -log_loss(
                Y_test, y_test_predict_proba, labels=getattr(model, "classes_", None)
            )
            running_time_[arm_index - 1, exp_index] = t2 - t1

    cross_entropy_mean_ = np.mean(cross_entropy_, axis=1)
    running_time_mean_ = np.mean(running_time_, axis=1)
    best_arm = np.argmax(cross_entropy_mean_) + 1
    print(f"best arm is {best_arm}, best model is {model_list[best_arm - 1]!s}")
    for ii, model in enumerate(model_list):
        # The sign is flipped back so the printed value is the plain (positive) log loss.
        print(f"{model!s}, entropy {-cross_entropy_mean_[ii]}, running time {running_time_mean_[ii]}")
    return best_arm, Match_Index_to_Model, cross_entropy_, running_time_

# Number of random train/test splits evaluated for every arm.
N_GROUND_TRUTH = 500

best_arm, Match_Index_to_Model, cross_entropy_, running_time_ = GetBestArm(
    dataset=dataset,
    model_list=model_list,
    n_ground_truth=N_GROUND_TRUTH,
)

# Average each arm's running time over its splits, then add up the per-arm averages.
print(running_time_.mean(axis=1).sum())

100%|██████████| 500/500 [01:07<00:00,  7.42it/s]
100%|██████████| 500/500 [01:08<00:00,  7.25it/s]
100%|██████████| 500/500 [01:09<00:00,  7.16it/s]
100%|██████████| 500/500 [01:11<00:00,  6.99it/s]
100%|██████████| 500/500 [01:11<00:00,  6.98it/s]
100%|██████████| 500/500 [01:13<00:00,  6.80it/s]
100%|██████████| 500/500 [01:14<00:00,  6.75it/s]
100%|██████████| 500/500 [01:15<00:00,  6.66it/s]
100%|██████████| 500/500 [05:30<00:00,  1.51it/s]
100%|██████████| 500/500 [05:32<00:00,  1.50it/s]
100%|██████████| 500/500 [05:14<00:00,  1.59it/s]
100%|██████████| 500/500 [05:13<00:00,  1.60it/s]
100%|██████████| 500/500 [04:47<00:00,  1.74it/s]
100%|██████████| 500/500 [04:51<00:00,  1.72it/s]
100%|██████████| 500/500 [04:30<00:00,  1.85it/s]
100%|██████████| 500/500 [04:31<00:00,  1.84it/s]
100%|██████████| 500/500 [00:27<00:00, 18.19it/s]
100%|██████████| 500/500 [00:51<00:00,  9.75it/s]
100%|██████████| 500/500 [01:16<00:00,  6.54it/s]
100%|██████████| 500/500 [02:03<00:00,  4.05it/s]


best arm is 3, best model is KNeighborsClassifier(n_neighbors=25)
KNeighborsClassifier(), entropy 0.17309136984299073, running time 0.12401463842391967
KNeighborsClassifier(n_neighbors=15), entropy 0.11820580348558951, running time 0.12774243307113647
KNeighborsClassifier(n_neighbors=25), entropy 0.10913544354129512, running time 0.12945653581619262
KNeighborsClassifier(n_neighbors=35), entropy 0.12525639822818352, running time 0.13375097322463989
KNeighborsClassifier(n_neighbors=45), entropy 0.13989013353223156, running time 0.13484599542617798
KNeighborsClassifier(n_neighbors=55), entropy 0.15967655394930938, running time 0.1396622338294983
KNeighborsClassifier(n_neighbors=65), entropy 0.17401579813966356, running time 0.14140053176879883
KNeighborsClassifier(n_neighbors=75), entropy 0.19083425337391285, running time 0.14379663133621215
LogisticRegression(C=1, random_state=0), entropy 0.19028056472997606, running time 0.6509899697303771
LogisticRegression(C=2, random_state=0), entrop




"\nbest arm is 3, best model is KNeighborsClassifier(n_neighbors=25)\nKNeighborsClassifier(), entropy 0.18467507755926565, running time 0.11919330835342407\nKNeighborsClassifier(n_neighbors=15), entropy 0.12356718310822404, running time 0.12370215892791749\nKNeighborsClassifier(n_neighbors=25), entropy 0.11127790999967598, running time 0.12634235382080078\nKNeighborsClassifier(n_neighbors=35), entropy 0.12283831429212935, running time 0.12813546180725097\nKNeighborsClassifier(n_neighbors=45), entropy 0.1394320152265807, running time 0.13292078495025636\nKNeighborsClassifier(n_neighbors=55), entropy 0.15795143035350562, running time 0.13462753295898439\nKNeighborsClassifier(n_neighbors=65), entropy 0.17143417621781118, running time 0.13891361474990846\nKNeighborsClassifier(n_neighbors=75), entropy 0.18909514857921017, running time 0.13817764759063722\nDecisionTreeClassifier(random_state=0), entropy 3.8284743076276846, running time 0.042206389904022215\nDecisionTreeClassifier(criterion='