In [2]:
import numpy as np
import pandas as pd

from agent import *
from env import Env_Classifier_CrossEntropy
from utils import Experiment_Classiflier

import os
import warnings
from sklearn.neighbors import KNeighborsClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
from sklearn.linear_model import LogisticRegression  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
from sklearn.svm import SVC  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.ensemble import RandomForestClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.tree import DecisionTreeClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.ensemble import AdaBoostClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

# Load the 3-vs-8 MNIST CSV and keep only the first quarter of its rows.
mnist_dataset_3_8 = pd.read_csv("./mnist-train-3-8.csv")
mnist_dataset_3_8 = mnist_dataset_3_8[0:mnist_dataset_3_8.shape[0]//4]

# Every column after the first one is used as a feature.
# NOTE(review): this assumes "label" is the first column of the CSV - confirm.
data = mnist_dataset_3_8[mnist_dataset_3_8.columns[1:]].values

# Take an explicit, writable copy of the labels. The labels are rewritten in
# place below; `Series.values` may hand back a view of the DataFrame (so the
# rewrite would leak into `mnist_dataset_3_8`) or, with pandas Copy-on-Write,
# a read-only array on which the assignment raises.
target = mnist_dataset_3_8["label"].to_numpy(copy=True)

dataset = dict()
dataset["data"] = data
dataset["target"] = target

# Re-encode the two digit classes as binary labels: 3 -> 0 and 8 -> 1.
# The new codes (0, 1) never collide with the old ones (3, 8), so doing the
# two replacements one after the other is safe.
index = dataset["target"]==3
dataset["target"][index] = 0
print(f"number of label 3 is {np.sum(index)}")
index = dataset["target"]==8
dataset["target"][index] = 1
print(f"number of label 8 is {np.sum(index)}")

number of label 3 is 1086
number of label 8 is 1017


In [5]:
# cell for quickly testing model
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import accuracy_score, log_loss
from sklearn.base import clone

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides any warnings the estimators below may emit
# while fitting - consider narrowing the filter.
warnings.filterwarnings("ignore")

# Candidate models ("arms"), generated per estimator family. The order of the
# entries matters: downstream code identifies an arm by its 1-based position.
model_list = [
    # k-nearest neighbours with k = 5, 15, ..., 75
    *(KNeighborsClassifier(n_neighbors=k) for k in range(5, 76, 10)),

    # logistic regression: penalty x fit_intercept x C
    # NOTE(review): newer scikit-learn releases spell the unpenalised option
    # as `penalty=None`; confirm against the installed version. Presumably C
    # has no effect without a penalty, which would make the C=1 / C=2
    # variants duplicates - verify.
    *(
        LogisticRegression(penalty=penalty, fit_intercept=fit_intercept, random_state=0, max_iter=100, C=c)
        for penalty in ("l2", "none")
        for fit_intercept in (True, False)
        for c in (1, 2)
    ),

    # shallow random forests: split criterion x number of trees
    *(
        RandomForestClassifier(n_estimators=n_trees, criterion=criterion, max_depth=5, random_state=0)
        for criterion in ("gini", "entropy")
        for n_trees in (10, 20, 30, 50)
    ),

    # AdaBoost: learning rate x number of estimators
    *(
        AdaBoostClassifier(n_estimators=n_rounds, learning_rate=rate, random_state=0)
        for rate in (1.0, 0.1)
        for n_rounds in (10, 20, 30, 40)
    ),
]

def GetBestArm(model_list, n_ground_truth, dataset, random_state=None):
    """Estimate every model's mean test log loss and report the best model.

    Each model ("arm") is cloned, fitted and scored `n_ground_truth` times,
    every time on a fresh random 70/30 train/test split of `dataset`. The arm
    with the lowest mean log loss is reported as the best one.

    Parameters
    ----------
    model_list : sequence of estimators
        Unfitted models exposing `fit` and `predict_proba`. Arm ``i`` (1-based)
        corresponds to ``model_list[i - 1]``.
    n_ground_truth : int
        Number of random splits evaluated per model; must be at least 1.
    dataset : mapping
        Must provide the features under ``"data"`` and the labels under
        ``"target"``.
    random_state : None, int or numpy.random.RandomState, optional
        Source of the per-split seeds. ``None`` (default) keeps the previous
        behaviour of drawing them from NumPy's global random state; an int or
        a ``RandomState`` makes the sequence of split seeds reproducible.

    Returns
    -------
    best_arm : int
        1-based index of the model with the lowest mean log loss.
    Match_Index_to_Model : dict
        Maps each 1-based arm index to its (unfitted) model from `model_list`.
    cross_entropy_ : numpy.ndarray, shape (n_models, n_ground_truth)
        NEGATED log loss of every run (higher is better), despite the name.
    running_time_ : numpy.ndarray, shape (n_models, n_ground_truth)
        Seconds spent on clone + fit + predict_proba for every run.

    Raises
    ------
    ValueError
        If `model_list` is empty or `n_ground_truth` is smaller than 1.
    """
    # Fail early with a clear message instead of an opaque argmax error
    # (empty model list) or silently returning NaN means (zero repetitions).
    if len(model_list) == 0:
        raise ValueError("model_list must contain at least one model")
    if n_ground_truth < 1:
        raise ValueError("n_ground_truth must be at least 1")

    # Pick the generator that produces the per-split seeds.
    if random_state is None:
        rng = np.random  # global state, identical to the previous behaviour
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    Match_Index_to_Model = {ii + 1: model for ii, model in enumerate(model_list)}
    n_arms = len(Match_Index_to_Model)
    cross_entropy_ = np.zeros((n_arms, n_ground_truth))
    running_time_ = np.zeros((n_arms, n_ground_truth))
    for arm_index in range(1, n_arms + 1):
        for exp_index in tqdm(range(n_ground_truth)):
            # split the dataset with a different random seed on every run
            new_random_state = rng.randint(0, 2**31 - 1)
            X_train, X_test, Y_train, Y_test = train_test_split(dataset["data"], dataset["target"], test_size=0.3, random_state=new_random_state)

            # perf_counter is the clock meant for measuring elapsed time;
            # time.time() can jump when the system clock is adjusted.
            t1 = time.perf_counter()
            # clone so that the caller's model objects are never fitted
            model = clone(Match_Index_to_Model[arm_index])
            model.fit(X_train, Y_train)
            y_test_predict_proba = model.predict_proba(X_test)
            t2 = time.perf_counter()

            # store the negated loss so that "larger is better" (a reward)
            cross_entropy_[arm_index - 1, exp_index] = -log_loss(Y_test, y_test_predict_proba)
            running_time_[arm_index - 1, exp_index] = t2 - t1

    cross_entropy_mean_ = np.mean(cross_entropy_, axis=1)
    running_time_mean_ = np.mean(running_time_, axis=1)
    # argmax of the negated loss == arm with the smallest mean log loss
    best_arm = np.argmax(cross_entropy_mean_) + 1
    print(f"best arm is {best_arm}, best model is {model_list[best_arm-1].__str__()}")
    for ii, model in enumerate(model_list):
        # the sign is flipped back so the printed value is the plain log loss
        print(f"{model.__str__()}, entropy {-cross_entropy_mean_[ii]}, running time {running_time_mean_[ii]}")
    return best_arm, Match_Index_to_Model, cross_entropy_, running_time_

# Score every candidate in `model_list` on 500 random splits per model.
# This is the expensive step of the notebook: each model is refitted 500 times.
best_arm, Match_Index_to_Model, cross_entropy_, running_time_ = GetBestArm(
    model_list=model_list,
    n_ground_truth=500,
    dataset=dataset,
)

# Sum, over all models, of the mean per-run running time (in seconds).
print(np.sum(np.mean(running_time_, axis=1)))

100%|██████████| 500/500 [00:45<00:00, 11.10it/s]
100%|██████████| 500/500 [00:46<00:00, 10.84it/s]
100%|██████████| 500/500 [00:46<00:00, 10.79it/s]
100%|██████████| 500/500 [00:46<00:00, 10.72it/s]
100%|██████████| 500/500 [00:49<00:00, 10.20it/s]
100%|██████████| 500/500 [00:51<00:00,  9.75it/s]
100%|██████████| 500/500 [00:52<00:00,  9.56it/s]
100%|██████████| 500/500 [00:52<00:00,  9.61it/s]
100%|██████████| 500/500 [01:30<00:00,  5.53it/s]
100%|██████████| 500/500 [01:33<00:00,  5.37it/s]
100%|██████████| 500/500 [01:29<00:00,  5.57it/s]
100%|██████████| 500/500 [01:31<00:00,  5.47it/s]
100%|██████████| 500/500 [00:58<00:00,  8.53it/s]
100%|██████████| 500/500 [00:57<00:00,  8.68it/s]
100%|██████████| 500/500 [00:58<00:00,  8.48it/s]
100%|██████████| 500/500 [00:57<00:00,  8.74it/s]
100%|██████████| 500/500 [00:37<00:00, 13.29it/s]
100%|██████████| 500/500 [01:07<00:00,  7.45it/s]
100%|██████████| 500/500 [01:35<00:00,  5.22it/s]
100%|██████████| 500/500 [02:34<00:00,  3.23it/s]


best arm is 2, best model is KNeighborsClassifier(n_neighbors=15)
KNeighborsClassifier(), entropy 0.1837041642931709, running time 0.06561910915374757
KNeighborsClassifier(n_neighbors=15), entropy 0.10503985055627499, running time 0.06835254383087158
KNeighborsClassifier(n_neighbors=25), entropy 0.11485629153777194, running time 0.0688281626701355
KNeighborsClassifier(n_neighbors=35), entropy 0.1296782155080673, running time 0.07092141914367676
KNeighborsClassifier(n_neighbors=45), entropy 0.14110107640672898, running time 0.07319995641708374
KNeighborsClassifier(n_neighbors=55), entropy 0.15225645238830954, running time 0.07696837854385376
KNeighborsClassifier(n_neighbors=65), entropy 0.16139769517910485, running time 0.077932626247406
KNeighborsClassifier(n_neighbors=75), entropy 0.17108689239767189, running time 0.07909664201736451
LogisticRegression(C=1, random_state=0), entropy 0.774909640149267, running time 0.15779634141921997
LogisticRegression(C=2, random_state=0), entropy 0.8




'\n'