In [2]:
import numpy as np
import pandas as pd

from agent import *
from env import Env_Classifier_CrossEntropy
from utils import Experiment_Classiflier

import os
import warnings
from sklearn.neighbors import KNeighborsClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
from sklearn.linear_model import LogisticRegression  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
from sklearn.svm import SVC  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.ensemble import RandomForestClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.tree import DecisionTreeClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.ensemble import AdaBoostClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

# Read the Arcene table; the last column is the label, everything before it is a feature.
dataset_ = pd.read_csv("./Arcene.csv")
data = dataset_.iloc[:, :-1].values
target = dataset_.iloc[:, -1].values

# Bundle the feature matrix and label vector under the "data" / "target" keys.
dataset = {"data": data, "target": target}

# Report the class balance. `index` is a boolean mask over the samples and is
# left holding the mask of the label checked last (1).
index = target == -1
print(f"number of label -1 is {np.sum(index)}")
index = target == 1
print(f"number of label 1 is {np.sum(index)}")

number of label -1 is 112
number of label 1 is 88


In [5]:
# cell for quickly testing model
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import accuracy_score, log_loss
from sklearn.base import clone

# Install a process-wide "ignore" filter: no warning is shown from here on.
warnings.filterwarnings("ignore")

# Candidate models, in a fixed order (position in this list identifies the model):
# 8 k-NN, 8 logistic regression, 8 random forest, 8 AdaBoost.
# In each group the first `for` is the slowest-varying hyper-parameter.
# NOTE(review): newer scikit-learn releases spell the unpenalized option as
# `penalty=None` instead of the string "none" — confirm against the installed
# version before changing it. Presumably C has no effect on the unpenalized
# variants, which would make those pairs duplicates; verify.
model_list = [
    # k-nearest neighbours, k = 5, 15, ..., 75
    *(KNeighborsClassifier(n_neighbors=k) for k in range(5, 76, 10)),
    # logistic regression: penalty x fit_intercept x C
    *(
        LogisticRegression(penalty=penalty, fit_intercept=intercept, random_state=0, max_iter=100, C=c)
        for penalty in ("l2", "none")
        for intercept in (True, False)
        for c in (1, 2)
    ),
    # random forest (depth 5): split criterion x number of trees
    *(
        RandomForestClassifier(n_estimators=n_trees, criterion=criterion, max_depth=5, random_state=0)
        for criterion in ("gini", "entropy")
        for n_trees in (10, 20, 30, 50)
    ),
    # AdaBoost: learning rate x number of estimators
    *(
        AdaBoostClassifier(n_estimators=n_rounds, learning_rate=rate, random_state=0)
        for rate in (1.0, 0.1)
        for n_rounds in (10, 20, 30, 40)
    ),
]

def GetBestArm(model_list, n_ground_truth, dataset, seed=None):
    """Estimate every model's held-out cross-entropy and pick the best one.

    Each entry of ``model_list`` is treated as an "arm", numbered from 1.
    Every arm is evaluated ``n_ground_truth`` times: the data are re-split
    (30% held out) with a freshly drawn split seed, an unfitted clone of the
    model is trained on the training part, and the *negated* log-loss of its
    predicted probabilities on the held-out part is recorded together with
    the elapsed fit + predict time.

    Parameters
    ----------
    model_list : sequence
        Estimator prototypes providing ``fit``, ``predict_proba`` and, once
        fitted, ``classes_``. They are cloned for every run and never fitted
        in place.
    n_ground_truth : int
        Number of random splits evaluated per arm; must be at least 1.
    dataset : mapping
        Holds the feature matrix under ``"data"`` and the labels under
        ``"target"``.
    seed : int or None, optional
        ``None`` (default) draws the split seeds from NumPy's global random
        state, exactly as before. Any other value seeds a private
        ``RandomState`` so that the sequence of splits is repeatable.

    Returns
    -------
    best_arm : int
        1-based index of the arm with the highest mean negated log-loss
        (i.e. the lowest mean log-loss).
    Match_Index_to_Model : dict
        Maps each 1-based arm index to its prototype from ``model_list``.
    cross_entropy_ : numpy.ndarray, shape (n_arms, n_ground_truth)
        Negated log-loss of every run.
    running_time_ : numpy.ndarray, shape (n_arms, n_ground_truth)
        Fit + predict duration of every run, in seconds.

    Raises
    ------
    ValueError
        If ``model_list`` is empty or ``n_ground_truth`` is smaller than 1.
    """
    # 1-based arm index -> model prototype.
    Match_Index_to_Model = dict(enumerate(model_list, start=1))
    n_arms = len(Match_Index_to_Model)
    if n_arms == 0:
        raise ValueError("model_list must contain at least one model")
    if n_ground_truth < 1:
        raise ValueError("n_ground_truth must be at least 1")

    # The np.random module and RandomState both expose randint(), so the
    # default path consumes the global stream just like the original code.
    rng = np.random if seed is None else np.random.RandomState(seed)

    cross_entropy_ = np.zeros((n_arms, n_ground_truth))
    running_time_ = np.zeros((n_arms, n_ground_truth))
    for arm_index, prototype in Match_Index_to_Model.items():
        for exp_index in tqdm(range(n_ground_truth), desc=f"arm {arm_index}/{n_arms}"):
            # split the dataset with a different random seed on every run
            new_random_state = rng.randint(0, 2**31 - 1)
            X_train, X_test, Y_train, Y_test = train_test_split(
                dataset["data"], dataset["target"], test_size=0.3, random_state=new_random_state
            )

            # perf_counter is the monotonic, high-resolution clock meant for
            # measuring durations; time.time() can jump with the wall clock.
            t1 = time.perf_counter()
            model = clone(prototype)
            model.fit(X_train, Y_train)
            y_test_predict_proba = model.predict_proba(X_test)
            t2 = time.perf_counter()

            # Pass the fitted class order explicitly: the probability columns
            # follow model.classes_, and log_loss would otherwise infer the
            # labels from Y_test and fail when a split holds out one class only.
            cross_entropy_[arm_index - 1, exp_index] = -log_loss(
                Y_test, y_test_predict_proba, labels=model.classes_
            )
            running_time_[arm_index - 1, exp_index] = t2 - t1

    cross_entropy_mean_ = np.mean(cross_entropy_, axis=1)
    running_time_mean_ = np.mean(running_time_, axis=1)
    # Scores are negated losses, so the largest mean is the best arm.
    best_arm = np.argmax(cross_entropy_mean_) + 1
    print(f"best arm is {best_arm}, best model is {model_list[best_arm-1].__str__()}")
    for ii, model in enumerate(model_list):
        # Flip the sign back so the report shows the usual (positive) log-loss.
        print(f"{model.__str__()}, entropy {-cross_entropy_mean_[ii]}, running time {running_time_mean_[ii]}")
    return best_arm, Match_Index_to_Model, cross_entropy_, running_time_

# Benchmark every candidate model on 500 random splits each and keep the raw
# per-run scores and timings for later inspection.
best_arm, Match_Index_to_Model, cross_entropy_, running_time_ = GetBestArm(
    model_list=model_list, 
    n_ground_truth=500, 
    dataset=dataset
)
# Sum over arms of the mean fit + predict time per run.
print(np.sum(np.mean(running_time_, axis=1)))

100%|██████████| 500/500 [00:24<00:00, 20.51it/s]
100%|██████████| 500/500 [00:24<00:00, 20.13it/s]
100%|██████████| 500/500 [00:25<00:00, 19.96it/s]
100%|██████████| 500/500 [00:25<00:00, 19.71it/s]
100%|██████████| 500/500 [00:24<00:00, 20.30it/s]
100%|██████████| 500/500 [00:25<00:00, 19.47it/s]
100%|██████████| 500/500 [00:26<00:00, 19.02it/s]
100%|██████████| 500/500 [00:25<00:00, 19.63it/s]
100%|██████████| 500/500 [04:00<00:00,  2.08it/s]
100%|██████████| 500/500 [04:01<00:00,  2.07it/s]
100%|██████████| 500/500 [03:59<00:00,  2.09it/s]
100%|██████████| 500/500 [03:56<00:00,  2.11it/s]
100%|██████████| 500/500 [02:01<00:00,  4.10it/s]
100%|██████████| 500/500 [02:02<00:00,  4.07it/s]
100%|██████████| 500/500 [02:04<00:00,  4.03it/s]
100%|██████████| 500/500 [02:02<00:00,  4.08it/s]
100%|██████████| 500/500 [00:33<00:00, 14.96it/s]
100%|██████████| 500/500 [00:54<00:00,  9.11it/s]
100%|██████████| 500/500 [01:15<00:00,  6.64it/s]
100%|██████████| 500/500 [01:56<00:00,  4.30it/s]


best arm is 20, best model is RandomForestClassifier(max_depth=5, n_estimators=50, random_state=0)
KNeighborsClassifier(), entropy 0.8573784438836206, running time 0.02016094970703125
KNeighborsClassifier(n_neighbors=15), entropy 0.48823275431585744, running time 0.02092872667312622
KNeighborsClassifier(n_neighbors=25), entropy 0.5653357645218335, running time 0.02131120681762695
KNeighborsClassifier(n_neighbors=35), entropy 0.6154464899710326, running time 0.02155689764022827
KNeighborsClassifier(n_neighbors=45), entropy 0.6312098161154652, running time 0.020849319458007814
KNeighborsClassifier(n_neighbors=55), entropy 0.6416978484210916, running time 0.02188170623779297
KNeighborsClassifier(n_neighbors=65), entropy 0.6525717378145706, running time 0.02233941602706909
KNeighborsClassifier(n_neighbors=75), entropy 0.6429523214170524, running time 0.02245058298110962
LogisticRegression(C=1, random_state=0), entropy 0.6942695875865583, running time 0.44920317220687866
LogisticRegression(




'\n'