In [2]:
import numpy as np
import pandas as pd

from agent import *
from env import Env_Classifier_CrossEntropy
from utils import Experiment_Classiflier

import os
import warnings
from sklearn.neighbors import KNeighborsClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
from sklearn.linear_model import LogisticRegression  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
from sklearn.svm import SVC  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.ensemble import RandomForestClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.tree import DecisionTreeClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.ensemble import AdaBoostClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

# Read the Madelon table; all columns except the last become `data`,
# the last column becomes `target`.
madelon = pd.read_csv("./madelon.csv")
feature_columns = madelon.columns[:-1]
label_column = madelon.columns[-1]
data = madelon[feature_columns].values
target = madelon[label_column].values
dataset = {"data": data, "target": target}

# Print how many rows carry each of the two label values.
for label in (-1, 1):
    index = dataset["target"] == label
    print(f"number of label {label} is {np.sum(index)}")

number of label -1 is 1000
number of label 1 is 1000


In [5]:
# cell for quickly testing model
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import accuracy_score, log_loss
from sklearn.base import clone

# Silence every warning raised from here on (this is a blanket filter).
warnings.filterwarnings("ignore")

# Candidate classifiers, in a fixed order: 8 k-NN, 8 logistic regressions,
# 8 random forests, 8 AdaBoost ensembles. The position in this list is what
# identifies a model downstream, so the order of the loops below matters.
model_list = [
    # k-NN with k = 5, 15, ..., 75
    *(KNeighborsClassifier(n_neighbors=k) for k in range(5, 76, 10)),

    # Logistic regression: penalty x fit_intercept x C
    # NOTE(review): scikit-learn documents C as having no effect when no penalty
    # is applied, so the C=1 / C=2 variants with penalty="none" are presumably
    # the same model -- confirm against the installed version.
    *(
        LogisticRegression(penalty=penalty, fit_intercept=fit_intercept, random_state=0, max_iter=100, C=c)
        for penalty in ("l2", "none")
        for fit_intercept in (True, False)
        for c in (1, 2)
    ),

    # Random forest: split criterion x number of trees (depth capped at 5)
    *(
        RandomForestClassifier(n_estimators=n_trees, criterion=criterion, max_depth=5, random_state=0)
        for criterion in ("gini", "entropy")
        for n_trees in (10, 20, 30, 50)
    ),

    # AdaBoost: learning rate x number of estimators
    *(
        AdaBoostClassifier(n_estimators=n_rounds, learning_rate=rate, random_state=0)
        for rate in (1.0, 0.1)
        for n_rounds in (10, 20, 30, 40)
    ),
]

def GetBestArm(model_list, n_ground_truth, dataset, random_state=None):
    """Benchmark every candidate model and report the one with the lowest mean log-loss.

    Each model ("arm") is cloned, fitted and scored ``n_ground_truth`` times,
    every time on a fresh random 70/30 train/test split of ``dataset``. Arms are
    numbered from 1 in the order they appear in ``model_list``. A summary line
    per model is printed before returning.

    Parameters
    ----------
    model_list : sequence of estimators
        Classifiers that work with ``clone`` and expose ``fit`` and
        ``predict_proba``. Only clones are fitted; the entries themselves are
        left untouched.
    n_ground_truth : int
        Number of random splits evaluated per arm; must be at least 1.
    dataset : mapping
        Must provide ``dataset["data"]`` and ``dataset["target"]``.
    random_state : int or None, optional
        Seed for the sequence of split seeds. ``None`` (the default) draws them
        from NumPy's global random state, exactly as before, so results differ
        between runs unless the caller seeded that global state.

    Returns
    -------
    best_arm : integer
        1-based index of the arm with the highest mean negative log-loss.
    Match_Index_to_Model : dict
        Maps each 1-based arm index to the corresponding entry of ``model_list``.
    cross_entropy_ : numpy.ndarray, shape (n_arms, n_ground_truth)
        Negative log-loss of every run (larger is better).
    running_time_ : numpy.ndarray, shape (n_arms, n_ground_truth)
        Seconds spent on clone + fit + predict_proba for every run.

    Raises
    ------
    ValueError
        If ``model_list`` is empty or ``n_ground_truth`` is smaller than 1.
    """
    # Fail early with a clear message instead of an argmax error on an empty
    # array or NaN means when no run is performed.
    if len(model_list) == 0:
        raise ValueError("model_list must contain at least one model")
    if n_ground_truth < 1:
        raise ValueError("n_ground_truth must be at least 1")

    Match_Index_to_Model = dict(enumerate(model_list, start=1))
    n_arms = len(Match_Index_to_Model)
    cross_entropy_ = np.zeros((n_arms, n_ground_truth))
    running_time_ = np.zeros((n_arms, n_ground_truth))

    # A private generator makes the run reproducible when a seed is supplied;
    # otherwise keep using the global NumPy state like the original code did.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    for arm_index, prototype in Match_Index_to_Model.items():
        for exp_index in tqdm(range(n_ground_truth)):
            # split the dataset with a different random seed on every run
            new_random_state = rng.randint(0, 2**31 - 1)
            X_train, X_test, Y_train, Y_test = train_test_split(
                dataset["data"], dataset["target"], test_size=0.3, random_state=new_random_state
            )

            # perf_counter is monotonic and high-resolution, unlike time.time().
            t1 = time.perf_counter()
            model = clone(prototype)
            model.fit(X_train, Y_train)
            y_test_predict_proba = model.predict_proba(X_test)
            t2 = time.perf_counter()

            # Pass the fitted class order explicitly so scoring still works when a
            # test split happens to miss one of the classes.
            cross_entropy_[arm_index - 1, exp_index] = -log_loss(
                Y_test, y_test_predict_proba, labels=model.classes_
            )
            running_time_[arm_index - 1, exp_index] = t2 - t1

    cross_entropy_mean_ = np.mean(cross_entropy_, axis=1)
    running_time_mean_ = np.mean(running_time_, axis=1)
    # Scores are negated losses, so the best arm is the arg-max.
    best_arm = np.argmax(cross_entropy_mean_) + 1
    print(f"best arm is {best_arm}, best model is {model_list[best_arm-1].__str__()}")
    for ii, model in enumerate(model_list):
        print(f"{model.__str__()}, entropy {-cross_entropy_mean_[ii]}, running time {running_time_mean_[ii]}")
    return best_arm, Match_Index_to_Model, cross_entropy_, running_time_

# Evaluate every candidate on 500 random splits, then print the sum over all
# arms of the mean per-run running time.
best_arm, Match_Index_to_Model, cross_entropy_, running_time_ = GetBestArm(model_list, 500, dataset)
print(np.mean(running_time_, axis=1).sum())

"""
"""

100%|██████████| 500/500 [00:35<00:00, 14.13it/s]
100%|██████████| 500/500 [00:37<00:00, 13.39it/s]
100%|██████████| 500/500 [00:36<00:00, 13.64it/s]
100%|██████████| 500/500 [00:37<00:00, 13.33it/s]
100%|██████████| 500/500 [00:38<00:00, 12.99it/s]
100%|██████████| 500/500 [00:39<00:00, 12.51it/s]
100%|██████████| 500/500 [00:40<00:00, 12.31it/s]
100%|██████████| 500/500 [00:41<00:00, 12.01it/s]
100%|██████████| 500/500 [01:18<00:00,  6.38it/s]
100%|██████████| 500/500 [01:16<00:00,  6.50it/s]
100%|██████████| 500/500 [01:14<00:00,  6.71it/s]
100%|██████████| 500/500 [01:16<00:00,  6.51it/s]
100%|██████████| 500/500 [01:17<00:00,  6.45it/s]
100%|██████████| 500/500 [01:18<00:00,  6.37it/s]
100%|██████████| 500/500 [01:16<00:00,  6.56it/s]
100%|██████████| 500/500 [01:17<00:00,  6.47it/s]
100%|██████████| 500/500 [00:52<00:00,  9.49it/s]
100%|██████████| 500/500 [01:38<00:00,  5.10it/s]
100%|██████████| 500/500 [02:22<00:00,  3.50it/s]
100%|██████████| 500/500 [03:53<00:00,  2.14it/s]


best arm is 2, best model is KNeighborsClassifier(n_neighbors=15)
KNeighborsClassifier(), entropy 1.1990018623740697, running time 0.05369347286224365
KNeighborsClassifier(n_neighbors=15), entropy 0.5516399321157754, running time 0.055459813594818115
KNeighborsClassifier(n_neighbors=25), entropy 0.5554227256481399, running time 0.056111196041107175
KNeighborsClassifier(n_neighbors=35), entropy 0.5667594163036422, running time 0.05826449203491211
KNeighborsClassifier(n_neighbors=45), entropy 0.5744467092859586, running time 0.05991172885894775
KNeighborsClassifier(n_neighbors=55), entropy 0.5843701947646704, running time 0.06277630281448364
KNeighborsClassifier(n_neighbors=65), entropy 0.5913861129310771, running time 0.06348921918869019
KNeighborsClassifier(n_neighbors=75), entropy 0.5981026251609437, running time 0.06583350896835327
LogisticRegression(C=1, random_state=0), entropy 1.0090203212077722, running time 0.140706374168396
LogisticRegression(C=2, random_state=0), entropy 1.020




'\n'