In [1]:
%matplotlib inline
from matplotlib import pyplot as plt

import numpy as np
import joblib
import pandas as pd
from collections import OrderedDict

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold,KFold,RandomizedSearchCV
from sklearn.metrics import roc_auc_score,accuracy_score,roc_curve,precision_score,average_precision_score
from sklearn.datasets import fetch_covtype, load_svmlight_file
from sklearn.base import clone
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.kernel_ridge import KernelRidge

from tqdm import tqdm


import datetime
import sys
import os

import pathlib

import KTBoost.KTBoost as KTBoost


In [5]:
sys.path.append("../bitboost/python/bitboost")
import bitboost

Exception: BitBoost library could not be located

In [30]:
sys.path.append("../infiniteboost/research")
from SparseInfiniteBoosting import InfiniteBoosting

In [31]:
allDataSetsPaths = []
dataSetName = "classification_datasets"
for file in os.listdir(f"../{dataSetName}"):
    if file.endswith(".csv"):
        allDataSetsPaths.append(os.path.join(f"../{dataSetName}", file))

In [32]:
def getBadLabel(data):
    TH_to_other = 10
    countSeries = data.iloc[:,-1].value_counts()
    badLabels = countSeries[countSeries< TH_to_other].index
    return badLabels

def getDataFromPath(path):
    data = pd.read_csv(allDataSetsPaths[12])
    data = data.dropna()
    badLabels = getBadLabel(data)
    data.iloc[:,-1] = data.iloc[:,-1].apply(lambda x : "ohter" if x in list(badLabels) else x)

    #check if we still have bad lables under the TH
    badLabels = getBadLabel(data)
    data = data[~data.iloc[:,-1].isin(badLabels)]


    strCoulmns = data.dtypes[data.dtypes == "object"].index
    if len(strCoulmns) > 0:
        le = preprocessing.LabelEncoder()
        for i in strCoulmns:
            data[i] = data[i].astype('str')
            data[i] = le.fit_transform(data[i])
    X = data.iloc[:, :-1]
    y = data.iloc[:,-1]
    return X, y

In [33]:
for i,path in enumerate(allDataSetsPaths): 
    name = path.split('\\')[-1].split('.')[0]
    X, y = getDataFromPath(path)
#     print ('id', i ,name, 'shape: ', X.shape, 'with', len(y.unique()),'labels') 

## exemple

In [36]:
np.random.seed(42)
models = {}

distributions = dict(estimator__n_estimators=np.arange(50,300,20),estimator__max_leaf_nodes = [2,4,5])
models['infiboost'] = [InfiniteBoosting(),distributions]
distributions = dict(estimator__max_depth=np.arange(3,10),estimator__n_estimators=np.arange(50,300,20))
models['KTBoost'] = [KTBoost.BoostingClassifier(),distributions]


In [37]:


measuers = pd.DataFrame(columns = ["Dataset_Name","AlgoName","CrossVal","HP_vals",
                                   "ACC","TPR","FPR","Precsion","ROC","Precstion_Recall",
                                   "Training_Time","Inference_Time"])

index = 0
crossValNum = 0
for AlgoName,items in models.items():
    model = items[0]
    dist = items[1]
    print(f"run on {AlgoName}..")
    for datasetName in tqdm(allDataSetsPaths[5:18]):
        X, y = getDataFromPath(datasetName)
        X = X.values
        y = y.values
        multiclass = False
        if(len(np.unique(y))>2):
                multiclass = True
        folder = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
        for train_indices, test_indices in folder.split(X, y):
            row = []
            X_train = X[train_indices]
            X_test = X[test_indices]
            y_train = y[train_indices]
            y_test = y[test_indices]
            clf = OneVsRestClassifier(clone(model))
            RS = RandomizedSearchCV(clf, dist, random_state=42,n_iter = 2,cv = 2)

            now = datetime.datetime.now()
            RS.fit(X_train, y_train)
            time_stop = datetime.datetime.now()

            y_pred = RS.predict(X_test)
            y_pred_proba = RS.predict_proba(X_test)

            #collect measures
            Dataname = datasetName.split("\\")[1].split(".")[0]
            row.append(Dataname)
            row.append(AlgoName)
            row.append(crossValNum)
            row.append(str(RS.best_params_))
            acc = accuracy_score(y_test,y_pred)
            row.append(acc)

        #     #soultion for multiclass
        #     for i in range(n_classes):
        #     fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        #     roc_auc[i] = auc(fpr[i], tpr[i])

        #     fpr, tpr, _ = roc_curve(y_test,y_pred)
            row.append("tpr")
            row.append("fpr")
            precsion = precision_score(y_test, y_pred, average='macro')
            row.append(precsion)
            if(multiclass):
                ROC = roc_auc_score(y_test, y_pred_proba,multi_class = "ovr")
            else:
                ROC = roc_auc_score(y_test, y_pred_proba[:,1])
            row.append(ROC)
        #     APS = average_precision_score(y_test,y_pred_proba)
            row.append("APS")
            T_time = str(time_stop- now)
            row.append(T_time)
            row.append("Inference_Time")

            measuers.loc[index] = row

            crossValNum+=1
            index+=1
    #     del clf


  0%|          | 0/13 [00:00<?, ?it/s]

run on infiboost..


100%|██████████| 13/13 [01:59<00:00,  9.17s/it]
  0%|          | 0/13 [00:00<?, ?it/s]

run on KTBoost..


100%|██████████| 13/13 [00:26<00:00,  2.06s/it]


In [78]:
measuers

Unnamed: 0,Dataset_Name,AlgoName,CrossVal,HP_vals,ACC,TPR,FPR,Precsion,ROC,Precstion_Recall,Training_Time,Inference_Time
0,abalon,infiboost,0,"{'estimator__n_estimators': 190, 'estimator__m...",0.644806,tpr,fpr,0.629621,0.815838,APS,0:00:03.742512,Inference_Time
1,abalon,infiboost,1,"{'estimator__n_estimators': 250, 'estimator__m...",0.628831,tpr,fpr,0.622022,0.793414,APS,0:00:04.056371,Inference_Time
2,acute-inflammation,infiboost,2,"{'estimator__n_estimators': 190, 'estimator__m...",1.0,tpr,fpr,1.0,1.0,APS,0:00:00.783319,Inference_Time
3,acute-inflammation,infiboost,3,"{'estimator__n_estimators': 250, 'estimator__m...",1.0,tpr,fpr,1.0,1.0,APS,0:00:00.887792,Inference_Time
4,acute-nephritis,infiboost,4,"{'estimator__n_estimators': 250, 'estimator__m...",1.0,tpr,fpr,1.0,1.0,APS,0:00:00.871812,Inference_Time
5,acute-nephritis,infiboost,5,"{'estimator__n_estimators': 250, 'estimator__m...",1.0,tpr,fpr,1.0,1.0,APS,0:00:00.834767,Inference_Time
6,abalon,KTBoost,6,"{'estimator__n_estimators': 230, 'estimator__m...",0.649114,tpr,fpr,0.642916,0.828526,APS,0:00:07.427137,Inference_Time
7,abalon,KTBoost,7,"{'estimator__n_estimators': 70, 'estimator__ma...",0.620211,tpr,fpr,0.618095,0.812144,APS,0:00:06.106812,Inference_Time
8,acute-inflammation,KTBoost,8,"{'estimator__n_estimators': 70, 'estimator__ma...",1.0,tpr,fpr,1.0,1.0,APS,0:00:00.152584,Inference_Time
9,acute-inflammation,KTBoost,9,"{'estimator__n_estimators': 70, 'estimator__ma...",1.0,tpr,fpr,1.0,1.0,APS,0:00:00.158543,Inference_Time


In [93]:
def getBadLabel(data):
    countSeries = data.iloc[:,-1].value_counts()
    badLabels = countSeries[countSeries< TH_to_other].index
    return badLabels

TH_to_other = 10
data = pd.read_csv(allDataSetsPaths[12])
data = data.dropna()
badLabels = getBadLabel(data)
data.iloc[:,-1] = data.iloc[:,-1].apply(lambda x : "ohter" if x in list(badLabels) else x)

#check if we still have bad lables under the TH
badLabels = getBadLabel(data)
data = data[~data.iloc[:,-1].isin(badLabels)]


strCoulmns = data.dtypes[data.dtypes == "object"].index
if len(strCoulmns) > 0:
    le = preprocessing.LabelEncoder()
    for i in strCoulmns:
        data[i] = data[i].astype('str')
        data[i] = le.fit_transform(data[i])


In [94]:
data.iloc[:,-1].value_counts()

1    48
2    46
3    29
0    20
4    13
Name: symboling, dtype: int64

In [88]:
data.iloc[:,-1].value_counts()

1    48
2    46
3    29
0    20
4    13
5     3
Name: symboling, dtype: int64