In [1]:
import math
import os
import random
import sys
import time

import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [24]:
class Node:
    """One node of a prototype tree.

    Every attribute starts out unset; the tree builder fills in whatever the
    node needs (children, prototype, split model, class probabilities, depth).
    """

    def __init__(self):
        # Child links.
        self.right = self.left = None

        # Prototype rows used to derive this node's distance features.
        self.prototype = None

        # Split description slots.
        self.column = self.threshold = None

        # Class distribution at the node and its depth in the tree.
        self.probas = self.depth = None

        # Leaf flag and the fitted split model (if any).
        self.is_terminal = False
        self.model = None
        
class PrototypeTreeClassifier:
    """Decision tree for bag-level (multiple-instance) classification.

    At every internal node a few instance rows are drawn at random as
    "prototypes". For each bag the min / max / mean distance of its instances
    to every prototype is computed, and a logistic regression fitted on those
    bag-level features decides whether a bag goes left (prediction 0) or right
    (prediction 1). Because of that routing rule the labels are expected to be
    encoded as 0 / 1.

    Parameters
    ----------
    train_features : stored on the instance; not read by any method here.
    feature_types : iterable containing any of "min", "max", "mean".
    max_depth : a node whose depth reaches this value becomes a leaf
        (the root has depth 1).
    min_samples_leaf : minimum number of distinct bags on each side of a split.
    min_samples_split : minimum number of distinct bags required to split.
    prototype_count : number of prototype rows drawn per node.
    use_prototype_learner, early_stopping_round : stored on the instance; not
        read by any method here.
    """

    def __init__(self,
                 train_features,
                 feature_types = ["min", "max", "mean"],
                 max_depth = 3,
                 min_samples_leaf = 1,
                 min_samples_split = 2,
                 prototype_count = 1,
                 use_prototype_learner=True,
                 early_stopping_round = 3):

        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.prototype_count = prototype_count
        self.feature_types = feature_types
        self.train_features = train_features
        self.use_prototype_learner = use_prototype_learner
        self.Tree = None
        self.early_stopping_round = early_stopping_round

    def prototype(self, bags, features, labels, prototype_count):
        """Draw `prototype_count` distinct rows of `features` at random.

        `bags` and `labels` are accepted for interface compatibility but are
        not used. Returns a 2-D array with one prototype per row.
        """
        number_of_rows = features.shape[0]
        random_indices = np.random.choice(number_of_rows,
                                          size=prototype_count,
                                          replace=False)

        # A scalar draw (size=None) yields a single 1-D row; promote it to 2-D.
        return np.atleast_2d(features[random_indices, :])

    def nodeProbas(self, y):
        """Return the share of each class in `y`, ordered like `self.classes`.

        An empty `y` yields all zeros instead of a division by zero.
        """
        total = y.shape[0]
        if total == 0:
            return np.zeros(len(self.classes))

        return np.asarray([np.count_nonzero(y == one_class) / total
                           for one_class in self.classes])

    def features_via_prototype(self, feature_types, features, bag_ids, prototypes):
        """Build per-bag distance statistics, repeated for every instance row.

        For each prototype the columns are appended in the fixed order
        max, min, mean (restricted to the entries of `feature_types`).

        `bag_ids` must be sorted so that every bag is one contiguous run in
        ascending id order; `fit` and `predict` arrange this before calling.
        """
        distances = self.calculate_distances(features, prototypes)

        # First row and size of every bag (valid because bags are contiguous).
        _, index, bin_count = np.unique(bag_ids, return_index=True, return_counts=True)

        feature_list = []
        for i in range(0, prototypes.shape[0]):
            column = distances[:, i]

            if "max" in feature_types:
                group_max = np.maximum.reduceat(column, index)
                feature_list.append(np.repeat(group_max, bin_count))

            if "min" in feature_types:
                group_min = np.minimum.reduceat(column, index)
                feature_list.append(np.repeat(group_min, bin_count))

            if "mean" in feature_types:
                group_sum = np.add.reduceat(column, index)
                feature_list.append(np.repeat(group_sum / bin_count, bin_count))

        return np.array(np.transpose(feature_list))

    def dist1d(self, features, prototypes, distance_type="l2"):
        """Distance from every row of `features` to one prototype vector.

        Raises
        ------
        ValueError
            If `distance_type` is neither "l2" nor "l1".
        """
        if distance_type == "l2":
            return np.linalg.norm(features - prototypes, axis=1)
        if distance_type == "l1":
            return np.sum(np.abs(features - prototypes), axis=1)

        raise ValueError(f"Unknown distance_type: {distance_type!r}")

    def calculate_distances(self, features, prototypes):
        """Return an (n_rows, n_prototypes) matrix of L2 distances."""
        feature_list = [self.dist1d(features, prototypes[i], distance_type="l2")
                        for i in range(0, prototypes.shape[0])]

        return np.column_stack(feature_list)

    def calcBestSplit(self, features, features_via_prototype, labels, bag_ids):
        """Fit the node's logistic-regression split and partition the rows.

        The model is fitted on one row per bag (the first row of each bag) and
        then applied to all rows; rows predicted 0 go left, rows predicted 1
        go right.

        Returns
        -------
        tuple
            (model, features_left, features_right, labels_left, labels_right,
            bag_ids_left, bag_ids_right). When fewer than two classes are
            present no model can be fitted and every element is None.
        """
        # Check before fitting: a single-class node cannot be split.
        if len(np.unique(labels)) < 2:
            return None, None, None, None, None, None, None

        _, index = np.unique(bag_ids, return_index=True)

        log_reg = LogisticRegression(random_state=42)
        model = log_reg.fit(features_via_prototype[index], labels[index])

        predictions = model.predict(features_via_prototype)
        goes_left = predictions == 0
        goes_right = predictions == 1

        return (model,
                features[goes_left], features[goes_right],
                labels[goes_left], labels[goes_right],
                bag_ids[goes_left], bag_ids[goes_right])

    def buildDT(self, features, labels, bag_ids, node):
        '''
        Recursively builds decision tree from the top to bottom.

        `node` is marked terminal when the depth limit is reached, when too few
        bags remain, when only one label is left, or when the candidate split
        would leave a side with fewer than `min_samples_leaf` bags.
        '''
        # checking for the terminal conditions
        if node.depth >= self.max_depth:
            node.is_terminal = True
            return

        if len(np.unique(bag_ids)) < self.min_samples_split:
            node.is_terminal = True
            return

        if np.unique(labels).shape[0] == 1:
            node.is_terminal = True
            return

        node.prototype = self.prototype(bag_ids, features, labels, self.prototype_count)
        features_updated = self.features_via_prototype(self.feature_types, features, bag_ids, node.prototype)

        # calculating current split
        (model,
         features_left,
         features_right,
         labels_left,
         labels_right,
         bag_ids_left,
         bag_ids_right) = self.calcBestSplit(features,
                                             features_updated,
                                             labels,
                                             bag_ids)

        if model is None:
            node.is_terminal = True
            return

        if len(np.unique(bag_ids_left)) < self.min_samples_leaf or len(np.unique(bag_ids_right)) < self.min_samples_leaf:
            node.is_terminal = True
            return

        node.model = model

        # One row per bag, so that class shares are counted over bags.
        _, index_left = np.unique(bag_ids_left, return_index=True)
        _, index_right = np.unique(bag_ids_right, return_index=True)

        # creating left and right child nodes
        node.left = Node()
        node.left.depth = node.depth + 1
        node.left.probas = self.nodeProbas(labels_left[index_left])

        node.right = Node()
        node.right.depth = node.depth + 1
        node.right.probas = self.nodeProbas(labels_right[index_right])

        # splitting recursively (right first, then left)
        self.buildDT(features_right, labels_right, bag_ids_right, node.right)
        self.buildDT(features_left, labels_left, bag_ids_left, node.left)

    def fit(self, features, labels, bag_ids):
        '''
        Standard fit function to run all the model training.

        Rows are stably sorted by bag id first, because the per-bag reductions
        need every bag to be one contiguous run in ascending id order. The root
        also receives bag-level class probabilities so that a tree consisting
        of a single leaf can still predict.
        '''
        features = np.asarray(features)
        labels = np.asarray(labels)
        bag_ids = np.asarray(bag_ids)

        order = np.argsort(bag_ids, kind="stable")
        features = features[order]
        labels = labels[order]
        bag_ids = bag_ids[order]

        self.classes = np.unique(labels)

        self.Tree = Node()
        self.Tree.depth = 1

        _, first_rows = np.unique(bag_ids, return_index=True)
        self.Tree.probas = self.nodeProbas(labels[first_rows])

        self.buildDT(features, labels, bag_ids, self.Tree)

    def predictSample(self, features, bag_ids, node):
        '''
        Passes the rows of one bag through the tree and returns the class
        probabilities stored at the leaf that is reached.
        '''
        if node.is_terminal:
            return node.probas

        features_updated = self.features_via_prototype(self.feature_types, features, bag_ids, node.prototype)
        predictions = node.model.predict(features_updated)

        # Only the first row is inspected to route the bag.
        next_node = node.right if predictions[0] == 1 else node.left
        return self.predictSample(features, bag_ids, next_node)

    def predict(self, features, bag_ids):
        '''
        Returns one prediction per input row, in the same order as the input.

        Each value is the arg-max position of the reached leaf's probabilities
        (an index into `self.classes`), stored as float. All rows of a bag
        share the same prediction.
        '''
        features = np.asarray(features)
        bag_ids = np.asarray(bag_ids)

        # Group rows bag by bag; the inverse scatter below restores input order.
        order = np.argsort(bag_ids, kind="stable")
        sorted_ids = bag_ids[order]
        sorted_features = features[order]

        _, starts, counts = np.unique(sorted_ids, return_index=True, return_counts=True)

        sorted_predictions = np.empty(sorted_ids.shape[0], dtype=float)
        for start, count in zip(starts, counts):
            stop = start + count
            probas = self.predictSample(sorted_features[start:stop],
                                        sorted_ids[start:stop],
                                        self.Tree)
            sorted_predictions[start:stop] = np.argmax(probas)

        predictions = np.empty_like(sorted_predictions)
        predictions[order] = sorted_predictions
        return predictions


def split_features_labels_bags(data):
    """Split a raw frame into (features, labels, bag_ids) numpy arrays.

    Column 0 is returned as the labels, column 1 as the bag ids, and every
    remaining column as the feature matrix. Row order is left untouched
    (no sorting by bag id happens here).
    """
    feature_columns = data.columns[~data.columns.isin([0, 1])]

    return (
        data[feature_columns].to_numpy(),
        data[0].to_numpy(),
        data[1].to_numpy(),
    )

def train_test_split(dataset, rep, fold, explained_variance, fit_on_full = False, custom=False):
    """Load one repetition/fold of a dataset and return scaled train/test arrays.

    Reads ``./datasets/{dataset}.csv`` (no header) together with the list of
    test bag ids in ``./datasets/{dataset}.csv_rep{rep}_fold{fold}.txt``. Rows
    whose column 1 value is in that list form the test set; all other rows
    form the training set.

    Parameters
    ----------
    explained_variance : when below 1, a PCA step keeping that share of
        variance runs before standard scaling; otherwise only scaling is done.
    fit_on_full : fit the preprocessing pipeline on all rows of the file
        instead of only the training rows.
    custom : replace the listed test bag ids by two integer ranges of half the
        list's size each, one starting at the smallest listed id and one
        ending at the largest listed id.

    Returns
    -------
    tuple
        (train_features, train_labels, train_bag_ids,
         test_features, test_labels, test_bag_ids)
    """
    data = pd.read_csv(f"./datasets/{dataset}.csv", header=None)
    testbags = pd.read_csv(f"./datasets/{dataset}.csv_rep{rep}_fold{fold}.txt", header=None)

    if custom:
        lowest_id = testbags.min()[0]
        highest_id = testbags.max()[0]
        half = testbags.size // 2
        testbags = pd.DataFrame([
            *range(lowest_id, lowest_id + half),
            *range(highest_id - half + 1, highest_id + 1),
        ])

    in_test = data[1].isin(testbags[0].tolist())
    train_features, train_labels, train_bag_ids = split_features_labels_bags(data[~in_test])
    test_features, test_labels, test_bag_ids = split_features_labels_bags(data[in_test])

    # Optional PCA first, then standard scaling.
    steps = []
    if explained_variance < 1:
        steps.append(('pca', PCA(n_components = explained_variance,
                                 svd_solver = "full")))
    steps.append(('scaler', StandardScaler()))
    pipe = Pipeline(steps)

    if fit_on_full:
        fit_matrix = data[data.columns[~data.columns.isin([0,1])]].to_numpy()
    else:
        fit_matrix = train_features
    pipe.fit(fit_matrix)

    train_features = pipe.transform(train_features)
    test_features = pipe.transform(test_features)

    return (train_features, train_labels, train_bag_ids,
            test_features, test_labels, test_bag_ids)

In [5]:
# Load a single split (repetition i, fold j) of the chosen dataset.
dataset = "Musk1"

# i is passed as `rep` and j as `fold`; later cells reuse both names.
i = 1
j = 1

# explained_variance=1 takes the scaler-only branch of train_test_split;
# fit_on_full=True fits that scaler on every row of the file.
(train_features,
     train_labels,
     train_bag_ids,
     test_features,
     test_labels,
     test_bag_ids) = train_test_split(dataset, i, j, 1, fit_on_full = True)

In [31]:
# NOTE(review): PrototypeForest is defined in a cell further down this notebook,
# so that cell has to be executed before this one.
from sklearn.linear_model import LogisticRegression

# NOTE(review): `parameters` is not read anywhere in the visible notebook.
parameters = [[0.00001, 0.05], [0.00001, 0.05],[0.00001, 0.05], [0.00001, 0.05], [0.00001, 0.05], [1],[1]]

# Forest made of a single tree.
model = PrototypeForest(size=1,
                        max_depth=3,
                        min_samples_leaf=2,
                        min_samples_split=4,
                        prototype_count=1,
                        early_stopping_round= 5,
                        use_prototype_learner = True)

model.fit(train_features, train_labels, train_bag_ids)

probas = model.predict_proba(test_features, test_bag_ids)

# Keep one row per test bag so that the AUC is computed over bags.
_, index  = np.unique(test_bag_ids, return_index=True)

score = roc_auc_score(test_labels[index], probas[index])
# NOTE(review): end_time is recorded here without a matching start_time in this cell.
end_time = time.time()

print(f"Score is {score}")

Score is 0.7083333333333334


In [14]:
from sklearn.linear_model import LogisticRegression

# Train one stand-alone prototype tree on the current split.
model = PrototypeTreeClassifier(
    max_depth=4,
    min_samples_leaf=2,
    min_samples_split=4,
    prototype_count=1,
    early_stopping_round= 5,
    use_prototype_learner = False,
    train_features = train_features
)

model.fit(train_features, train_labels, train_bag_ids)

# model.score(features_via_prototype[index], labels[index])

# NOTE: despite the name, `probas` holds the tree's hard per-row predictions here.
probas = model.predict(test_features, test_bag_ids)

# Keep one row per test bag so that the AUC is computed over bags.
_, index  = np.unique(test_bag_ids, return_index=True)

score = roc_auc_score(test_labels[index], probas[index])

print(score)

0.7000000000000002


In [25]:
class PrototypeForest:
    """Ensemble of PrototypeTreeClassifier trees, each fitted on a bag sub-sample.

    Every tree is trained on 80% (rounded up) of the positive bags and 80%
    (rounded up) of the negative bags, drawn without replacement. A bag counts
    as positive when the minimum label of its rows is 1 and as negative when
    that minimum is 0.

    Parameters
    ----------
    size : number of trees to train.
    feature_types : forwarded to every tree.
    max_depth, min_samples_leaf, min_samples_split, prototype_count,
    use_prototype_learner, early_stopping_round : forwarded to every tree.
    """

    def __init__(self, size,
                 feature_types = ["min", "mean", "max"],
                 max_depth = 8,
                 min_samples_leaf = 2,
                 min_samples_split = 2,
                 prototype_count = 1,
                 use_prototype_learner = True,
                 early_stopping_round = 10):
        self.size = size
        self._trees = []
        self._tuning_trees = []
        # Stored so that the setting actually reaches the trees built in fit().
        self.feature_types = feature_types
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.prototype_count = prototype_count
        self.use_prototype_learner = use_prototype_learner
        self.early_stopping_round = early_stopping_round

    def sample(self, features, labels, bag_ids):
        """Draw a stratified sub-sample of whole bags.

        Works purely on the `labels` / `bag_ids` arguments and does not require
        the rows to be sorted by bag id. `features` is accepted for interface
        compatibility but is not used.

        Returns
        -------
        inbag_indices : ndarray
            Row positions of the selected bags, in ascending order.
        outbag_indices : tuple of ndarray
            Result of `np.where` on the complementary mask (a 1-tuple).
        """
        labels = np.asarray(labels)
        bag_ids = np.asarray(bag_ids)

        ids, inverse = np.unique(bag_ids, return_inverse=True)
        inverse = np.reshape(inverse, -1)

        # Minimum label per bag, independent of the row order.
        if labels.size:
            group_min = np.full(ids.shape[0], labels.max(), dtype=labels.dtype)
            np.minimum.at(group_min, inverse, labels)
        else:
            group_min = np.empty(0, dtype=labels.dtype)

        positive_ids = ids[group_min == 1]
        negative_ids = ids[group_min == 0]
        pos_bag_size = math.ceil(positive_ids.shape[0] * 0.8)
        neg_bag_size = math.ceil(negative_ids.shape[0] * 0.8)

        bags_pos = np.random.choice(positive_ids, pos_bag_size, replace=False)
        bags_neg = np.random.choice(negative_ids, neg_bag_size, replace=False)

        # Selecting through a mask keeps the rows in their original order.
        in_bag_mask = np.isin(bag_ids, np.concatenate((bags_pos, bags_neg)))
        inbag_indices = np.flatnonzero(in_bag_mask)
        outbag_indices = np.where(~in_bag_mask)

        return inbag_indices, outbag_indices

    def fit(self, features, labels, bag_ids):
        """Train `self.size` trees, each on its own bag sub-sample.

        Trees from an earlier call are discarded, so fitting twice does not
        accumulate stale trees.
        """
        self._trees = []

        for i in range(self.size):
            if self.use_prototype_learner and i % 10 == 1:
                print(f"Tree {i} will be trained")

            (inbag_indices, _) = self.sample(features, labels, bag_ids)
            inbag_features = features[inbag_indices]
            inbag_labels = labels[inbag_indices]
            inbag_bag_ids = bag_ids[inbag_indices]

            # Named `member` so the imported `sklearn.tree` module is not shadowed.
            member = PrototypeTreeClassifier(
                feature_types=self.feature_types,
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                min_samples_split=self.min_samples_split,
                prototype_count = self.prototype_count,
                use_prototype_learner = self.use_prototype_learner,
                train_features = inbag_features,
                early_stopping_round = self.early_stopping_round
            )
            member.fit(inbag_features, inbag_labels, inbag_bag_ids)
            self._trees.append(member)

    def predict(self, features, bag_ids):
        """Return the most frequent per-row prediction across all trees.

        Raises
        ------
        ValueError
            If no tree has been fitted yet.
        """
        if not self._trees:
            raise ValueError("PrototypeForest has no fitted trees; call fit() first")

        temp = [t.predict(features, bag_ids) for t in self._trees]
        preds = np.transpose(np.array(temp))
        return mode(preds,1)[0]

    def predict_proba(self, features, bag_ids):
        """Return, per row, the fraction of fitted trees that predict 1.

        Raises
        ------
        ValueError
            If no tree has been fitted yet.
        """
        if not self._trees:
            raise ValueError("PrototypeForest has no fitted trees; call fit() first")

        temp = [t.predict(features, bag_ids) for t in self._trees]
        preds = np.transpose(np.array(temp))
        # Divide by the trees actually present rather than the requested size.
        return np.sum(preds==1, axis=1)/len(self._trees)

def generate_random(lower, upper):
    """Return a random float drawn uniformly between `lower` and `upper`.

    The draw is computed as ``lower + (upper - lower) * random.random()``, so
    the result always lies inside the requested interval (and equals `lower`
    when both bounds coincide).
    """
    return random.uniform(lower, upper)

In [10]:
# Reload the split for Musk1.
# NOTE(review): relies on `i` and `j` still being set by an earlier cell.
dataset = "Musk1"

(train_features,
     train_labels,
     train_bag_ids,
     test_features,
     test_labels,
     test_bag_ids) = train_test_split(dataset, i, j, 1, fit_on_full = True)

In [77]:
from sklearn.linear_model import LogisticRegression

# Forest of 30 trees on the currently loaded split.
model = PrototypeForest(size=30,
                        max_depth=4,
                        min_samples_leaf=2,
                        min_samples_split=4,
                        prototype_count=1,
                        early_stopping_round= 5,
                        use_prototype_learner = False)

model.fit(train_features, train_labels, train_bag_ids)

probas = model.predict_proba(test_features, test_bag_ids)

# Keep one row per test bag so that the AUC is computed over bags.
_, index  = np.unique(test_bag_ids, return_index=True)

score = roc_auc_score(test_labels[index], probas[index])
# NOTE(review): end_time is recorded here without a matching start_time in this cell.
end_time = time.time()

print(f"Score is {score}")

Score is 0.8


In [None]:
# Evaluation over repetitions 1-5 and folds 1-10 for the current `dataset`.
# NOTE(review): this cell depends on `dataset`, `best_var`, `best_size`,
# `best_depth`, `info_list` and `all_accuracy` having been created by another
# cell beforehand; none of them is defined here.
for i in range(1,6):
    for j in range(1, 11):
        print(f"Rep {i}, fold {j}")
        start_time = time.time()

        (train_features,
             train_labels,
             train_bag_ids,
             test_features,
             test_labels,
             test_bag_ids) = train_test_split(dataset, i, j, best_var, fit_on_full = True)

        model = PrototypeForest(size=best_size,
                                max_depth=best_depth,
                                min_samples_leaf=2,
                                min_samples_split=4,
                                prototype_count=1,
                                early_stopping_round= 5,
                                use_prototype_learner = False)

        model.fit(train_features, train_labels, train_bag_ids)

        probas = model.predict_proba(test_features, test_bag_ids)

        # Keep one row per test bag so that the AUC is computed over bags.
        _, index  = np.unique(test_bag_ids, return_index=True)

        score = roc_auc_score(test_labels[index], probas[index])
        end_time = time.time()
        info_list_row = [dataset, i, j, best_size, best_depth, best_var, score, end_time - start_time]
        info_list.append(info_list_row)
        all_accuracy.append(score)
        print(f"Score is {score}")

# The value reported as "Accuracy" is the mean of the per-fold AUC scores.
print(f"Accuracy for {dataset} is {sum(all_accuracy)/len(all_accuracy)}")
perf_df = pd.DataFrame(info_list, columns=["dataset", "rep", "fold", "best_size", "best_depth", "best_var",  "auc", "time"])
perf_df.to_csv(f"./performance_linear/{dataset}.csv")

Best size is 100 and best depth is 8 and best var is 1.0 for dataset CorelHorses
Rep 1, fold 1


In [14]:
# Display the transformed training feature matrix.
train_features

array([[ 0.1809131 , -0.88252743, -0.43062779, ..., -0.17946317,
         0.55638614, -0.06861517],
       [ 0.1809131 , -0.80318093, -0.90819777, ..., -0.17946317,
         0.53912936, -0.06861517],
       [ 0.1809131 , -0.80318093, -0.90819777, ..., -0.17946317,
         0.53912936, -0.05014186],
       ...,
       [ 0.2362567 ,  0.20565318,  0.85736399, ..., -0.17946317,
         0.12496664, -1.28785397],
       [ 0.01488231,  0.70440263,  1.53753942, ...,  0.38295103,
         0.5046158 ,  0.74421069],
       [ 0.73434907, -0.0097159 ,  0.79947672, ...,  1.25781757,
         0.65992682,  1.15062363]])

In [17]:
from sklearn.linear_model import LogisticRegression

# Plain logistic regression fitted directly on the instance rows (no bag
# aggregation); the captured output shows it stops at the iteration limit.
log_reg = LogisticRegression(random_state=42)
fitted = log_reg.fit(train_features, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# Append the in-sample predictions as an extra last column, then show
# everything except that last column.
predicted = fitted.predict(train_features)
check = np.c_[train_features, predicted]
check[:, 0:-1]

In [29]:
# Features with the predictions appended as the last column.
check = np.c_[train_features, predicted]

In [30]:
# All columns except the appended prediction column.
check[:, 0:-1]

array([[ 0.1809131 , -0.88252743, -0.43062779, ..., -0.17946317,
         0.55638614, -0.06861517],
       [ 0.1809131 , -0.80318093, -0.90819777, ..., -0.17946317,
         0.53912936, -0.06861517],
       [ 0.1809131 , -0.80318093, -0.90819777, ..., -0.17946317,
         0.53912936, -0.05014186],
       ...,
       [ 0.2362567 ,  0.20565318,  0.85736399, ..., -0.17946317,
         0.12496664, -1.28785397],
       [ 0.01488231,  0.70440263,  1.53753942, ...,  0.38295103,
         0.5046158 ,  0.74421069],
       [ 0.73434907, -0.0097159 ,  0.79947672, ...,  1.25781757,
         0.65992682,  1.15062363]])

In [82]:
# Evaluation on Musk2 over repetitions 1-5 and folds 1-10 with fixed settings.
all_accuracy = []

dataset = "Musk2"

for i in range(1,6):
    for j in range(1, 11):
        print(f"Rep {i}, fold {j}")
        start_time = time.time()

        (train_features,
             train_labels,
             train_bag_ids,
             test_features,
             test_labels,
             test_bag_ids) = train_test_split(dataset, i, j, 1, fit_on_full = True)

        model = PrototypeForest(size=8,
                                max_depth=100,
                                min_samples_leaf=2,
                                min_samples_split=4,
                                prototype_count=1,
                                early_stopping_round= 5,
                                use_prototype_learner = False)

        model.fit(train_features, train_labels, train_bag_ids)

        probas = model.predict_proba(test_features, test_bag_ids)

        # Keep one row per test bag so that the AUC is computed over bags.
        _, index  = np.unique(test_bag_ids, return_index=True)

        score = roc_auc_score(test_labels[index], probas[index])
        end_time = time.time()
        all_accuracy.append(score)
        print(f"Score is {score}")


Rep 1, fold 1
Score is 0.75
Rep 1, fold 2
Score is 0.9285714285714286
Rep 1, fold 3
Score is 0.8541666666666666
Rep 1, fold 4
Score is 0.75
Rep 1, fold 5
Score is 0.9791666666666666
Rep 1, fold 6
Score is 1.0
Rep 1, fold 7
Score is 0.9375
Rep 1, fold 8
Score is 0.8125
Rep 1, fold 9
Score is 0.8125
Rep 1, fold 10
Score is 0.7857142857142857
Rep 2, fold 1
Score is 0.9107142857142858
Rep 2, fold 2
Score is 0.875
Rep 2, fold 3
Score is 0.7916666666666666
Rep 2, fold 4
Score is 0.7916666666666667
Rep 2, fold 5
Score is 0.7291666666666667
Rep 2, fold 6
Score is 1.0
Rep 2, fold 7
Score is 0.7083333333333334
Rep 2, fold 8
Score is 0.7708333333333334
Rep 2, fold 9
Score is 0.8958333333333334
Rep 2, fold 10
Score is 0.9761904761904763
Rep 3, fold 1
Score is 0.8928571428571428
Rep 3, fold 2
Score is 0.75
Rep 3, fold 3
Score is 0.7083333333333334
Rep 3, fold 4
Score is 0.7083333333333333
Rep 3, fold 5
Score is 0.5625
Rep 3, fold 6
Score is 1.0
Rep 3, fold 7
Score is 0.6666666666666666
Rep 3, fold 

In [84]:
# Mean of the AUC scores collected so far in all_accuracy.
sum(all_accuracy)/len(all_accuracy)

0.8410714285714285

In [21]:
# Pick the datasets of one group that have no result file yet.
groups = pd.read_csv("./dataset_groups.csv")

group_id = 2

datasets = groups[groups["Group"] == int(group_id)]["dataset"].to_list()
best_params = pd.read_csv("./best_params.csv")

# Anything with a file in ./performance_linear counts as already done; the
# name is taken as the part of the file name before the first dot.
check = os.listdir("./performance_linear")
ran_already = [x.split(".")[0] for x in check]
# Going through set() means the resulting order is not guaranteed.
datasets = list(set(datasets) - set(ran_already))

In [22]:
# Datasets still to be evaluated.
datasets

['HermitWarbler',
 'Harddrive1',
 'Web4',
 'Web2',
 'SwainsonsThrush',
 'Mutagenesis1',
 'Web5',
 'Newsgroups9',
 'Web3',
 'Newsgroups15',
 'Newsgroups7',
 'Web8',
 'Red-breastedNuthatch',
 'Newsgroups8',
 'Tiger',
 'Musk2',
 'Newsgroups3']

In [28]:
# Full experiment for an explicit dataset list: choose the PCA variance, then
# evaluate over repetitions 1-5 and folds 1-10 and write the scores to CSV.
# NOTE(review): `best_params` must have been loaded by an earlier cell.
datasets = ["Mutagenesis1"]

for dataset in datasets:
    scores = []
    info_list = []

    PCA_vals = best_params[best_params["dataset"] == dataset]["PCA"].values.tolist()
    best_depth = best_params[best_params["dataset"] == dataset]["max_depth"].values[0]
    best_size = best_params[best_params["dataset"] == dataset]["ntree"].values[0]

    # NOTE(review): these hard-coded values override the two tuned values read just above.
    best_depth = 4
    best_size = 100
    
    # The PCA entry is either a single value or several candidates joined by "-".
    # assumes the entry is a string (len() is applied to it) — TODO confirm
    if(len(PCA_vals[0]) > 1):
        PCA_vals = PCA_vals[0].split("-")
        PCA_vals = [float(x) for x in PCA_vals]
    else:
        PCA_vals = float(best_params[best_params["dataset"] == dataset]["PCA"].values[0])
    
    if(isinstance(PCA_vals, list)):
        # Try every candidate on one fixed split (rep 5, fold 10, custom test bags).
        for k in PCA_vals:
            (train_features,
                    train_labels,
                    train_bag_ids,
                    test_features,
                    test_labels,
                    test_bag_ids) = train_test_split(dataset, 5, 10, k, fit_on_full = False, custom=True)

            model = PrototypeForest(size=best_size,
                                    max_depth=best_depth,
                                    min_samples_leaf=2,
                                    min_samples_split=4,
                                    prototype_count=1,
                                    early_stopping_round= 3,
                                    use_prototype_learner = False)

            model.fit(train_features, train_labels, train_bag_ids)

            probas = model.predict_proba(test_features, test_bag_ids)

            # NOTE(review): this AUC uses every row, whereas the evaluation loop
            # below keeps one row per bag.
            score = roc_auc_score(test_labels, probas)
            scores.append([k, score])
    
            # best_var is refreshed after every candidate; the last pass holds
            # the candidate with the highest score.
            df = pd.DataFrame(scores, columns = ["variance","score"])
            print(df)
            best_row = df.iloc[df["score"].argmax()]
            best_var = best_row.get("variance")
    else:
        best_var = PCA_vals


    all_accuracy = []

    print(f"Best size is {best_size} and best depth is {best_depth} and best var is {best_var} for dataset {dataset}")

    for i in range(1,6):
        for j in range(1, 11):
            print(f"Rep {i}, fold {j}")
            start_time = time.time()

            (train_features,
                 train_labels,
                 train_bag_ids,
                 test_features,
                 test_labels,
                 test_bag_ids) = train_test_split(dataset, i, j, best_var, fit_on_full = True)

            model = PrototypeForest(size=best_size,
                                    max_depth=best_depth,
                                    min_samples_leaf=2,
                                    min_samples_split=4,
                                    prototype_count=1,
                                    early_stopping_round= 5,
                                    use_prototype_learner = False)

            model.fit(train_features, train_labels, train_bag_ids)

            probas = model.predict_proba(test_features, test_bag_ids)

            # Keep one row per test bag so that the AUC is computed over bags.
            _, index  = np.unique(test_bag_ids, return_index=True)

            score = roc_auc_score(test_labels[index], probas[index])
            end_time = time.time()
            info_list_row = [dataset, i, j, best_size, best_depth, best_var, score, end_time - start_time]
            info_list.append(info_list_row)
            all_accuracy.append(score)
            print(f"Score is {score}")

    # The value reported as "Accuracy" is the mean of the per-fold AUC scores.
    print(f"Accuracy for {dataset} is {sum(all_accuracy)/len(all_accuracy)}")
    perf_df = pd.DataFrame(info_list, columns=["dataset", "rep", "fold", "best_size", "best_depth", "best_var",  "auc", "time"])
    perf_df.to_csv(f"./performance_linear_new/{dataset}.csv")

Best size is 100 and best depth is 4 and best var is 1.0 for dataset Mutagenesis1
Rep 1, fold 1
Score is 0.7243589743589743
Rep 1, fold 2
Score is 0.6282051282051282
Rep 1, fold 3
Score is 0.8846153846153847
Rep 1, fold 4
Score is 0.8141025641025641
Rep 1, fold 5
Score is 0.717948717948718
Rep 1, fold 6
Score is 0.6785714285714286
Rep 1, fold 7
Score is 0.9880952380952381
Rep 1, fold 8
Score is 0.5476190476190477
Rep 1, fold 9
Score is 0.8263888888888888
Rep 1, fold 10
Score is 0.625
Rep 2, fold 1
Score is 0.8846153846153846
Rep 2, fold 2
Score is 0.6794871794871795
Rep 2, fold 3
Score is 0.6666666666666667
Rep 2, fold 4
Score is 0.576923076923077
Rep 2, fold 5
Score is 0.8141025641025641
Rep 2, fold 6
Score is 0.6726190476190476
Rep 2, fold 7
Score is 0.8214285714285714
Rep 2, fold 8
Score is 0.8869047619047619
Rep 2, fold 9
Score is 0.9305555555555556
Rep 2, fold 10
Score is 0.5902777777777778
Rep 3, fold 1
Score is 0.858974358974359
Rep 3, fold 2
Score is 0.8525641025641026
Rep 3, f

In [23]:
# Full experiment for every entry of `datasets`: choose the PCA variance, then
# evaluate over repetitions 1-5 and folds 1-10 and write the scores to CSV.
# NOTE(review): `datasets` and `best_params` must have been set by earlier cells.
for dataset in datasets:
    scores = []
    info_list = []

    PCA_vals = best_params[best_params["dataset"] == dataset]["PCA"].values.tolist()
    best_depth = best_params[best_params["dataset"] == dataset]["max_depth"].values[0]
    best_size = best_params[best_params["dataset"] == dataset]["ntree"].values[0]

    # The PCA entry is either a single value or several candidates joined by "-".
    # assumes the entry is a string (len() is applied to it) — TODO confirm
    if(len(PCA_vals[0]) > 1):
        PCA_vals = PCA_vals[0].split("-")
        PCA_vals = [float(x) for x in PCA_vals]
    else:
        PCA_vals = float(best_params[best_params["dataset"] == dataset]["PCA"].values[0])
    
    if(isinstance(PCA_vals, list)):
        # Try every candidate on one fixed split (rep 5, fold 10, custom test bags).
        for k in PCA_vals:
            (train_features,
                    train_labels,
                    train_bag_ids,
                    test_features,
                    test_labels,
                    test_bag_ids) = train_test_split(dataset, 5, 10, k, fit_on_full = False, custom=True)

            model = PrototypeForest(size=best_size,
                                    max_depth=best_depth,
                                    min_samples_leaf=2,
                                    min_samples_split=4,
                                    prototype_count=1,
                                    early_stopping_round= 3,
                                    use_prototype_learner = False)

            model.fit(train_features, train_labels, train_bag_ids)

            probas = model.predict_proba(test_features, test_bag_ids)

            # NOTE(review): this AUC uses every row, whereas the evaluation loop
            # below keeps one row per bag.
            score = roc_auc_score(test_labels, probas)
            scores.append([k, score])
    
            # best_var is refreshed after every candidate; the last pass holds
            # the candidate with the highest score.
            df = pd.DataFrame(scores, columns = ["variance","score"])
            print(df)
            best_row = df.iloc[df["score"].argmax()]
            best_var = best_row.get("variance")
    else:
        best_var = PCA_vals


    all_accuracy = []

    print(f"Best size is {best_size} and best depth is {best_depth} and best var is {best_var} for dataset {dataset}")

    for i in range(1,6):
        for j in range(1, 11):
            print(f"Rep {i}, fold {j}")
            start_time = time.time()

            (train_features,
                 train_labels,
                 train_bag_ids,
                 test_features,
                 test_labels,
                 test_bag_ids) = train_test_split(dataset, i, j, best_var, fit_on_full = True)

            model = PrototypeForest(size=best_size,
                                    max_depth=best_depth,
                                    min_samples_leaf=2,
                                    min_samples_split=4,
                                    prototype_count=1,
                                    early_stopping_round= 5,
                                    use_prototype_learner = False)

            model.fit(train_features, train_labels, train_bag_ids)

            probas = model.predict_proba(test_features, test_bag_ids)

            # Keep one row per test bag so that the AUC is computed over bags.
            _, index  = np.unique(test_bag_ids, return_index=True)

            score = roc_auc_score(test_labels[index], probas[index])
            end_time = time.time()
            info_list_row = [dataset, i, j, best_size, best_depth, best_var, score, end_time - start_time]
            info_list.append(info_list_row)
            all_accuracy.append(score)
            print(f"Score is {score}")

    # The value reported as "Accuracy" is the mean of the per-fold AUC scores.
    print(f"Accuracy for {dataset} is {sum(all_accuracy)/len(all_accuracy)}")
    perf_df = pd.DataFrame(info_list, columns=["dataset", "rep", "fold", "best_size", "best_depth", "best_var",  "auc", "time"])
    perf_df.to_csv(f"./performance_linear/{dataset}.csv")

Best size is 100 and best depth is 8 and best var is 1.0 for dataset HermitWarbler
Rep 1, fold 1
Score is 0.9107142857142857
Rep 1, fold 2
Score is 0.8005952380952381
Rep 1, fold 3
Score is 0.8482142857142857
Rep 1, fold 4
Score is 0.9421768707482994
Rep 1, fold 5
Score is 0.8945578231292517
Rep 1, fold 6
Score is 0.9064625850340137
Rep 1, fold 7
Score is 0.9693877551020409
Rep 1, fold 8
Score is 0.8571428571428572
Rep 1, fold 9
Score is 0.8993055555555557
Rep 1, fold 10
Score is 0.9878472222222222
Rep 2, fold 1
Score is 0.988095238095238
Rep 2, fold 2
Score is 0.9092261904761906
Rep 2, fold 3
Score is 0.8541666666666666
Rep 2, fold 4
Score is 0.8792517006802721
Rep 2, fold 5
Score is 0.9098639455782314
Rep 2, fold 6
Score is 0.7993197278911565
Rep 2, fold 7
Score is 0.9812925170068028
Rep 2, fold 8
Score is 0.9778911564625851
Rep 2, fold 9
Score is 0.8871527777777779
Rep 2, fold 10
Score is 0.9322916666666667
Rep 3, fold 1
Score is 0.90625
Rep 3, fold 2
Score is 0.8541666666666666
Rep

Score is 0.3
Rep 4, fold 8
Score is 0.5
Rep 4, fold 9
Score is 0.5
Rep 4, fold 10
Score is 0.33333333333333337
Rep 5, fold 1
Score is 0.5
Rep 5, fold 2
Score is 0.5
Rep 5, fold 3
Score is 0.5
Rep 5, fold 4
Score is 0.7083333333333334
Rep 5, fold 5
Score is 0.16666666666666669
Rep 5, fold 6
Score is 0.4
Rep 5, fold 7
Score is 0.5
Rep 5, fold 8
Score is 0.5
Rep 5, fold 9
Score is 0.4166666666666667
Rep 5, fold 10
Score is 0.33333333333333337
Accuracy for Web2 is 0.48283333333333334
Best size is 100 and best depth is 8 and best var is 1.0 for dataset SwainsonsThrush
Rep 1, fold 1
Score is 0.7287234042553191
Rep 1, fold 2
Score is 0.4787234042553192
Rep 1, fold 3
Score is 0.723404255319149
Rep 1, fold 4
Score is 0.6156914893617021
Rep 1, fold 5
Score is 0.8683510638297872
Rep 1, fold 6
Score is 0.6143617021276595
Rep 1, fold 7
Score is 0.553191489361702
Rep 1, fold 8
Score is 0.6728723404255319
Rep 1, fold 9
Score is 0.7241847826086957
Rep 1, fold 10
Score is 0.7537993920972644
Rep 2, fold

KeyboardInterrupt: 