In [16]:
list(range(4))

[0, 1, 2, 3]

In [9]:
import os
import random
import sys
import time
import math

import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.pipeline import Pipeline
from scipy.stats import mode

class Node:
    def __init__(self):

        self.right = None
        self.left = None
        
        self.prototype = None
        
        self.column = None
        self.threshold = None
        
        self.probas = None
        self.depth = None
        
        self.is_terminal = False
        
class PrototypeTreeClassifier:
    def __init__(self,
                train_features,
                 feature_types = ["min", "max", "mean"], 
                 max_depth = 3, 
                 min_samples_leaf = 1, 
                 min_samples_split = 2, 
                 prototype_count = 1,
                 use_prototype_learner=True,
                 early_stopping_round = 3):

        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.prototype_count = prototype_count
        self.feature_types = feature_types
        self.train_features = train_features
        self.use_prototype_learner = use_prototype_learner
        self.Tree = None
        self.early_stopping_round = early_stopping_round
        
    def prototype(self, bags, features, labels, prototype_count):
        number_of_rows = features.shape[0]
        random_indices = np.random.choice(number_of_rows, 
                                          size=prototype_count, 
                                          replace=False)

        prot = features[random_indices, :]
        if len(prot.shape) == 1:
            prot = prot.reshape(1, prot.shape[0])
        return prot

    def nodeProbas(self, y):
        # for each unique label calculate the probability for it
        probas = []

        return np.asarray(np.sum(y)/y.size)

    def features_via_prototype(self, feature_types, features, bag_ids, prototypes):
        distances = self.calculate_distances(features, prototypes)
        
        bin_count  = np.unique(bag_ids, return_counts=True)[1]
        _, index  = np.unique(bag_ids, return_index=True)

        feature_list = []
        for i in range(0, prototypes.shape[0]):
            if "max" in feature_types:
                group_max = np.maximum.reduceat(distances[:, i], index)
                max_vals = np.repeat(group_max, bin_count)
                feature_list.append(max_vals)

            if "min" in feature_types:
                group_min = np.minimum.reduceat(distances[:, i], index)
                min_vals = np.repeat(group_min, bin_count)
                feature_list.append(min_vals)

            if "mean" in feature_types:
                group_mean = np.add.reduceat(distances[:, i], index)
                mean_vals = np.repeat(group_mean/bin_count, bin_count)
                feature_list.append(mean_vals)
        
        return np.array(np.transpose(feature_list))

    def dist1d(self, features, prototypes, distance_type="l2"):
        if distance_type == "l2":

            distance = np.linalg.norm(features - prototypes, axis=1)
        elif distance_type == "l1":
            distance = np.abs(features - prototypes)
            distance = np.sum(distance, axis=1)

        return distance

    def calculate_distances(self, features, prototypes):
        feature_list = []
        
        for i in range(0, prototypes.shape[0]):
            data = self.dist1d(features, prototypes[i], distance_type="l2")
            feature_list.append(data)
        data = np.column_stack(feature_list)

        return data

    def calcBestSplit(self, features, features_via_prototype, labels, bag_ids):
        ids, index  = np.unique(bag_ids, return_index=True)
        
        bdc = tree.DecisionTreeRegressor(
            min_samples_split=2,
            criterion="mae"
        )
        bdc.fit(features_via_prototype[index], labels[index])
        
        threshold = bdc.tree_.threshold[0]
        split_col = bdc.tree_.feature[0]

        features_left = features[features_via_prototype[:,split_col] <= bdc.tree_.threshold[0]]
        features_right = features[features_via_prototype[:,split_col] > bdc.tree_.threshold[0]]

        labels_left = labels[features_via_prototype[:,split_col] <= bdc.tree_.threshold[0]]
        labels_right = labels[features_via_prototype[:,split_col] > bdc.tree_.threshold[0]]

        bag_ids_left = bag_ids[features_via_prototype[:,split_col] <= bdc.tree_.threshold[0]]
        bag_ids_right = bag_ids[features_via_prototype[:,split_col] > bdc.tree_.threshold[0]]

        return split_col, threshold, features_left, features_right, labels_left, labels_right, bag_ids_left, bag_ids_right

    def buildDT(self, features, labels, bag_ids, node):
            '''
            Recursively builds decision tree from the top to bottom
            '''
            # checking for the terminal conditions

            if node.depth >= self.max_depth:
                node.is_terminal = True
                return

            if len(np.unique(bag_ids)) < self.min_samples_split:
                node.is_terminal = True
                return

            if np.unique(labels).shape[0] == 1:
                node.is_terminal = True
                return
            
            node.prototype = self.prototype(bag_ids, features, labels, self.prototype_count)
            features_updated = self.features_via_prototype(self.feature_types, features, bag_ids, node.prototype)
            
            # calculating current split
            (splitCol, 
             thresh, 
             features_left, 
             features_right, 
             labels_left, 
             labels_right, 
             bag_ids_left, 
             bag_ids_right) = self.calcBestSplit(features, 
                                                 features_updated, 
                                                 labels, 
                                                 bag_ids)
            
            if splitCol is None:
                node.is_terminal = True
                return

            if len(np.unique(bag_ids_left)) < self.min_samples_leaf or len(np.unique(bag_ids_right)) < self.min_samples_leaf:
                node.is_terminal = True
                return
            
            node.column = splitCol
            node.threshold = thresh
            
            _, index_left  = np.unique(bag_ids_left, return_index=True)
            _, index_right  = np.unique(bag_ids_right, return_index=True)
            
            # creating left and right child nodes
            node.left = Node()
            node.left.depth = node.depth + 1
            node.left.probas = self.nodeProbas(labels_left[index_left])

            node.right = Node()
            node.right.depth = node.depth + 1
            node.right.probas = self.nodeProbas(labels_right[index_right])

            # splitting recursively
            
            self.buildDT(features_right, labels_right, bag_ids_right, node.right)
            self.buildDT(features_left, labels_left, bag_ids_left, node.left)

    def fit(self, features, labels, bag_ids):
        '''
        Standard fit function to run all the model training
        '''
        self.Tree = Node()
        self.Tree.depth = 1
        
        self.buildDT(features, labels, bag_ids, self.Tree)

    def predictSample(self, features, bag_ids, node):
        '''
        Passes one object through decision tree and return the probability of it to belong to each class
        '''

        # if we have reached the terminal node of the tree
        if node.is_terminal:
            return node.probas

        features_updated = self.features_via_prototype(self.feature_types, features, bag_ids, node.prototype)

        if features_updated[0][node.column] > node.threshold:
            probas = self.predictSample(features, bag_ids, node.right)
        else:
            probas = self.predictSample(features, bag_ids, node.left)

        return probas

    def predict(self, features, bag_ids):
        '''
        Returns the labels for each X
        '''

        if type(features) == pd.DataFrame:
            X = np.asarray(features)

        sort_index = np.argsort(bag_ids)
        bag_ids = bag_ids[sort_index]
        features = features[sort_index]

        features_updated = self.features_via_prototype(self.feature_types, features, bag_ids, self.Tree.prototype)

        index  = np.unique(bag_ids, return_index=True)[1]
        count  = np.unique(bag_ids, return_counts=True)[1]
        index = np.append(index, bag_ids.shape[0])   
        predictions = []

        for i in range(0, len(index) - 1):
            pred = self.predictSample(features[index[i]:index[i+1]], 
                                                bag_ids[index[i]:index[i+1]], 
                                                self.Tree)
            
            pred = np.repeat(pred, count[i])
            predictions = np.concatenate((predictions, pred), axis=0)
        
        return np.asarray(predictions)

class PrototypeForest:
    def __init__(self, size,
                feature_types = ["min", "mean", "max"],
                max_depth = 8, 
                min_samples_leaf = 2, 
                min_samples_split = 2, 
                prototype_count = 1,
                use_prototype_learner = True,
                early_stopping_round = 10):
        self.size = size
        self._trees = []
        self._tuning_trees = []
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.prototype_count = prototype_count
        self.use_prototype_learner = use_prototype_learner
        self.early_stopping_round = early_stopping_round
        
    def sample(self, features, labels, bag_ids):
        ids, index  = np.unique(bag_ids, return_index=True)
        group_min = np.minimum.reduceat(labels, index)
        bag_size = math.ceil(group_min.shape[0] * 0.8)
        bags_all = np.random.choice(ids, bag_size, replace=False)
        
        df = pd.DataFrame(np.concatenate([train_bag_ids.reshape(train_bag_ids.shape[0],1),
                                          train_labels.reshape(train_labels.shape[0],1)],
                                         axis=1))
        
        indices_all = df[df[0].isin(bags_all)].index.to_numpy()
        inbag_indices = indices_all
        oo_bag_mask = np.ones(labels.shape[0], dtype=bool)
        oo_bag_mask[inbag_indices] = False
        outbag_indices = np.where(oo_bag_mask == 1)
        
        return inbag_indices, outbag_indices

    
    def fit(self, features, labels, bag_ids):
        for i in range(self.size):
            
            (inbag_indices, _) = self.sample(features, labels, bag_ids)
            inbag_features = features[inbag_indices]
            inbag_labels = labels[inbag_indices]
            inbag_bag_ids = bag_ids[inbag_indices]
            tree = PrototypeTreeClassifier(
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                min_samples_split=self.min_samples_split,
                prototype_count = self.prototype_count,
                use_prototype_learner = self.use_prototype_learner,
                train_features = inbag_features,
                early_stopping_round = self.early_stopping_round
            )
            tree.fit(inbag_features, inbag_labels, inbag_bag_ids)
            while tree.Tree.right is None:
                tree.fit(inbag_features, inbag_labels, inbag_bag_ids)
            self._trees.append(tree)
            
    def predict(self, features, bag_ids):
        temp = [t.predict(features, bag_ids) for t in self._trees]
        preds = np.transpose(np.array(temp))
        return np.sum(preds, axis=1)/self.size
   
def split_features_labels_bags(data):
    features = data[data.columns[~data.columns.isin([0, 1])]].to_numpy()
    labels = data[0].to_numpy()
    bag_ids = data[1].to_numpy()

    #sort_index = np.argsort(bag_ids)
    #bag_ids = bag_ids[sort_index]
    #features = features[sort_index]
    
    return (features, labels, bag_ids)

def train_test_split(dataset, rep, fold, explained_variance, fit_on_full = False, custom=False, cols=None):
    #data = pd.read_csv(f"./datasets_regression/{dataset}.csv", header=None, sep=" ")
    #testbags =  pd.read_csv(f"./datasets_regression/{dataset}.csv_rep{rep}_fold{fold}.txt", header=None)
    data = pd.read_csv(f"./datasets_regression/{dataset}.csv", header=None, sep=" ")
    testbags =  pd.read_csv(f"./datasets_regression/cv/{dataset}.csv_rep{rep}_fold{fold}.txt", header=None)

    if cols:
        data = data[list(range(cols))]
    
    if custom:
        min_limit = testbags.min()[0]
        max_limit = testbags.max()[0]
        size = testbags.size
        size_pos = size // 2
        pos = list(range(min_limit, min_limit + size_pos))
        neg = list(range(max_limit - size_pos + 1, max_limit + 1))
        testbags = pd.DataFrame([*pos, *neg])
          
    train_data = data[~data[1].isin(testbags[0].tolist())]    
    
    #for i in range(2, 94):
    #    clean_data = train_data[(train_data[i] != 0) & (train_data[i] != -32767)]
    #    mean = clean_data[i].mean()
    #    train_data[(train_data[i] == 0) | (train_data[i] == -32767)] = mean

    test_data = data[data[1].isin(testbags[0].tolist())]
    
    (train_features, train_labels, train_bag_ids) = split_features_labels_bags(train_data)
    (test_features, test_labels, test_bag_ids) = split_features_labels_bags(test_data)
    
    if explained_variance < 1:
        pipe = Pipeline([('pca', PCA(n_components = explained_variance, 
                         svd_solver = "full")), 
         ('scaler', StandardScaler()), ])
    else:
        pipe = Pipeline([('scaler', StandardScaler()), ])
    
    if fit_on_full:
        pipe.fit(data[data.columns[~data.columns.isin([0,1])]].to_numpy())
    else:
        pipe.fit(train_features)

    train_features = pipe.transform(train_features)
    test_features = pipe.transform(test_features)
    
    return (
        train_features, 
        train_labels, 
        train_bag_ids,
        test_features, 
        test_labels,
        test_bag_ids)


In [8]:
data = pd.read_csv(f"./datasets_regression/{dataset}.csv", header=None, sep=" ")
testbags =  pd.read_csv(f"./datasets_regression/{dataset}.csv_rep{1}_fold{1}.txt", header=None)
test_data = data[data[1].isin(testbags[0].tolist())]


In [11]:
data[[0, 1, 2, 3]]


Unnamed: 0,0,1,2,3
0,37.0,12292001,1245,2020
1,37.0,12292001,1178,1901
2,37.0,12292001,1176,1853
3,37.0,12292001,1152,1990
4,37.0,12292001,1218,1964
...,...,...,...,...
73495,67.4,25132005,803,3145
73496,67.4,25132005,286,1895
73497,67.4,25132005,4039,4647
73498,67.4,25132005,624,3896


In [9]:
test_data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,84,85,86,87,88,89,90,91,92,93
count,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,...,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0
mean,35.479592,16160640.0,4274.526054,4924.480272,2260.082041,3263.006735,1350.348231,2289.631905,1292.212789,2370.944082,...,1099.509728,2281.934354,2032.708027,3013.751361,1196.607347,2309.484966,1797.909116,2840.644558,1530.174762,2509.893265
std,19.193455,2623683.0,2720.590336,2390.654906,2343.990403,2053.580156,1613.29098,1471.426861,1382.825208,1321.166016,...,955.215997,1033.679838,2361.484367,2114.502216,1180.305094,1134.447049,2215.338142,2011.604882,1463.154617,1356.105557
min,0.0,12292000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0
25%,31.0,14232000.0,1510.5,3026.75,620.0,1867.0,694.0,1612.0,614.75,1694.0,...,767.0,1871.0,740.0,1836.0,751.0,1857.0,861.0,1938.75,896.0,1932.75
50%,39.0,15902000.0,4629.0,5194.0,1140.0,2614.0,891.0,1866.0,909.0,2089.0,...,1026.0,2244.0,1041.0,2310.0,1011.0,2214.0,1153.0,2405.0,1133.0,2291.0
75%,44.0,17892000.0,6564.25,6905.0,3396.0,4375.25,1082.25,2352.0,1303.25,2791.0,...,1216.0,2620.0,1582.0,3097.25,1230.25,2587.0,1453.0,2888.0,1400.0,2756.0
max,83.6,25132000.0,13815.0,13744.0,15624.0,14341.0,13650.0,12017.0,13995.0,13591.0,...,13983.0,14645.0,12695.0,13584.0,13534.0,13990.0,15876.0,14601.0,15903.0,15834.0


In [38]:
for i in range(2, 94):
    clean_data = data[(data[i] != 0) & (data[i] != -32767)]
    mean = clean_data[i].mean()
    data[(data[i] == 0) | (data[i] == -32767)] = mean

In [None]:
 df.loc[ df[“column_name”] == “some_value”, “column_name”] = “value”

In [34]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93
0,37.0,12292001,1245,2020,1073,1762,1321,2063,7286,7384,-32767,-32767,1176,2003,1391,2236,878,1728,878,1728,1440,2234,1592,2579,1461,2457,1538,2575,1321,2338,1372,2561,812,2099,1168,2590,954,2832,1197,2983,1028,2472,1044,2407,1044,2407,1629,2939,1088,2365,1385,2544,1553,2829,1441,2572,1692,2966,1473,2536,1484,2577,1362,2545,1131,2209,951,1987,1028,2101,1068,2138,1244,2335,1092,2094,1115,2112,1338,2343,1357,2306,1259,2202,1292,2189,1224,2039,1138,2106,1460,2343,1285,2140
1,37.0,12292001,1178,1901,1021,1721,1285,2151,1489,2453,-32767,-32767,4622,4749,1131,1948,1105,1876,1105,1876,1265,2048,1361,2248,1397,2348,1158,2182,1113,2190,1159,2394,814,2184,1025,2587,983,2968,895,2840,625,2538,814,2541,814,2541,1353,3111,1282,3082,1043,2619,718,2362,934,2501,1180,2870,942,2542,1107,2490,1005,2532,800,2171,708,1851,805,1953,793,1842,897,1977,703,1718,1080,2065,1117,2039,984,1958,888,1984,715,1795,805,1760,3671,4422,1222,2206,996,1909
2,37.0,12292001,1176,1853,925,1660,1000,1730,6649,6802,-32767,-32767,1308,2145,1290,2080,813,1637,813,1637,1471,2213,1652,2545,1618,2522,1523,2462,1113,1939,1390,2465,786,1965,1181,2581,1128,2911,1192,2991,1074,2554,961,2373,961,2373,1561,2949,927,2445,1205,2364,1166,2227,1369,2519,1151,2205,1407,2449,1383,2441,1230,2326,1083,2032,897,1760,934,1849,932,1784,1098,1916,992,1754,930,1631,1134,1908,1155,1881,760,1325,1022,1727,975,1616,845,1747,1248,2015,931,1608
3,37.0,12292001,1152,1990,1069,1869,1520,2512,1651,2781,-32767,-32767,3225,3621,1224,2113,728,1480,728,1480,1457,2529,1327,2699,1454,2881,1304,3449,1113,3696,835,3824,550,2587,779,3643,652,4354,419,3412,665,2753,916,2440,916,2440,2409,4069,1823,3432,2291,3733,1557,2740,1503,2628,1569,2879,1435,2546,1375,2496,1158,2600,1013,2345,934,2103,1012,2326,1049,2143,1164,2224,1258,2516,1060,2037,1136,2105,1182,2230,1000,1918,1041,1912,1033,1769,1005,1934,1270,2278,1113,1919
4,37.0,12292001,1218,1964,1134,1821,1393,2306,1629,2708,-32767,-32767,5281,5334,1246,2061,757,1501,757,1501,1358,2158,1396,2297,1100,2000,1250,2309,1126,2164,1235,2592,720,2111,1045,2553,976,3063,822,2849,621,2543,886,3130,886,3130,1348,3136,1300,3211,1098,2771,932,3873,955,2683,1365,3125,1091,2603,1047,2570,1000,2479,824,2230,747,2004,868,2017,897,1968,896,1940,882,1849,1083,2128,1084,2046,1064,1953,1073,2172,762,1996,1028,1830,936,2470,1324,2298,953,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73495,67.4,25132005,803,3145,665,2321,461,2170,617,2409,-32767,-32767,654,3823,367,2114,309,2251,545,2629,803,2591,576,2490,795,2635,970,2523,1120,2781,1143,2851,1056,2263,1044,2177,1196,2338,1047,2059,953,1802,1052,1968,1166,2030,1055,1872,1195,2033,1063,1838,1209,2091,1056,1829,1180,2049,990,1740,1117,1896,1039,1809,1089,1903,892,1639,966,1660,951,1582,1079,1810,754,1324,630,1304,661,1328,627,1256,596,1242,559,1198,682,1338,748,1427,640,1280,656,1338
73496,67.4,25132005,286,1895,472,2230,543,2655,625,2836,-32767,-32767,1264,2613,453,2650,485,3034,577,3040,805,2990,544,2990,881,2923,939,2698,1082,3099,1329,3160,1115,2521,1124,2398,1315,2512,1210,2309,1063,2194,1198,2273,1373,2442,1245,2146,1401,2475,1323,2293,1425,2482,1350,2337,1433,2450,1247,2218,1385,2332,1274,2196,1357,2297,1196,2082,1147,1928,1151,2013,1359,2157,853,1485,660,1541,798,1847,694,1584,1148,2190,708,1471,716,1628,989,1957,706,1485,872,1739
73497,67.4,25132005,4039,4647,444,2313,537,2369,587,2474,-32767,-32767,1484,2584,427,2125,1630,3601,604,2623,790,2738,477,2432,833,2795,1050,2600,1213,2903,1258,2939,1429,2876,1252,2417,1307,2547,1078,2326,1100,2117,1200,2316,1383,2482,1296,2263,1399,2490,1210,2265,1273,2283,1209,2199,1281,2337,1134,2125,1251,2249,1244,2189,1352,2386,1401,2463,1090,1942,1031,1818,960,1762,755,1433,899,1789,761,1737,763,1872,687,1727,745,1659,951,2065,762,1753,962,2224,662,1651
73498,67.4,25132005,624,3896,538,2652,846,2536,701,2677,-32767,-32767,575,4194,542,2818,2090,4729,774,2793,861,3637,370,2880,1059,3021,955,2840,1161,3312,1307,3383,1325,3100,1155,2523,1224,2713,1166,2559,1163,2319,1299,2619,1486,2862,1367,2491,1525,2883,1441,2620,1508,2815,1460,2621,1562,2818,1434,2588,1542,2891,1457,2734,1575,2743,1454,2630,1357,2364,1275,2199,1398,2510,957,1735,899,2010,1028,2281,1059,2388,973,2257,846,1916,1098,2617,885,2305,1119,2618,898,2043


In [10]:
pd.set_option('display.max_columns', None)

test_data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93
count,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0,14700.0
mean,35.479592,16160640.0,4274.526054,4924.480272,2260.082041,3263.006735,1350.348231,2289.631905,1292.212789,2370.944082,-8742.086735,-8117.92381,1168.023197,2443.358027,949.865782,2116.671905,1009.62415,2316.385782,981.321156,2243.589864,1024.10415,2412.186259,996.081293,2501.477075,1022.563537,2527.577619,900.197347,2491.537619,935.122041,2818.808503,886.014762,2669.705238,905.570612,2755.207007,846.921905,2731.110816,931.870272,2793.141769,884.242517,2850.252653,865.399524,2698.675918,836.032041,2622.266395,907.529456,2862.602245,939.642857,2958.196463,870.730748,2931.097891,842.863946,2886.632177,910.537211,2861.132653,908.980884,2829.522041,934.386599,2890.509116,847.526395,2738.185034,767.950952,2711.173197,780.72585,2606.41932,833.14381,2694.877075,809.628299,2514.732041,901.923333,2464.303741,880.134422,2351.626327,887.876531,2222.734014,891.342585,2132.107823,899.461224,2091.952857,908.602721,2066.539388,1000.826531,2256.586939,964.003673,2169.06102,1099.509728,2281.934354,2032.708027,3013.751361,1196.607347,2309.484966,1797.909116,2840.644558,1530.174762,2509.893265
std,19.193455,2623683.0,2720.590336,2390.654906,2343.990403,2053.580156,1613.29098,1471.426861,1382.825208,1321.166016,15198.296981,15598.824514,1205.370981,1253.586922,958.57349,1071.496286,908.221136,1113.688813,792.82938,900.686617,800.95139,939.122441,1004.815679,1179.728455,838.928573,988.783785,760.48883,937.775902,775.341708,1024.716632,777.467288,964.024269,710.431176,940.242337,742.653849,915.566118,652.348099,855.960273,634.977682,930.519574,618.525679,847.289621,543.290231,836.279989,524.810759,918.211846,548.632612,894.282439,490.247393,932.089785,489.839096,938.250246,490.653675,865.150339,484.169166,880.769235,506.121124,925.562019,537.487632,899.780189,474.607214,919.6534,447.761509,843.83456,454.548687,839.228416,439.394897,785.335589,461.621157,757.958446,448.68743,719.158328,426.515811,706.906255,417.031543,661.460995,413.087608,648.245661,420.117525,673.935437,498.760456,763.830478,421.808805,729.561703,955.215997,1033.679838,2361.484367,2114.502216,1180.305094,1134.447049,2215.338142,2011.604882,1463.154617,1356.105557
min,0.0,12292000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-32767.0,-32767.0,0.0,0.0,0.0,-45.0,0.0,0.0,0.0,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-16.0,0.0,0.0,0.0,0.0,0.0,-13.0,0.0,-16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-15.0,0.0,-14.0,0.0,0.0,0.0,-12.0,0.0,-8.0,0.0,0.0,0.0,0.0,0.0,-15.0,0.0,0.0,0.0,0.0,0.0,-18.0,0.0,-9.0,0.0,-12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0
25%,31.0,14232000.0,1510.5,3026.75,620.0,1867.0,694.0,1612.0,614.75,1694.0,-32767.0,-32767.0,732.0,1857.0,611.0,1683.0,669.0,1795.0,666.0,1867.0,707.0,2007.75,598.0,1943.0,658.0,2078.0,551.0,2068.0,574.0,2348.0,554.0,2268.0,569.0,2378.75,517.0,2394.0,582.0,2542.0,532.0,2492.0,495.0,2415.0,522.0,2283.0,609.0,2463.0,571.0,2656.0,557.0,2544.75,527.0,2503.0,566.0,2573.0,584.0,2509.0,601.0,2525.0,518.0,2354.0,460.0,2291.0,487.0,2264.0,535.0,2379.0,515.0,2223.0,575.0,2198.0,554.0,2098.0,654.0,1936.75,659.0,1911.75,674.0,1875.0,720.0,1825.0,784.0,1964.0,778.0,1870.0,767.0,1871.0,740.0,1836.0,751.0,1857.0,861.0,1938.75,896.0,1932.75
50%,39.0,15902000.0,4629.0,5194.0,1140.0,2614.0,891.0,1866.0,909.0,2089.0,786.0,1605.0,980.0,2197.0,872.0,1959.0,957.0,2140.0,978.0,2164.0,1033.0,2346.5,882.0,2341.0,995.0,2493.0,805.0,2470.0,840.0,2829.0,786.0,2666.0,836.0,2802.0,732.0,2751.0,844.5,2878.0,787.0,2901.0,780.0,2782.0,777.0,2719.0,835.0,2937.0,870.0,3071.0,794.5,2991.5,747.0,2925.0,847.0,2940.5,851.0,2896.0,870.0,2967.0,755.0,2789.0,681.0,2748.0,684.0,2647.0,767.0,2802.0,750.5,2595.0,880.0,2552.0,880.0,2426.0,872.0,2303.0,892.0,2181.0,906.0,2132.0,933.0,2110.0,1034.0,2321.0,1008.0,2235.0,1026.0,2244.0,1041.0,2310.0,1011.0,2214.0,1153.0,2405.0,1133.0,2291.0
75%,44.0,17892000.0,6564.25,6905.0,3396.0,4375.25,1082.25,2352.0,1303.25,2791.0,978.0,1908.0,1210.0,2756.25,1061.0,2331.25,1157.0,2633.25,1166.0,2561.0,1234.0,2802.0,1143.0,2880.0,1232.0,2954.0,1109.0,2936.0,1136.0,3364.0,1033.0,3146.0,1096.0,3237.0,1003.25,3190.0,1175.0,3216.25,1120.0,3355.0,1103.0,3119.0,1051.0,3116.0,1150.0,3460.0,1250.0,3468.0,1118.0,3497.25,1096.0,3395.0,1219.0,3312.0,1195.0,3302.0,1216.0,3368.0,1091.0,3222.0,981.0,3227.25,1000.25,3043.0,1067.0,3150.0,1054.0,2948.0,1187.0,2883.0,1162.0,2740.0,1101.0,2614.0,1122.0,2462.0,1125.0,2412.0,1121.0,2417.0,1239.0,2656.0,1202.0,2591.0,1216.0,2620.0,1582.0,3097.25,1230.25,2587.0,1453.0,2888.0,1400.0,2756.0
max,83.6,25132000.0,13815.0,13744.0,15624.0,14341.0,13650.0,12017.0,13995.0,13591.0,9371.0,9311.0,14527.0,13470.0,13807.0,12737.0,12735.0,12674.0,10482.0,9176.0,10479.0,10595.0,11574.0,11079.0,10813.0,10955.0,9782.0,9456.0,10104.0,8877.0,10114.0,9229.0,9291.0,8611.0,10862.0,9184.0,10102.0,8012.0,9314.0,7716.0,8433.0,7235.0,8757.0,7417.0,7915.0,6471.0,8059.0,6630.0,5736.0,6048.0,5768.0,7539.0,4524.0,6062.0,5370.0,6361.0,5102.0,6464.0,6078.0,6601.0,7425.0,7681.0,4304.0,5711.0,4370.0,5822.0,4796.0,6333.0,4259.0,5228.0,4117.0,4986.0,4214.0,5890.0,3870.0,5158.0,4502.0,5145.0,11294.0,10707.0,8626.0,7969.0,6068.0,5990.0,13983.0,14645.0,12695.0,13584.0,13534.0,13990.0,15876.0,14601.0,15903.0,15834.0


In [21]:
data[data[2] < 0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,84,85,86,87,88,89,90,91,92,93


In [None]:
perf_df = pd.DataFrame(info_list, columns=["dataset", "rep", "fold", "best_size", "best_depth", "auc", "time"])
perf_df.to_csv(f"./performance/{dataset}.csv")

In [3]:
info_list = []

for i in range(1, 46):
    col_no = i*2 + 2
    dataset = "WheatYields"
    print(i)
    
    (train_features,
         train_labels,
         train_bag_ids,
         test_features,
         test_labels,
         test_bag_ids) = train_test_split(dataset, 1, 1, 1, fit_on_full = True, cols = col_no)

    model = PrototypeForest(size=100,
                            max_depth=8,
                            min_samples_leaf=2,
                            min_samples_split=4,
                            prototype_count=1,
                            early_stopping_round= 5)

    model.fit(train_features, train_labels, train_bag_ids)

    probas = model.predict(test_features, test_bag_ids)

    pred_df = pd.DataFrame(probas, columns=["prediction"])
    pred_df.to_csv(f"./performance/prediction_reg_wheat_{i}.csv")
    
    _, index  = np.unique(test_bag_ids, return_index=True)

    score = metrics.mean_absolute_error(test_labels[index], probas[index])

    mean = score/(np.sum(test_labels)/test_labels.size)
    info_list_row = [i, mean]
    
    info_list.append(info_list_row)
    
    
perf_df = pd.DataFrame(info_list, columns=["i", "score"])
perf_df.to_csv(f"./performance/reg_wheat.csv")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45


In [29]:
perf_df

Unnamed: 0,i,score
0,1,0.378126
1,2,0.378834
2,3,0.374876
3,4,0.37128
4,5,0.37288
5,6,0.372153
6,7,0.374168
7,8,0.371579
8,9,0.373235
9,10,0.374019


In [28]:
perf_df.to_csv(f"./performance/reg_wheat.csv")

In [None]:
pd.DataFrame(info_list, columns=["dataset", "rep", "fold", "best_size", "best_depth", "auc", "time"])

In [3]:
score/(np.sum(test_labels)/test_labels.size)

1.0664532574477603

In [108]:
dataset = "CornYields"

(train_features,
     train_labels,
     train_bag_ids,
     test_features,
     test_labels,
     test_bag_ids) = train_test_split(dataset, 1, 1, 1, fit_on_full = True)

model = PrototypeForest(size=100,
                        max_depth=8,
                        min_samples_leaf=2,
                        min_samples_split=4,
                        prototype_count=1,
                        early_stopping_round= 5)

model.fit(train_features, train_labels, train_bag_ids)

probas = model.predict(test_features, test_bag_ids)

_, index  = np.unique(test_bag_ids, return_index=True)

score = metrics.mean_absolute_error(test_labels[index], probas[index])

score/(np.sum(test_labels)/test_labels.size)

Tree 1 will be trained
Tree 11 will be trained
Tree 21 will be trained
Tree 31 will be trained
Tree 41 will be trained
Tree 51 will be trained
Tree 61 will be trained
Tree 71 will be trained
Tree 81 will be trained
Tree 91 will be trained


0.5250362515717638

In [4]:
info_list = []

for i in range(1, 46):
    col_no = i*2 + 2
    dataset = "CornYields"
    print(i)
    
    (train_features,
         train_labels,
         train_bag_ids,
         test_features,
         test_labels,
         test_bag_ids) = train_test_split(dataset, 1, 1, 1, fit_on_full = True, cols = col_no)

    model = PrototypeForest(size=100,
                            max_depth=8,
                            min_samples_leaf=2,
                            min_samples_split=4,
                            prototype_count=1,
                            early_stopping_round= 5)

    model.fit(train_features, train_labels, train_bag_ids)

    probas = model.predict(test_features, test_bag_ids)
    
    pred_df = pd.DataFrame(probas, columns=["prediction"])
    pred_df.to_csv(f"./performance/prediction_reg_corn_{i}.csv")

    _, index  = np.unique(test_bag_ids, return_index=True)

    score = metrics.mean_absolute_error(test_labels[index], probas[index])

    mean = score/(np.sum(test_labels)/test_labels.size)
    info_list_row = [i, mean]
    
    info_list.append(info_list_row)

perf_df = pd.DataFrame(info_list, columns=["i", "score"])
perf_df.to_csv(f"./performance/reg_corn.csv")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45


In [10]:
def train_test_split(dataset, rep, fold, explained_variance, fit_on_full = False, custom=False, cols=None):
    #data = pd.read_csv(f"./datasets_regression/{dataset}.csv", header=None, sep=" ")
    #testbags =  pd.read_csv(f"./datasets_regression/{dataset}.csv_rep{rep}_fold{fold}.txt", header=None)
    data = pd.read_csv(f"./datasets_regression/syn_new/{dataset}.csv", header=None, sep=",")
    testbags =  pd.read_csv(f"./datasets_regression/syn_new/cv/{dataset}.csv_rep{rep}_fold{fold}.txt", sep=",")
    
    data = np.round(data,2)
    
    if cols:
        data = data[list(range(cols))]
    
    if custom:
        min_limit = testbags.min()[0]
        max_limit = testbags.max()[0]
        size = testbags.size
        size_pos = size // 2
        pos = list(range(min_limit, min_limit + size_pos))
        neg = list(range(max_limit - size_pos + 1, max_limit + 1))
        testbags = pd.DataFrame([*pos, *neg])

    train_data = data[~data[1].isin(testbags["x"].tolist())]    
    
    #for i in range(2, 94):
    #    clean_data = train_data[(train_data[i] != 0) & (train_data[i] != -32767)]
    #    mean = clean_data[i].mean()
    #    train_data[(train_data[i] == 0) | (train_data[i] == -32767)] = mean

    test_data = data[data[1].isin(testbags["x"].tolist())]
    
    (train_features, train_labels, train_bag_ids) = split_features_labels_bags(train_data)
    (test_features, test_labels, test_bag_ids) = split_features_labels_bags(test_data)
    
    if explained_variance < 1:
        pipe = Pipeline([('pca', PCA(n_components = explained_variance, 
                         svd_solver = "full")), 
         ('scaler', StandardScaler()), ])
    else:
        pipe = Pipeline([('scaler', StandardScaler()), ])
    
    if fit_on_full:
        pipe.fit(data[data.columns[~data.columns.isin([0,1])]].to_numpy())
    else:
        pipe.fit(train_features)

    train_features = pipe.transform(train_features)
    test_features = pipe.transform(test_features)
    
    train_features = np.round(train_features,2)
    test_features = np.round(test_features,2)

    return (
        train_features, 
        train_labels, 
        train_bag_ids,
        test_features, 
        test_labels,
        test_bag_ids)


In [None]:
import os

folders = os.listdir("/home/erdemb/libs/mil/datasets_regression/syn_new/")
datasets = [x for x in folders if x != "cv"]
datasets = [x.split(".")[0] for x in datasets]

info_list = []

for dataset in datasets:
    for rep in range(1, 10):
        for fold in range(1, 10):
            print(f"dataset {dataset}, rep {rep}, fold {fold}")
            (train_features,
                 train_labels,
                 train_bag_ids,
                 test_features,
                 test_labels,
                 test_bag_ids) = train_test_split(dataset, 1, 1, 1, fit_on_full = True)

            model = PrototypeForest(size=100,
                                    max_depth=8,
                                    min_samples_leaf=2,
                                    min_samples_split=4,
                                    prototype_count=1,
                                    early_stopping_round= 5)

            model.fit(train_features, train_labels, train_bag_ids)

            probas = model.predict(test_features, test_bag_ids)

            pred_df = pd.DataFrame(probas, columns=["prediction"])
            pred_df.to_csv(f"./performance/prediction_{dataset}_rep_{rep}_fold_{fold}.csv")

            _, index  = np.unique(test_bag_ids, return_index=True)

            score = metrics.mean_absolute_error(test_labels[index], probas[index])

            mean = score/(np.sum(test_labels)/test_labels.size)
            info_list_row = [dataset, rep, fold, mean]

            info_list.append(info_list_row)

perf_df = pd.DataFrame(info_list, columns=["dataset", "rep", "fold", "score"])
perf_df.to_csv(f"./performance/{dataset}_rep_{rep}_fold_{fold}.csv")

dataset nBag_25_nFeat_2_nInsPerBag_2, rep 1, fold 1
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 1, fold 2
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 1, fold 3
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 1, fold 4
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 1, fold 5
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 1, fold 6
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 1, fold 7
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 1, fold 8
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 1, fold 9
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 2, fold 1
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 2, fold 2
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 2, fold 3
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 2, fold 4
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 2, fold 5
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 2, fold 6
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 2, fold 7
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 2, fold 8
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 2, fold 9
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 3, fold 1
dataset nBag

dataset nBag_50_nFeat_2_nInsPerBag_10, rep 9, fold 5
dataset nBag_50_nFeat_2_nInsPerBag_10, rep 9, fold 6
dataset nBag_50_nFeat_2_nInsPerBag_10, rep 9, fold 7
dataset nBag_50_nFeat_2_nInsPerBag_10, rep 9, fold 8
dataset nBag_50_nFeat_2_nInsPerBag_10, rep 9, fold 9
dataset nBag_25_nFeat_2_nInsPerBag_10, rep 1, fold 1
dataset nBag_25_nFeat_2_nInsPerBag_10, rep 1, fold 2
dataset nBag_25_nFeat_2_nInsPerBag_10, rep 1, fold 3
dataset nBag_25_nFeat_2_nInsPerBag_10, rep 1, fold 4
dataset nBag_25_nFeat_2_nInsPerBag_10, rep 1, fold 5
dataset nBag_25_nFeat_2_nInsPerBag_10, rep 1, fold 6
dataset nBag_25_nFeat_2_nInsPerBag_10, rep 1, fold 7
dataset nBag_25_nFeat_2_nInsPerBag_10, rep 1, fold 8
dataset nBag_25_nFeat_2_nInsPerBag_10, rep 1, fold 9
dataset nBag_25_nFeat_2_nInsPerBag_10, rep 2, fold 1
dataset nBag_25_nFeat_2_nInsPerBag_10, rep 2, fold 2
dataset nBag_25_nFeat_2_nInsPerBag_10, rep 2, fold 3
dataset nBag_25_nFeat_2_nInsPerBag_10, rep 2, fold 4
dataset nBag_25_nFeat_2_nInsPerBag_10, rep 2, 

dataset nBag_25_nFeat_5_nInsPerBag_10, rep 8, fold 7
dataset nBag_25_nFeat_5_nInsPerBag_10, rep 8, fold 8
dataset nBag_25_nFeat_5_nInsPerBag_10, rep 8, fold 9
dataset nBag_25_nFeat_5_nInsPerBag_10, rep 9, fold 1
dataset nBag_25_nFeat_5_nInsPerBag_10, rep 9, fold 2
dataset nBag_25_nFeat_5_nInsPerBag_10, rep 9, fold 3
dataset nBag_25_nFeat_5_nInsPerBag_10, rep 9, fold 4
dataset nBag_25_nFeat_5_nInsPerBag_10, rep 9, fold 5
dataset nBag_25_nFeat_5_nInsPerBag_10, rep 9, fold 6
dataset nBag_25_nFeat_5_nInsPerBag_10, rep 9, fold 7
dataset nBag_25_nFeat_5_nInsPerBag_10, rep 9, fold 8
dataset nBag_25_nFeat_5_nInsPerBag_10, rep 9, fold 9
dataset nBag_50_nFeat_10_nInsPerBag_5, rep 1, fold 1
dataset nBag_50_nFeat_10_nInsPerBag_5, rep 1, fold 2
dataset nBag_50_nFeat_10_nInsPerBag_5, rep 1, fold 3
dataset nBag_50_nFeat_10_nInsPerBag_5, rep 1, fold 4
dataset nBag_50_nFeat_10_nInsPerBag_5, rep 1, fold 5
dataset nBag_50_nFeat_10_nInsPerBag_5, rep 1, fold 6
dataset nBag_50_nFeat_10_nInsPerBag_5, rep 1, 

dataset nBag_50_nFeat_10_nInsPerBag_10, rep 7, fold 8
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 7, fold 9
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 8, fold 1
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 8, fold 2
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 8, fold 3
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 8, fold 4
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 8, fold 5
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 8, fold 6
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 8, fold 7
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 8, fold 8
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 8, fold 9
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 9, fold 1
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 9, fold 2
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 9, fold 3
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 9, fold 4
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 9, fold 5
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 9, fold 6
dataset nBag_50_nFeat_10_nInsPerBag_10, rep 9, fold 7
dataset nBag_50_nFeat_10_nIn

dataset nBag_50_nFeat_2_nInsPerBag_2, rep 7, fold 3
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 7, fold 4
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 7, fold 5
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 7, fold 6
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 7, fold 7
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 7, fold 8
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 7, fold 9
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 8, fold 1
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 8, fold 2
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 8, fold 3
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 8, fold 4
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 8, fold 5
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 8, fold 6
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 8, fold 7
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 8, fold 8
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 8, fold 9
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 9, fold 1
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 9, fold 2
dataset nBag_50_nFeat_2_nInsPerBag_2, rep 9, fold 3
dataset nBag

dataset nBag_50_nFeat_5_nInsPerBag_5, rep 6, fold 6
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 6, fold 7
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 6, fold 8
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 6, fold 9
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 7, fold 1
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 7, fold 2
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 7, fold 3
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 7, fold 4
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 7, fold 5
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 7, fold 6
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 7, fold 7
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 7, fold 8
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 7, fold 9
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 8, fold 1
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 8, fold 2
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 8, fold 3
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 8, fold 4
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 8, fold 5
dataset nBag_50_nFeat_5_nInsPerBag_5, rep 8, fold 6
dataset nBag

dataset nBag_25_nFeat_2_nInsPerBag_5, rep 5, fold 9
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 6, fold 1
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 6, fold 2
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 6, fold 3
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 6, fold 4
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 6, fold 5
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 6, fold 6
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 6, fold 7
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 6, fold 8
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 6, fold 9
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 7, fold 1
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 7, fold 2
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 7, fold 3
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 7, fold 4
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 7, fold 5
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 7, fold 6
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 7, fold 7
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 7, fold 8
dataset nBag_25_nFeat_2_nInsPerBag_5, rep 7, fold 9
dataset nBag

In [14]:
perf_df

Unnamed: 0,dataset,rep,fold,score
0,nBag_25_nFeat_2_nInsPerBag_2,1,1,0.383455
1,nBag_25_nFeat_2_nInsPerBag_2,1,2,0.749022
2,nBag_25_nFeat_2_nInsPerBag_2,1,3,0.993996
3,nBag_25_nFeat_2_nInsPerBag_2,1,4,0.708635
4,nBag_25_nFeat_2_nInsPerBag_2,1,5,0.882609
...,...,...,...,...
1453,nBag_25_nFeat_10_nInsPerBag_2,9,5,-4.086732
1454,nBag_25_nFeat_10_nInsPerBag_2,9,6,-2.660253
1455,nBag_25_nFeat_10_nInsPerBag_2,9,7,-2.922680
1456,nBag_25_nFeat_10_nInsPerBag_2,9,8,-2.026717
