In [16]:
list(range(4))

[0, 1, 2, 3]

In [72]:
import os
import random
import sys
import time
import math

import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.pipeline import Pipeline
from scipy.stats import mode

class Node:
    """One node of a prototype tree.

    A freshly created node carries no split and no children; the tree
    builder fills the fields in as it grows the tree.
    """

    def __init__(self):
        # Split definition: the prototype rows the node measures distances
        # to, the derived-feature column it tests and the cut-off value.
        self.prototype = None
        self.column = None
        self.threshold = None

        # Child nodes; both remain None until the node has been split.
        self.left = None
        self.right = None

        # Value predicted at this node and its depth in the tree.
        self.probas = None
        self.depth = None

        # Set to True by the tree builder when the node becomes a leaf.
        self.is_terminal = False
        
class PrototypeTreeClassifier:
    def __init__(self,
                train_features,
                 feature_types = ["min", "max", "mean"], 
                 max_depth = 3, 
                 min_samples_leaf = 1, 
                 min_samples_split = 2, 
                 prototype_count = 1,
                 use_prototype_learner=True,
                 early_stopping_round = 3):

        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.prototype_count = prototype_count
        self.feature_types = feature_types
        self.train_features = train_features
        self.use_prototype_learner = use_prototype_learner
        self.Tree = None
        self.early_stopping_round = early_stopping_round
        
    def prototype(self, bags, features, labels, prototype_count):
        """Draw random rows of ``features`` to serve as a node's prototypes.

        Args:
            bags: unused; kept for interface compatibility.
            features: 2-D array of instance features.
            labels: unused; kept for interface compatibility.
            prototype_count: number of distinct rows to draw. Values larger
                than the number of available rows are clamped to it.

        Returns:
            2-D array with one selected row of ``features`` per prototype.
        """
        number_of_rows = features.shape[0]

        # Rows are drawn without replacement, so asking for more rows than
        # exist would raise; a small node simply gets fewer prototypes.
        draw_count = prototype_count
        if draw_count is not None:
            draw_count = min(draw_count, number_of_rows)

        random_indices = np.random.choice(number_of_rows,
                                          size=draw_count,
                                          replace=False)

        prot = features[random_indices, :]
        # A scalar index (size=None) yields a single 1-D row; callers always
        # expect a 2-D array of prototypes.
        if prot.ndim == 1:
            prot = prot.reshape(1, prot.shape[0])
        return prot

    def nodeProbas(self, y):
        """Return the arithmetic mean of ``y`` as a 0-d numpy array.

        Despite the name, no per-class probabilities are computed: the
        value stored on a node is the plain mean of the labels passed in.

        Args:
            y: array-like of numeric labels.

        Returns:
            0-d ``numpy.ndarray`` holding ``sum(y) / len(y)``. An empty
            ``y`` yields ``nan`` (numpy emits a RuntimeWarning).
        """
        y = np.asarray(y)
        return np.asarray(np.sum(y) / y.size)

    def features_via_prototype(self, feature_types, features, bag_ids, prototypes):
        distances = self.calculate_distances(features, prototypes)
        
        bin_count  = np.unique(bag_ids, return_counts=True)[1]
        _, index  = np.unique(bag_ids, return_index=True)

        feature_list = []
        for i in range(0, prototypes.shape[0]):
            if "max" in feature_types:
                group_max = np.maximum.reduceat(distances[:, i], index)
                max_vals = np.repeat(group_max, bin_count)
                feature_list.append(max_vals)

            if "min" in feature_types:
                group_min = np.minimum.reduceat(distances[:, i], index)
                min_vals = np.repeat(group_min, bin_count)
                feature_list.append(min_vals)

            if "mean" in feature_types:
                group_mean = np.add.reduceat(distances[:, i], index)
                mean_vals = np.repeat(group_mean/bin_count, bin_count)
                feature_list.append(mean_vals)
        
        return np.array(np.transpose(feature_list))

    def dist1d(self, features, prototypes, distance_type="l2"):
        """Distance of every row of ``features`` to one prototype.

        Args:
            features: 2-D array, one instance per row.
            prototypes: a single prototype, broadcastable against the rows
                of ``features``.
            distance_type: ``"l2"`` (Euclidean) or ``"l1"`` (sum of absolute
                differences).

        Returns:
            1-D array with one distance per row of ``features``.

        Raises:
            ValueError: if ``distance_type`` is neither ``"l1"`` nor ``"l2"``.
        """
        # Validate first so that a typo is reported as such instead of
        # surfacing as an unbound local variable further down.
        if distance_type not in ("l1", "l2"):
            raise ValueError(
                f"unknown distance_type {distance_type!r}; expected 'l1' or 'l2'")

        difference = features - prototypes
        if distance_type == "l2":
            return np.linalg.norm(difference, axis=1)
        return np.sum(np.abs(difference), axis=1)

    def calculate_distances(self, features, prototypes):
        feature_list = []
        
        for i in range(0, prototypes.shape[0]):
            data = self.dist1d(features, prototypes[i], distance_type="l2")
            feature_list.append(data)
        data = np.column_stack(feature_list)

        return data

    def calcBestSplit(self, features, features_via_prototype, labels, bag_ids):
        """Find one threshold split on the bag-level features.

        A scikit-learn regression tree with the absolute-error criterion is
        fitted on one row per bag (the first row of each bag) and only its
        root split is used. All instance rows are then routed left
        (``<= threshold``) or right (``> threshold``).

        Args:
            features: 2-D array of instance features.
            features_via_prototype: 2-D array of derived features, row
                aligned with ``features``.
            labels: 1-D array of labels, row aligned with ``features``.
            bag_ids: 1-D array of bag ids, row aligned with ``features``.

        Returns:
            Tuple ``(split_col, threshold, features_left, features_right,
            labels_left, labels_right, bag_ids_left, bag_ids_right)``.
            When no split can be found ``split_col`` and ``threshold`` are
            ``None``, every row is reported on the left and the right-hand
            arrays are empty.
        """
        # Local import: the package is only needed to pick the spelling of
        # the criterion ("mae" was renamed to "absolute_error" in
        # scikit-learn 1.0 and the old name was removed in 1.2).
        import sklearn

        _, index = np.unique(bag_ids, return_index=True)

        major_version = int(sklearn.__version__.split(".")[0])
        criterion = "absolute_error" if major_version >= 1 else "mae"

        # Only the root split is read below, so growing deeper levels would
        # be wasted work.
        bdc = tree.DecisionTreeRegressor(
            min_samples_split=2,
            criterion=criterion,
            max_depth=1
        )
        bdc.fit(features_via_prototype[index], labels[index])

        split_col = bdc.tree_.feature[0]
        threshold = bdc.tree_.threshold[0]

        # scikit-learn marks a root that could not be split with a negative
        # feature index; using it as a column index would silently pick the
        # wrong column, so "no split" is reported explicitly instead.
        if split_col < 0:
            return (None, None,
                    features, features[:0],
                    labels, labels[:0],
                    bag_ids, bag_ids[:0])

        split_values = features_via_prototype[:, split_col]
        goes_left = split_values <= threshold
        goes_right = split_values > threshold

        return (split_col, threshold,
                features[goes_left], features[goes_right],
                labels[goes_left], labels[goes_right],
                bag_ids[goes_left], bag_ids[goes_right])

    def buildDT(self, features, labels, bag_ids, node):
            '''
            Recursively builds decision tree from the top to bottom
            '''
            # checking for the terminal conditions

            if node.depth >= self.max_depth:
                node.is_terminal = True
                return

            if len(np.unique(bag_ids)) < self.min_samples_split:
                node.is_terminal = True
                return

            if np.unique(labels).shape[0] == 1:
                node.is_terminal = True
                return
            
            node.prototype = self.prototype(bag_ids, features, labels, self.prototype_count)
            features_updated = self.features_via_prototype(self.feature_types, features, bag_ids, node.prototype)
            
            # calculating current split
            (splitCol, 
             thresh, 
             features_left, 
             features_right, 
             labels_left, 
             labels_right, 
             bag_ids_left, 
             bag_ids_right) = self.calcBestSplit(features, 
                                                 features_updated, 
                                                 labels, 
                                                 bag_ids)
            
            if splitCol is None:
                node.is_terminal = True
                return

            if len(np.unique(bag_ids_left)) < self.min_samples_leaf or len(np.unique(bag_ids_right)) < self.min_samples_leaf:
                node.is_terminal = True
                return
            
            node.column = splitCol
            node.threshold = thresh
            
            _, index_left  = np.unique(bag_ids_left, return_index=True)
            _, index_right  = np.unique(bag_ids_right, return_index=True)
            
            # creating left and right child nodes
            node.left = Node()
            node.left.depth = node.depth + 1
            node.left.probas = self.nodeProbas(labels_left[index_left])

            node.right = Node()
            node.right.depth = node.depth + 1
            node.right.probas = self.nodeProbas(labels_right[index_right])

            # splitting recursively
            
            self.buildDT(features_right, labels_right, bag_ids_right, node.right)
            self.buildDT(features_left, labels_left, bag_ids_left, node.left)

    def fit(self, features, labels, bag_ids):
        '''
        Build the tree from scratch on the given training rows.

        features: 2-D array of instance features.
        labels:   1-D array of labels, row aligned with features.
        bag_ids:  1-D array of bag ids, row aligned with features.

        The fitted tree is stored in self.Tree; nothing is returned.
        '''
        self.Tree = Node()
        self.Tree.depth = 1

        # Give the root its own prediction (mean label over bags, one label
        # per bag) exactly as child nodes get theirs. Without it a root that
        # is never split would predict None.
        _, first_rows = np.unique(bag_ids, return_index=True)
        self.Tree.probas = self.nodeProbas(np.asarray(labels)[first_rows])

        self.buildDT(features, labels, bag_ids, self.Tree)

    def predictSample(self, features, bag_ids, node):
        '''
        Passes one object through decision tree and return the probability of it to belong to each class
        '''

        # if we have reached the terminal node of the tree
        if node.is_terminal:
            return node.probas

        features_updated = self.features_via_prototype(self.feature_types, features, bag_ids, node.prototype)

        if features_updated[0][node.column] > node.threshold:
            probas = self.predictSample(features, bag_ids, node.right)
        else:
            probas = self.predictSample(features, bag_ids, node.left)

        return probas

    def predict(self, features, bag_ids):
        '''
        Returns the labels for each X
        '''

        if type(features) == pd.DataFrame:
            X = np.asarray(features)

        sort_index = np.argsort(bag_ids)
        bag_ids = bag_ids[sort_index]
        features = features[sort_index]

        features_updated = self.features_via_prototype(self.feature_types, features, bag_ids, self.Tree.prototype)

        index  = np.unique(bag_ids, return_index=True)[1]
        count  = np.unique(bag_ids, return_counts=True)[1]
        index = np.append(index, bag_ids.shape[0])   
        predictions = []

        for i in range(0, len(index) - 1):
            pred = self.predictSample(features[index[i]:index[i+1]], 
                                                bag_ids[index[i]:index[i+1]], 
                                                self.Tree)
            
            pred = np.repeat(pred, count[i])
            predictions = np.concatenate((predictions, pred), axis=0)
        
        return np.asarray(predictions)

class PrototypeForest:
    def __init__(self, size,
                feature_types = ["min", "mean", "max"],
                max_depth = 8, 
                min_samples_leaf = 2, 
                min_samples_split = 2, 
                prototype_count = 1,
                use_prototype_learner = True,
                early_stopping_round = 10):
        self.size = size
        self._trees = []
        self._tuning_trees = []
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.prototype_count = prototype_count
        self.use_prototype_learner = use_prototype_learner
        self.early_stopping_round = early_stopping_round
        
    def sample(self, features, labels, bag_ids):
        """Pick a random 80% of the bags and split the rows accordingly.

        Bags are drawn without replacement; ``ceil(0.8 * n_bags)`` of them
        are in-bag, the remainder out-of-bag.

        Args:
            features: unused; kept for interface compatibility.
            labels: unused; kept for interface compatibility.
            bag_ids: 1-D array-like with the bag id of every row.

        Returns:
            ``(inbag_indices, outbag_indices)``. ``inbag_indices`` is a 1-D
            array of row positions; ``outbag_indices`` is the one-element
            tuple produced by ``np.where``, kept in that form for
            compatibility with existing callers.
        """
        # Work on the arguments rather than on notebook-level variables, so
        # that the sample always matches the data passed in.
        bag_ids = np.asarray(bag_ids)
        ids = np.unique(bag_ids)

        bag_size = math.ceil(ids.shape[0] * 0.8)
        bags_all = np.random.choice(ids, bag_size, replace=False)

        in_bag_mask = np.isin(bag_ids, bags_all)
        inbag_indices = np.flatnonzero(in_bag_mask)
        outbag_indices = np.where(~in_bag_mask)

        return inbag_indices, outbag_indices

    
    def fit(self, features, labels, bag_ids):
        """Train ``self.size`` trees, each on its own random sample of bags.

        Every tree is refitted on the same sample until its root has been
        split. Trees from an earlier call to ``fit`` are discarded.

        Args:
            features: 2-D array of instance features.
            labels: 1-D array of labels, row aligned with ``features``.
            bag_ids: 1-D array of bag ids, row aligned with ``features``.

        Raises:
            RuntimeError: if a tree's root still has no split after
                ``max_attempts`` fits on its sample (for example when all
                labels in the sample are identical).
        """
        # Upper bound on refits per tree, so that data which can never be
        # split produces an error instead of an endless loop.
        max_attempts = 1000

        tree_options = dict(
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            min_samples_split=self.min_samples_split,
            prototype_count=self.prototype_count,
            use_prototype_learner=self.use_prototype_learner,
            early_stopping_round=self.early_stopping_round,
        )
        # Forward the statistic selection when the instance carries one;
        # otherwise the tree falls back to its own default.
        feature_types = getattr(self, "feature_types", None)
        if feature_types is not None:
            tree_options["feature_types"] = list(feature_types)

        fitted = []
        for _ in range(self.size):
            inbag_indices, _ = self.sample(features, labels, bag_ids)
            inbag_features = features[inbag_indices]
            inbag_labels = labels[inbag_indices]
            inbag_bag_ids = bag_ids[inbag_indices]

            estimator = PrototypeTreeClassifier(train_features=inbag_features,
                                                **tree_options)

            for _attempt in range(max_attempts):
                estimator.fit(inbag_features, inbag_labels, inbag_bag_ids)
                if estimator.Tree.right is not None:
                    break
            else:
                raise RuntimeError(
                    f"could not split the root of a tree in {max_attempts} attempts")

            fitted.append(estimator)

        # Replace the old trees only once every new tree has been built.
        self._trees = fitted
            
    def predict(self, features, bag_ids):
        temp = [t.predict(features, bag_ids) for t in self._trees]
        preds = np.transpose(np.array(temp))
        return np.sum(preds, axis=1)/self.size
   
def split_features_labels_bags(data):
    """Split a data frame into feature matrix, labels and bag ids.

    Column ``0`` holds the label and column ``1`` the bag id; every other
    column is treated as a feature. Row order is left as it is.

    Returns:
        Tuple ``(features, labels, bag_ids)`` of numpy arrays.
    """
    is_feature_column = ~data.columns.isin([0, 1])

    labels = data[0].to_numpy()
    bag_ids = data[1].to_numpy()
    features = data.loc[:, is_feature_column].to_numpy()

    return (features, labels, bag_ids)

def train_test_split(dataset, rep, fold, explained_variance, fit_on_full = False, custom=False, cols=None):
    """Load a space-separated dataset and split it by bag into train and test.

    Args:
        dataset: base name of the csv file under ``./datasets_regression``.
        rep, fold: select the file that lists the test bag ids.
        explained_variance: when below 1, a PCA step with this value as
            ``n_components`` is placed before the standard scaler.
        fit_on_full: fit the preprocessing on all rows instead of only the
            training rows.
        custom: replace the listed test bags by two integer ranges, one
            starting at the smallest and one ending at the largest listed
            id, each half as long as the list.
        cols: when truthy, keep only the first ``cols`` columns.

    Returns:
        ``(train_features, train_labels, train_bag_ids,
        test_features, test_labels, test_bag_ids)``.
    """
    data = pd.read_csv(f"./datasets_regression/{dataset}.csv", header=None, sep=" ")
    testbags =  pd.read_csv(f"./datasets_regression/{dataset}.csv_rep{rep}_fold{fold}.txt", header=None)

    if cols:
        data = data[list(range(cols))]

    if custom:
        lowest = testbags.min()[0]
        highest = testbags.max()[0]
        half = testbags.size // 2
        testbags = pd.DataFrame([
            *range(lowest, lowest + half),
            *range(highest - half + 1, highest + 1),
        ])

    # Column 1 carries the bag id; rows of listed bags form the test set.
    held_out = data[1].isin(testbags[0].tolist())
    train_data = data[~held_out]
    test_data = data[held_out]

    (train_features, train_labels, train_bag_ids) = split_features_labels_bags(train_data)
    (test_features, test_labels, test_bag_ids) = split_features_labels_bags(test_data)

    # Preprocessing: optional PCA, always followed by standard scaling.
    steps = []
    if explained_variance < 1:
        steps.append(('pca', PCA(n_components = explained_variance,
                                 svd_solver = "full")))
    steps.append(('scaler', StandardScaler()))
    pipe = Pipeline(steps)

    if fit_on_full:
        fit_rows = data[data.columns[~data.columns.isin([0,1])]].to_numpy()
    else:
        fit_rows = train_features
    pipe.fit(fit_rows)

    train_features = pipe.transform(train_features)
    test_features = pipe.transform(test_features)

    return (
        train_features,
        train_labels,
        train_bag_ids,
        test_features,
        test_labels,
        test_bag_ids)


In [None]:
info_list = []
pred_all = []

# Train and evaluate one forest per column budget on the WheatYields data.
# ``cols`` tells train_test_split how many leading columns to keep, so each
# step of ``i`` adds two more columns.
for i in range(1, 46):
    col_no = 2 * (i + 1)
    dataset = "WheatYields"
    print(i)

    (train_features, train_labels, train_bag_ids,
     test_features, test_labels, test_bag_ids) = train_test_split(
        dataset, 1, 1, 1, fit_on_full = True, cols = col_no)

    model = PrototypeForest(size=100,
                            max_depth=8,
                            min_samples_leaf=2,
                            min_samples_split=4,
                            prototype_count=1,
                            early_stopping_round= 5)
    model.fit(train_features, train_labels, train_bag_ids)
    probas = model.predict(test_features, test_bag_ids)

    names = dataset.split("_")

    # One (i, prediction, label) record per test row.
    pred_list = [(i, prediction, label)
                 for prediction, label in zip(probas, test_labels)]
    pred_all.extend(pred_list)

    # Score on one row per bag, then divide the error by the mean label.
    _, index  = np.unique(test_bag_ids, return_index=True)
    score = metrics.mean_absolute_error(test_labels[index], probas[index])
    mean = score/(np.sum(test_labels)/test_labels.size)

    info_list_row = [i, mean]
    info_list.append(info_list_row)

perf_df = pd.DataFrame(info_list, columns=["i", "score"])
perf_df.to_csv(f"./performance/performance_{dataset}.csv")

all_df = pd.DataFrame(pred_all, columns=["i", "prediction", "label"])
all_df.to_csv(f"./performance/predictions_{dataset}.csv")

1
2


In [23]:
info_list = []
pred_all = []

# Same sweep as for the other yield dataset, here on CornYields: one forest
# per value of ``i``, with ``i*2 + 2`` leading columns kept each time.
for i in range(1, 46):
    col_no = 2 * (i + 1)
    dataset = "CornYields"
    print(i)

    (train_features, train_labels, train_bag_ids,
     test_features, test_labels, test_bag_ids) = train_test_split(
        dataset, 1, 1, 1, fit_on_full = True, cols = col_no)

    model = PrototypeForest(size=100,
                            max_depth=8,
                            min_samples_leaf=2,
                            min_samples_split=4,
                            prototype_count=1,
                            early_stopping_round= 5)
    model.fit(train_features, train_labels, train_bag_ids)
    probas = model.predict(test_features, test_bag_ids)

    names = dataset.split("_")

    # Keep every row-level prediction next to its label.
    pred_list = [(i, prediction, label)
                 for prediction, label in zip(probas, test_labels)]
    pred_all.extend(pred_list)

    # Mean absolute error over bags (first row of each bag), relative to
    # the mean label.
    _, index  = np.unique(test_bag_ids, return_index=True)
    score = metrics.mean_absolute_error(test_labels[index], probas[index])
    mean = score/(np.sum(test_labels)/test_labels.size)

    info_list_row = [i, mean]
    info_list.append(info_list_row)

perf_df = pd.DataFrame(info_list, columns=["i", "score"])
perf_df.to_csv(f"./performance/performance_{dataset}.csv")

all_df = pd.DataFrame(pred_all, columns=["i", "prediction", "label"])
all_df.to_csv(f"./performance/predictions_{dataset}.csv")

1


FileNotFoundError: [Errno 2] No such file or directory: './datasets_regression/cv/CornYields.csv_rep1_fold1.txt'

In [60]:
def train_test_split(dataset, rep, fold, explained_variance, fit_on_full = False, custom=False, cols=None):
    """Load a comma-separated synthetic dataset and split it by bag.

    The data file has no header row; the fold file has one, and its test
    bag ids are read from the column named ``x``. Raw values and the
    transformed features are rounded to two decimals.

    Args:
        dataset: base name of the csv file under
            ``./datasets_regression/syn_new``.
        rep, fold: select the fold file under ``cv`` listing the test bags.
        explained_variance: when below 1, a PCA step with this value as
            ``n_components`` is placed before the standard scaler.
        fit_on_full: fit the preprocessing on all rows instead of only the
            training rows.
        custom: replace the listed test bags by two integer ranges, one
            starting at the smallest and one ending at the largest listed
            id, each half as long as the list.
        cols: when truthy, keep only the first ``cols`` columns.

    Returns:
        ``(train_features, train_labels, train_bag_ids,
        test_features, test_labels, test_bag_ids)``.
    """
    data = pd.read_csv(f"./datasets_regression/syn_new/{dataset}.csv", header=None, sep=",")
    testbags =  pd.read_csv(f"./datasets_regression/syn_new/cv/{dataset}.csv_rep{rep}_fold{fold}.txt", sep=",")

    data = np.round(data,2)

    if cols:
        data = data[list(range(cols))]

    if custom:
        # The fold file is addressed by its column label "x" everywhere
        # below, so the limits are read from, and the replacement frame is
        # built with, that same label.
        min_limit = testbags["x"].min()
        max_limit = testbags["x"].max()
        size = testbags.size
        size_pos = size // 2
        pos = list(range(min_limit, min_limit + size_pos))
        neg = list(range(max_limit - size_pos + 1, max_limit + 1))
        testbags = pd.DataFrame({"x": [*pos, *neg]})

    # Column 1 carries the bag id; rows of listed bags form the test set.
    is_test_row = data[1].isin(testbags["x"].tolist())
    train_data = data[~is_test_row]
    test_data = data[is_test_row]

    (train_features, train_labels, train_bag_ids) = split_features_labels_bags(train_data)
    (test_features, test_labels, test_bag_ids) = split_features_labels_bags(test_data)

    if explained_variance < 1:
        pipe = Pipeline([('pca', PCA(n_components = explained_variance,
                         svd_solver = "full")),
         ('scaler', StandardScaler()), ])
    else:
        pipe = Pipeline([('scaler', StandardScaler()), ])

    if fit_on_full:
        pipe.fit(data[data.columns[~data.columns.isin([0,1])]].to_numpy())
    else:
        pipe.fit(train_features)

    train_features = pipe.transform(train_features)
    test_features = pipe.transform(test_features)

    train_features = np.round(train_features,2)
    test_features = np.round(test_features,2)

    return (
        train_features,
        train_labels,
        train_bag_ids,
        test_features,
        test_labels,
        test_bag_ids)


In [69]:
import os

# Every entry of the synthetic data folder except the "cv" sub-folder is a
# dataset file; the part of the file name before the first dot is its name.
folders = os.listdir("/home/erdemb/libs/mil/datasets_regression/syn_new/")
datasets = [x for x in folders if x != "cv"]
datasets = [x.split(".")[0] for x in datasets]

info_list = []
pred_all = []

for dataset in datasets:
    # Dataset names look like nBag_25_nFeat_2_nInsPerBag_2, so the values
    # sit at the odd positions after splitting on "_".
    names = dataset.split("_")

    for rep in range(1, 10):
        for fold in range(1, 10):
            print(f"dataset {dataset}, rep {rep}, fold {fold}")

            # Load the split that belongs to this repetition and fold;
            # passing constants here would evaluate the same split on
            # every iteration.
            (train_features,
                 train_labels,
                 train_bag_ids,
                 test_features,
                 test_labels,
                 test_bag_ids) = train_test_split(dataset, rep, fold, 1, fit_on_full = True)

            model = PrototypeForest(size=100,
                                    max_depth=8,
                                    min_samples_leaf=2,
                                    min_samples_split=4,
                                    prototype_count=1,
                                    early_stopping_round= 5)

            model.fit(train_features, train_labels, train_bag_ids)

            probas = model.predict(test_features, test_bag_ids)

            # One record per test row, tagged with the dataset parameters
            # and the split it came from.
            pred_list = [(names[1], names[3], names[5], rep, fold, prediction, label)
                         for prediction, label in zip(probas, test_labels)]
            pred_all.extend(pred_list)

            # Score on one row per bag, then divide the error by the mean
            # label.
            _, index  = np.unique(test_bag_ids, return_index=True)

            score = metrics.mean_absolute_error(test_labels[index], probas[index])

            mean = score/(np.sum(test_labels)/test_labels.size)
            info_list_row = [dataset, rep, fold, mean]

            info_list.append(info_list_row)

all_df = pd.DataFrame(pred_all, columns=["nBag", "nFeat","nInsPerBag","rep", "fold", "prediction", "label"])

perf_df = pd.DataFrame(info_list, columns=["dataset", "rep", "fold", "score"])
# The table covers every dataset, repetition and fold, so it gets a fixed
# name instead of one built from whatever the loop variables held last.
perf_df.to_csv("./performance/performance_synthetic.csv")

all_df.to_csv(f"./performance/predictions_synthetic.csv")

dataset nBag_25_nFeat_2_nInsPerBag_2, rep 1, fold 1
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 1, fold 2
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 1, fold 3
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 1, fold 4
dataset nBag_25_nFeat_2_nInsPerBag_2, rep 1, fold 5


KeyboardInterrupt: 