# **🐍 CO-FOREST PYTONIZADO** 

##### **Autora: Patricia Hernando Fernández**

---------------------------
### **IMPORTS**

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from copy import deepcopy
import numpy as np

### **CLASS**

In [2]:
class Co_Forest:
    """Co-Forest style semi-supervised ensemble of decision trees.

    Every tree is first trained on a bootstrap sample of the labeled data.
    ``fit`` then repeatedly lets the *other* trees of the ensemble (the
    concomitant ensemble of a tree) pseudo-label unlabeled samples for that
    tree and retrains the trees that received new pseudo-labeled data.

    Attributes
    ----------
    L, y : numpy arrays with the labeled samples and their tags.
    U : numpy array with the unlabeled samples.
    n : number of trees in the ensemble.
    sigma : confidence threshold a pseudo-label must exceed to be accepted.
    classes : iterable with every class label the trees can predict.
    mask_L : int array of shape (len(L), n); 1 where a labeled row belongs
        to the bootstrap sample of tree ``i``.
    mask_U : int array of shape (len(U), n); 1 where an unlabeled row has
        been pseudo-labeled for tree ``i``.
    U_pseudo_tags : array of shape (len(U), n) holding the pseudo-label
        given to each unlabeled row for each tree (-1 while unassigned).
    ensemble : dict {tree index: fitted tree}.
    """

    def __init__(self, L, y, U, n, sigma, classes, max_features='sqrt'):
        """Store the data and train the initial trees.

        Parameters
        ----------
        L, U, y : numpy arrays (labeled samples, unlabeled samples, tags of L)
        n : number of trees
        sigma : confidence threshold used by ``fit``
        classes : iterable with all the class labels
        max_features : forwarded to every DecisionTreeClassifier
        """

        self.n = n
        self.sigma = sigma
        self.classes = classes

        self.L = L
        self.y = y
        self.U = U
        self.U_pseudo_tags = np.ones(shape=(self.U.shape[0], self.n)) * -1

        self.mask_L = np.zeros(shape=(self.L.shape[0], self.n), dtype=int, order='C')
        self.mask_U = np.zeros(shape=(self.U.shape[0], self.n), dtype=int, order='C')

        # The semi-supervised loop is not started here; call fit() explicitly.
        self.ensemble = self.create_trees(max_features)

    def create_trees(self, max_features) -> dict:
        """Generates a dict -> {key: int, value: Tree}

        Each tree is trained on a bootstrap sample (drawn with replacement,
        half the size of L) and the drawn rows are recorded in ``mask_L``.

        Parameters
        ----------
        max_features: number of features to consider
                      when looking for the best split
                      'sqrt', 'log2', None

        Returns
        -------
        dict
            dict containing the trees of co-forest
        """

        ensemble = {}
        n_labeled = self.L.shape[0]

        for i in range(self.n):
            rand_rows = np.random.choice(
                a=np.arange(start=0, stop=n_labeled),
                replace=True,
                size=int(0.5 * n_labeled),
            )
            self.mask_L[rand_rows, i] = 1
            h = DecisionTreeClassifier(max_features=max_features, random_state=1)
            ensemble[i] = h.fit(self.L[rand_rows, :], self.y[rand_rows])

        return ensemble

    def get_rows_training(self, i, both=True):
        """Return the training rows of tree ``i`` (feature vectors, NOT indices).

        Parameters
        ----------
        i : tree index
        both : if True, the pseudo-labeled rows of U selected for the tree
               are appended to its labeled rows; if False only the labeled
               rows are returned.
        """
        labeled_rows = self.L[self.mask_L[:, i] == 1]
        if both:
            return np.concatenate((labeled_rows, self.U[self.mask_U[:, i] == 1]))
        return labeled_rows

    def get_tags_training(self, i, both=True):
        """Return the tags matching ``get_rows_training(i, both)``.

        Parameters
        ----------
        i : tree index
        both : if True, the pseudo-tags of the selected rows of U are
               appended to the tags of the labeled rows.
        """
        labeled_tags = self.y[self.mask_L[:, i] == 1]
        if both:
            U_tags_i = self.U_pseudo_tags[:, i]
            return np.concatenate((labeled_tags, U_tags_i[self.mask_U[:, i] == 1]))
        return labeled_tags

    def fit(self):
        """Run the co-training loop until no tree receives new data.

        For every tree the out-of-bag error ``e`` of its concomitant
        ensemble is estimated. When it improves on the previous round,
        unlabeled samples are drawn up to a weight budget of
        ``e_prev * W_prev / e`` and those whose confidence exceeds
        ``sigma`` are pseudo-labeled for that tree. Trees that received
        data are retrained; the loop stops when none did.
        """

        e_anterior = [0.5 for _ in range(self.n)]
        W_anterior = [min(0.1 * len(self.L), 100) for _ in range(self.n)]

        new_data = True

        while new_data:

            tree_changes = [False for _ in range(self.n)]

            for i, hi in self.ensemble.items():

                e = self.calculate_e(hi)
                # Weight gathered by THIS tree in THIS round; it must start at
                # zero for every tree so that a tree which did not improve
                # does not inherit the weight of the previously visited tree.
                W = 0

                if e < e_anterior[i]:

                    # Floor the error only for the division: a perfect
                    # estimate (e == 0) would otherwise divide by zero.
                    e_safe = e if e > 0 else 0.000001
                    # A confidence is at most 1 and U only holds U.shape[0]
                    # distinct rows, so a larger budget cannot bring in new
                    # samples; the cap also keeps the e -> 0 case bounded.
                    w_max = min((e_anterior[i] * W_anterior[i]) / e_safe, self.U.shape[0])

                    for u in self.subsample(hi, w_max):
                        confidence, selected_class = self.confidence(hi, self.U[u, :])

                        if confidence > self.sigma:
                            tree_changes[i] = True
                            self.mask_U[u, i] = 1
                            self.U_pseudo_tags[u, i] = selected_class
                            W += confidence

                # The raw error is stored (not the floored one) so that a
                # tree already at e == 0 cannot "improve" forever.
                e_anterior[i] = e
                W_anterior[i] = W

            new_data = self.retrain_ensemble(np.array(tree_changes))

    def retrain_ensemble(self, tree_changes: np.ndarray) -> bool:
        """Retrains from scratch those trees that have
        received new data.

        Parameters
        ----------
        tree_changes : boolean numpy array, one entry per tree in the
                       iteration order of ``self.ensemble``

        Returns
        -------
        bool
            True if one or more trees have changed, False if not.
        """

        tree_changes = np.asarray(tree_changes, dtype=bool)
        if not tree_changes.any():
            return False

        for i, changed in zip(list(self.ensemble), tree_changes):
            if changed:
                self.ensemble[i] = self.ensemble[i].fit(
                    self.get_rows_training(i), self.get_tags_training(i)
                )

        return True

    def subsample(self, hi: "DecisionTreeClassifier", Wmax: float) -> np.ndarray:
        """Samples from U uniformly at random until
        the sum of the sample weights reaches Wmax.
        Bootstraping is applied (rows are drawn with replacement).

        Parameters
        ----------
        hi : DecisionTreeClassifier
        Wmax: float

        Returns
        -------
        np.array
            Array containing the index of the chosen
            samples from U (empty when U has no rows
            or Wmax is not positive)
        """

        W = 0
        U_subsampled = []
        n_unlabeled = self.U.shape[0]

        # Nothing can be drawn from an empty pool (np.random.choice would raise).
        if n_unlabeled == 0:
            return np.array(U_subsampled, dtype=int)

        while W < Wmax:
            rand_row = np.random.choice(a=np.arange(start=0, stop=n_unlabeled))
            W += self.confidence(hi, self.U[rand_row, :])[0]
            U_subsampled.append(rand_row)

        return np.array(U_subsampled, dtype=int)

    def calculate_e(self, hi: "DecisionTreeClassifier") -> float:
        """Calculates the Out of Bag Error of the concomitant
        ensemble of hi for the whole labeled data.

        A tree votes on a labeled sample only when that row was not part of
        its bootstrap sample according to ``mask_L``.

        Parameters
        ----------
        hi : DecisionTreeClassifier

        Returns
        -------
        float
            OOBE; 1 when no labeled sample received any vote.
        """

        error_sum = total_samples_voted = 0

        for row, (sample, tag) in enumerate(zip(self.L, self.y)):

            n_votes = n_hits = 0

            for i, tree in self.ensemble.items():
                # Skip hi itself and every tree that saw this row in training.
                # The mask is indexed by row number: ``sample in array`` on a
                # 2-D numpy array is an element-wise test, not row membership.
                if tree is hi or self.mask_L[row, i] == 1:
                    continue

                if tree.predict([sample])[0] == tag:
                    n_hits += 1
                n_votes += 1

            if n_votes > 0:
                error_sum += 1 - (n_hits / n_votes)
                total_samples_voted += 1

        if total_samples_voted > 0:
            return error_sum / total_samples_voted

        return 1  # No data was voted (Raise Exception instead)

    def confidence(self, hi: "DecisionTreeClassifier", sample: np.ndarray) -> tuple:
        """Calculates the number of coincidences during
        prediction of the hi concomitant ensemble for a
        data sample.

        Parameters
        ----------
        hi : DecisionTreeClassifier
        sample: sample's features array

        Returns
        -------
        tuple (float, label)
            float: confidence for the sample (share of the other
                   trees that voted for the winning class)
            label: most agreed class, taken from ``self.classes``
                   (the first one in ``self.classes`` on ties)

        Raises
        ------
        ZeroDivisionError
            If the ensemble holds a single tree (no concomitant ensemble).
        """

        count = dict.fromkeys(self.classes, 0)
        for tree in self.ensemble.values():
            if tree is not hi:
                count[tree.predict([sample])[0]] += 1

        # Return the class label itself, not its position inside `classes`.
        most_agreed_class = max(count, key=count.get)
        return count[most_agreed_class] / (len(self.ensemble) - 1), most_agreed_class

### **DATA**

In [3]:
# Load Iris and split it into a labeled pool (L, L_tags) and an unlabeled
# pool (U); the true tags of U are kept in U_tags.
iris = load_iris()

total_data, total_tags = np.array(iris.data), np.array(iris.target)

n_class_types = len(iris.target_names)

L, U, L_tags, U_tags = train_test_split(
    total_data,
    total_tags,
    test_size=0.4,
)

### **TEST**

In [6]:
# Build a 5-tree ensemble with a 0.75 confidence threshold, try a standalone
# subsample call (its result is discarded) and run the co-training loop.
coforest = Co_Forest(L, L_tags, U, n=5, sigma=0.75, classes=[0, 1, 2])
first_tree = coforest.ensemble[0]
coforest.subsample(first_tree, 50)
coforest.fit()

In [4]:
# L = np.array([
#     [1, 1, 1, 2],
#     [2, 2, 2, 2],
#     [3, 3, 3, 3],
#     [4, 4, 4, 4],
#     [5, 5, 5, 5],
#     [6, 6, 6, 6]
# ])

# y = np.array([1,2,3,4,5,6])

# U = np.array([
#     [7, 7, 7, 7],
#     [8, 8, 8, 8],
#     [9, 9, 9,9]
# ])

# coforest = Co_Forest(L, y, U, 5, 0.75, [1,2,3,4,5,6])
# coforest.subsample(coforest.ensemble[0], 50)
# coforest.fit()