# **🐍 CO-FOREST PYTONIZADO** 

##### **Autora: Patricia Hernando Fernández**

*DEBUGGEAR:*

* Se puede "no recordar" L' it porque se descarta (hacerlo variable local de entrenamiento) si se prefiere y se ahorra bastante memoria

* Si todos los oob error de la primera ronda son > 0.5, se ignora el entrenamiento. Experimentar con los parámetros. 
* ¿Qué hacer si no se vota? -> Sustituído por nan. Creo que por porcentaje se arregla solo si aumenta mucho el número de datos

* Problema: WMax cuando el error de la iteración es 0

* Se ha limitado a un Wmax máximo de 100, pero habría que experimentar

---------------------------
### **IMPORTS**

In [37]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import numbers

### **CLASS**

In [38]:
class noVotesException(Exception):

    def __init__(self, message="No tree voted"):
        self.message = message
        super().__init__(self.message)

In [39]:
class Co_Forest:
    """"
    Class used to generate a semi-supervised Random Forest.
    ...

    Attributes
    ----------
    ...

    Methods
    -------
    ...
    """

    def __init__(self, L, y, U, n, theta, classes, random_state=None, max_features='log2'):
        """"
        Constructor. Creates and trains the Co-Forest.
        
        Parameters
        ----------
        L: np.array
            labeled data used for training
        y: np.array
            tags of the labeled data used for training
        U: np.array
            -1 data used for training
        n: int
            number of trees in the ensemble
        theta: float
            tolerance
        classes: np.array
            names of the classes that can be predicted
        random_state: .....
        max_features: string
            log2, sqrt, None
        """

        self.n = n
        self.theta = theta
        self.classes = classes

        self.L = L
        self.y = y
        self.U = U

        self.random_state = self.check_random_state(random_state)

        #Used to re-train trees from scratch
        self.U_pseudo_tags = ( np.ones(shape=((self.U.shape[0]), self.n), dtype=int) * -1 )

        self.mask_L = np.zeros(shape=((self.L.shape[0]), self.n), dtype=int, order='C') 
        self.mask_U = np.zeros(shape=((self.U.shape[0]), self.n), dtype=int, order='C')

        self.ensemble = self.create_trees(max_features) 

        #self.fit()


    def create_trees(self, max_features) -> dict:
        """Generates a dict -> {key: int, value: Tree}

        Parameters
        ----------
        max_features: number of features to consider 
                      when looking for the best split
                      'sqrt', 'log2', None

        Returns
        -------
        dict
            dict containing the trees of co-forest
        """

        ensemble = {}

        for i in range(self.n):

            #IMPORTANT: bootstrapping only 50% of L size (few data to let oob error work)
            rand_rows = self.random_state.choice(a = np.arange(start=0, stop=self.L.shape[0]), replace = True, size=(int(0.5*self.L.shape[0])) )
            #np.random.choice(a = np.arange(start=0, stop=self.L.shape[0]), replace = True, size=(int(0.5*self.L.shape[0])) )
            self.mask_L[rand_rows, i] = 1
            h = DecisionTreeClassifier(max_features=max_features, random_state=self.random_state)
            ensemble[i] = h.fit(self.L[rand_rows, :], self.y[rand_rows])

        return ensemble


    def get_rows_training(self, i: int, both=True) -> np.array:
        """Returns X used for training a tree.

        Parameters
        ----------
        i: int
            index of the tree
        both: bool
            True if L union U is desired, False if L.

        Returns
        -------
        np.array
            array containing data used during training.
        """
        if both:
            return np.concatenate( (self.L[self.mask_L[:, i] == 1], self.U[self.mask_U[:, i] == 1]) )
        else:
            return self.L[self.mask_L[:, i] == 1]


    def get_tags_training(self, i, both = True):
        """Returns y used for training a tree.

        Parameters
        ----------
        i: int
            index of the tree
        both: bool
            True if L union U is desired, False if L.

        Returns
        -------
        np.array
            array containing tags of the data used.
        """
        if both:
            U_tags_i = self.U_pseudo_tags[:, i]
            return np.concatenate( (self.y[self.mask_L[:, i] == 1], U_tags_i[self.mask_U[:, i] == 1]) )
        
        else:
            return self.y[self.mask_L[:, i] == 1]


    def fit(self):
        """Fits the ensemble using both labeled and
        pseudo-labeled data.
        """

        previous_e = [0.5 for i in range(self.n)]
        previous_W = [min(0.1*len(self.L), 100) for i in range(self.n)]
        new_data = True
        t = 0
        
        while new_data:

            t += 1
            # Previous pseudo-labels are discarded on each iteration.
            self.mask_U = np.zeros(shape=((self.U.shape[0]), self.n), dtype=int, order='C')
            tree_changes = [False for i in range(self.n)]

            for i, hi in self.ensemble.items():

                e = self.concomitant_oob_error(hi)

                if e < previous_e[i]:

                    if e == 0:
                        Wmax = self.theta * self.U.shape[0]
                    else:
                        Wmax = min(self.theta * self.U.shape[0], ((previous_e[i]*previous_W[i])/e) )

                    U_subsampled = self.subsample(hi, Wmax) 

                    #Mas pequeñitos y voy probando sobre el 10% etiquetadas
                    #Gráficas -> errores por iteración

                    #pandas o seaborn (capa encima matplotlib) y te lo hace
                    #validaci

                    W = 0

                    for u in U_subsampled:
                        concomitant_confidence, selected_class = self.concomitant_confidence(hi, self.U[u, :])

                        if concomitant_confidence > self.theta:
                            tree_changes[i] = True
                            self.mask_U[u, i] = 1
                            self.U_pseudo_tags[u, i] = selected_class
                            W += concomitant_confidence

                    previous_W[i] = W #Sacar fuera del if? Y a qué valor si no se entra? 0 no puede ser...

                previous_e[i] = e

            new_data = self.retrain_ensemble(np.array(tree_changes))


    def retrain_ensemble(self, tree_changes: np.array) -> bool:
        """Retrains from scratch those trees that have 
        received new data.

        Parameters
        ----------
        tree_changes : boolean numpy array

        Returns
        -------
        bool
            True if one or more trees have chanded, False if not.
        """

        if tree_changes.sum() == 0:
            return False

        for i in np.fromiter(self.ensemble.keys(), dtype=int)[tree_changes]:
            self.ensemble[i] = DecisionTreeClassifier(max_features='log2', random_state=self.random_state).fit(self.get_rows_training(i), self.get_tags_training(i))
        
        return True
        

    def subsample(self, hi: DecisionTreeClassifier, Wmax: float) -> np.array:
        """Samples from U uniformly at random until 
        the sum of the sample weights reaches Wmax.
        Bootstraping is applied.

        Parameters
        ----------
        hi : DecisionTreeClassifier
        Wmax: float

        Returns
        -------
        np.array
            Array containing the index of the chosen
            samples from U
        """

        W = 0
        U_subsampled = []

        while (W < Wmax):

            rand_row = self.random_state.choice(a = np.arange(start=0, stop=self.U.shape[0]))
            W += self.concomitant_confidence(hi, self.U[rand_row, :])[0]
            U_subsampled.append(rand_row)

        return np.array(U_subsampled)

        
    def concomitant_oob_error(self, hi: DecisionTreeClassifier) -> float:
        """Calculates the Out of Bag Error of the concomitant 
        ensemble of hi for the whole labeled data.

        Parameters
        ----------
        hi : DecisionTreeClassifier

        Returns
        -------
        float
            OOBE if trees voted, nan if not
        """

        errors = []

        for sample, tag in zip(self.L, self.y):
            
            n_votes = n_hits = 0 

            for i, tree in self.ensemble.items():
                if tree != hi and sample not in self.get_rows_training(i, both = False):

                    if tree.predict([sample])[0] == tag:
                        n_hits += 1
                    n_votes +=1

            if (n_votes > 0):
                errors.append(1 - (n_hits/n_votes))

        return np.mean(a=errors)

    def concomitant_confidence(self, hi: DecisionTreeClassifier, sample: np.array) -> tuple:
        """Calculates the number of coincidences during
        prediction of the hi concomitant ensemble for a
        data sample.

        Parameters
        ----------
        hi : DecisionTreeClassifier
        sample: sample's features array

        Returns
        -------
        tuple (float, int)
            float: confidence for the sample
            int: most agreed class
        """

        count = { **dict.fromkeys([i for i in self.classes], 0)} 
        for i in (tree.predict([sample])[0] for tree in self.ensemble.values() if tree != hi):
            count[i]+= 1

        max_agreement = max(count.values())
        most_agreed_class = list(count.values()).index(max_agreement)
        return max_agreement/(len(self.ensemble) -1), most_agreed_class


    def single_predict(self, sample: np.array): 
        """Returns the class predicted by coforest
        for a given sample. Majority voting is used.

        Parameters
        ----------
        sample: np_array
            sample to predict

        Returns
        -------
        np.array:
            label predicted by coforest.
        """

        count = { **dict.fromkeys([i for i in self.classes], 0)} 
        for i in (tree.predict([sample])[0] for tree in self.ensemble.values()):
            count[i]+= 1

        max_agreement = max(count.values())
        return self.classes[list(count.values()).index(max_agreement)]


    def predict(self, samples: np.array) -> np.array:
        """Returns the labels predicted by the coforest
        for a given data.

        Parameters
        ----------
        samples: np_array
            samples to predict

        Returns
        -------
        np.array:
            labels predicted by the coforest.
        """
        
        samples = (lambda x: np.expand_dims(x, axis=0) if x.ndim == 1 else x)(samples)
        return np.array([self.single_predict(sample) for sample in samples])


    def score(self, X_test: np.array, y_test: np.array) -> float:
        """Calculates the number of hits by coforest
        given a training set.

        Parameters
        ----------
        X_test: np_array
            Samples used during testing
        y_test: np_array
            Samples' tags

        Returns
        -------
        float:
            percentage of hits.
        """
        y_predictions = self.predict(X_test)
        return np.count_nonzero(y_predictions==y_test)/len(y_test)

    def check_random_state(self, seed):
        """
        Source: SkLearn
        Turn seed into a np.random.RandomState instance.
        Parameters
        ----------
        seed : None, int or instance of RandomState
            If seed is None, return the RandomState singleton used by np.random.
            If seed is an int, return a new RandomState instance seeded with seed.
            If seed is already a RandomState instance, return it.
            Otherwise raise ValueError.
        Returns
        -------
        :class:`numpy:numpy.random.RandomState`
            The random state object based on `seed` parameter.
        """

        if seed is None or seed is np.random:
            return np.random.mtrand._rand
        if isinstance(seed, numbers.Integral):
            return np.random.RandomState(seed)
        if isinstance(seed, np.random.RandomState):
            return seed
        raise ValueError("%r cannot be used to seed a numpy.random.RandomState instance" % seed)

### **DATA**

In [40]:
iris = load_iris()

X_train, X_test, y_train, y_test = train_test_split(np.array(iris.data), np.array(iris.target), test_size=0.15, random_state=np.random.RandomState(5))
L_train, U_train, Ly_train, Uy_train = train_test_split(X_train, y_train, test_size=0.8, random_state=np.random.RandomState(5))

### **TEST**

In [41]:
coforest = Co_Forest(L_train, Ly_train, U_train, 10, 0.75, [0,1,2], 19)

print(coforest.score(X_test, y_test))
coforest.fit()
print(coforest.score(X_test, y_test))

0.8695652173913043
0.8695652173913043
