# 🌳 **CO-FOREST** 🌳

##### **Autora: Patricia Hernando Fernández**


**✍🏽 TO-DO:**

- Revisar las estructuras de datos (DataFrames frente a arrays) y unificar su definición
- Complejidad: convertir las filas en tuplas hasheables y usar conjuntos para comprobar la pertenencia en O(1)
- Espacio: reducir el consumo de memoria, ahora mismo ocupa mucho
- Sustituir el muestreo de cada árbol (train_test_split) por bootstrap (pendiente)
- Documentar
- Hacer más tests

---------------------------
### **IMPORTS**

In [124]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from copy import deepcopy

### **CLASS**

In [125]:
class Co_Forest:
    """Co-Forest style semi-supervised ensemble of decision trees.

    Each tree is trained on a random part of the labelled data. During
    ``fit`` every tree receives pseudo-labelled samples from ``U`` whenever
    the remaining trees (its "concomitant ensemble") agree on a label with
    confidence above ``sigma``.

    ``self.ensemble`` maps a tree id to ``(tree, ignored_rows)``, where
    ``ignored_rows`` holds the labelled rows the tree was NOT trained on.
    """

    # Lower bound used instead of a zero error so that e_prev * W_prev / e
    # is always defined.
    _MIN_ERROR = 0.0000001

    def __init__(self, DL, U, n, sigma, n_class_types):
        """Build the ensemble and immediately run the co-training loop.

        Args:
            DL: tuple ``(L, L_tags)`` of pandas objects with the labelled
                samples and their labels.
            U: pandas DataFrame with the unlabelled samples.
            n: number of trees in the ensemble.
            sigma: confidence threshold a pseudo-label must exceed.
            n_class_types: number of classes (labels ``0..n_class_types-1``).
        """

        self.n = n
        self.sigma = sigma
        self.n_class_types = n_class_types

        self.ensemble = self.create_trees(DL)
        self.fit(DL, U)

    @staticmethod
    def _row_set(rows):
        """Return the rows of a 2-D container as a set of hashable tuples."""
        return {tuple(row) for row in rows}

    def create_trees(self, DL):
        """Train ``self.n`` trees, each on a random split of ``DL``.

        Returns:
            dict mapping tree id to ``(tree, rows held out from training)``.
        """
        ensemble = {}
        L, L_tags = DL

        for i in range(self.n):

            # TODO: replace this random split with bootstrap sampling. The
            # held-out fraction is deliberately large for now.
            X_train_h, X_ignore_h, y_train_h, y_ignore_h = train_test_split(
                L, L_tags, test_size=random.uniform(0.30, 0.50))
            h = DecisionTreeClassifier()
            h.fit(X_train_h.values, y_train_h.values)

            ensemble[i] = (h, X_ignore_h.values)

        return ensemble

    def fit(self, DL, U):
        """Iteratively enlarge each tree's training set with pseudo-labels.

        In every round each tree whose estimated error improved receives the
        samples of ``U`` that the other trees label with confidence above
        ``self.sigma``. The loop stops when no tree receives new data.

        Args:
            DL: tuple ``(L, L_tags)`` of pandas objects.
            U: pandas DataFrame with the unlabelled samples.
        """
        n_labelled = len(DL[0])
        e_anterior = [0.5] * self.n
        W_anterior = [min(0.1 * n_labelled, 100)] * self.n

        new_data = True

        while new_data:

            DLP_i_t = []

            for i in range(self.n):

                hi = self.ensemble[i][0]
                # Clamp BEFORE comparing: a tree that already reached the
                # floor must not count as "improved" again, otherwise the
                # loop may never terminate.
                e = max(self.calculate_e(hi, DL), self._MIN_ERROR)
                LP = []
                LP_tags = []

                if e < e_anterior[i]:

                    W = 0  # per-tree accumulator, reset for every tree
                    UP = self.subsample(
                        U.values, hi, (e_anterior[i] * W_anterior[i]) / e)

                    for x in UP:
                        confidence, selected_class = self.confidence(hi, x)

                        if confidence > self.sigma:
                            LP.append(x)
                            LP_tags.append(selected_class)
                            W += confidence

                    # Only an improving round replaces the reference values.
                    e_anterior[i] = e
                    W_anterior[i] = W

                # Stored even when empty so indexes match tree ids.
                DLP_i_t.append((LP, LP_tags))

            new_data = False

            for i, DLP in enumerate(DLP_i_t):
                if DLP[0]:
                    new_data = True
                    self.retrain_tree(i, DL, DLP)

    def retrain_tree(self, i, DL, DLP):
        """Refit tree ``i`` on its labelled rows plus pseudo-labelled rows.

        Args:
            i: id of the tree in ``self.ensemble``.
            DL: tuple ``(L, L_tags)`` of pandas objects.
            DLP: tuple ``(LP, LP_tags)`` with the pseudo-labelled samples.
        """
        hi, X_ignored = self.ensemble[i]

        # Whole-row membership; rows are compared by value, so duplicates of
        # a held-out row are treated as held out too.
        ignored_rows = self._row_set(X_ignored)

        LP, LP_tags = DLP

        X_train = []
        y_train = []

        for x, tag in zip(DL[0].values, DL[1].values):
            if tuple(x) not in ignored_rows:
                X_train.append(x)
                y_train.append(tag)

        X_train.extend(LP)
        y_train.extend(LP_tags)

        hi.fit(X_train, y_train)

        self.ensemble[i] = (hi, X_ignored)

    def subsample(self, U, hi, Wmax):
        """Draw samples from ``U`` without replacement, uniformly at random.

        Sampling continues until the accumulated confidence of the drawn
        samples reaches ``Wmax`` or ``U`` is exhausted.

        Returns:
            list with the drawn samples.
        """
        W_U = 0
        UP = []
        U_left = list(U)

        while W_U < Wmax and U_left:

            x = U_left.pop(random.randint(0, len(U_left) - 1))

            confidence, _ = self.confidence(hi, x)

            W_U += confidence
            UP.append(x)

        return UP

    def calculate_e(self, hi, DL):
        """Estimate the error of the ensemble that excludes ``hi``.

        For every labelled sample, each other tree that did not train on it
        votes. The sample's error is the fraction of wrong votes, and the
        result is the mean over the samples that received at least one vote.

        Returns:
            The mean error, or ``1`` when no sample received any vote.
        """
        voters = [
            (tree, self._row_set(ignored))
            for tree, ignored in self.ensemble.values()
            if tree is not hi
        ]

        sum_errores = total_dl_voted = 0

        for x, tag in zip(DL[0].values, DL[1].values):

            row = tuple(x)
            n_votes = n_hits = 0

            for tree, ignored_rows in voters:

                if row in ignored_rows:
                    # The voting tree itself predicts, not ``hi``.
                    if tree.predict([x])[0] == tag:
                        n_hits += 1
                    n_votes += 1

            if n_votes > 0:
                sum_errores += 1 - (n_hits / n_votes)
                total_dl_voted += 1

        if total_dl_voted > 0:
            return sum_errores / total_dl_voted

        # TODO: nobody voted; consider raising instead of returning 1.
        return 1

    def confidence(self, hi, x):
        """Return how strongly the trees other than ``hi`` agree on ``x``.

        Returns:
            tuple ``(confidence, most_agreed_class)``: the share of the other
            trees voting for the winning class, and that class. Ties go to
            the lowest class index.

        Raises:
            ZeroDivisionError: if the ensemble holds a single tree.
        """
        count = {label: 0 for label in range(self.n_class_types)}

        for tree, _ in self.ensemble.values():
            if tree is not hi:
                prediction = tree.predict([x])[0]
                # ``get`` tolerates labels outside range(n_class_types).
                count[prediction] = count.get(prediction, 0) + 1

        max_agreement = most_agreed_class = -1

        for label, votes in count.items():
            if votes > max_agreement:
                most_agreed_class = label
                max_agreement = votes

        confidence = max_agreement / (len(self.ensemble) - 1)

        return confidence, most_agreed_class

### **DATA**

In [126]:
# Load the Iris dataset and wrap it in a DataFrame with one column per
# feature plus the target column.
iris = load_iris()

feature_columns = ['sepal length', 'sepal width', 'petal length', 'petal width']
data = pd.DataFrame(iris.data, columns=feature_columns)
data['species'] = iris.target

# Features, labels and the number of distinct classes.
X = data[feature_columns]
y = data['species']
n_class_types = len(iris.target_names)

# 80 % of the samples play the role of labelled data (L); the remaining
# 20 % are used as the unlabelled pool (U), whose true tags are kept aside.
L, U, L_tags, U_tags = train_test_split(X, y, test_size=0.2)
DL = (L, L_tags)

### **TEST**

In [127]:
# Build a 5-tree Co-Forest; pseudo-labels need a confidence above 0.75.
coforest = Co_Forest(
    DL,
    U,
    n=5,
    sigma=0.75,
    n_class_types=n_class_types,
)