# 📈 **TRI-TRAINING: GRAPHS** 

In [33]:
from sklearn.datasets import load_iris, load_digits, load_wine, load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.linear_model import LinearRegression
import numpy as np
import numbers
import matplotlib.pyplot as plt
import pandas as pd
from copy import deepcopy
import csv
from itertools import cycle
import time
import os
import sys
from sklearn.metrics import recall_score, precision_score, auc, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

src_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(src_path)
from classifiers.TriTrainingClassifier import TriTraining
from notebooks.graphs_utils import *

# **CLASS**

Fit method needs to be overwritten since graphs are generated, so inheritance is used.

In [34]:

import numpy as np
import numbers
from math import floor, ceil

class TriTraining_graphs(TriTraining):  

    def fit(self, L, y, U):
        """
        Trains the tri-training ensemble using Zhi-Hua Zhou
        Algorithm. Generates graphs to show how score
        evolves during training.

        Parameters
        ----------
        L: np.array
            Labeled data used for training
        y: np.array
            Labeled data tags used for training
        U: np.array
            Unlabeled data used for training
        """

        self.initialize_classifiers(L, y)
        self.classes = np.unique(y)

        previous_e = [0.5 for i in range(self.n)]
        previous_l = [0.0 for i in range(self.n)]
        e = [0.0 for i in range(self.n)]

        new_data = True

        while new_data:

            cls_changes = np.array([False for i in range(self.n)])
            cls_pseudo_updates = [() for i in range(self.n)]

            for i in range(self.n):

                e[i] = self.measure_error(i, L, y)

                if e[i] < previous_e[i]:
                    cls_pseudo_updates[i] = self.create_pseudolabeled_set(i, U)

                    if previous_l[i] == 0:
                        previous_l[i] = floor((e[i] / (previous_e[i]-e[i])) + 1)

                    L_i_size = cls_pseudo_updates[i][0].shape[0]

                    if previous_l[i] < L_i_size:

                        if e[i] * L_i_size < previous_e[i] * previous_l[i]:
                            cls_changes[i] = True
                        
                        elif previous_l[i] > (e[i] / (previous_e[i] - e[i])):

                            L_index = self.rd.choice(L_i_size, ceil((previous_e[i] * previous_l[i] / e[i]) - 1))
                            cls_pseudo_updates[i] = (cls_pseudo_updates[i][0][L_index, :], cls_pseudo_updates[i][1][L_index])
                            cls_changes[i] = True

            if cls_changes.sum() == 0:
                new_data = False

            else:

                for i in np.fromiter(self.classifiers.keys(), dtype=int)[cls_changes]:

                    X_train = np.concatenate((L, cls_pseudo_updates[i][0]))
                    y_train = np.concatenate((y, cls_pseudo_updates[i][1]))
                    self.classifiers[i] = self.classifiers[i].fit(X_train, y_train)

                    previous_e[i] = e[i]
                    previous_l[i] = cls_pseudo_updates[i][0].shape[0] #Tamaño de Li anterior

# **GRAPHS**

In [35]:
h_0 = DecisionTreeClassifier()
h_1 = GaussianNB()
h_2 = KNeighborsClassifier()
rd = np.random.RandomState(5)
t_t_g = TriTraining_graphs(h_0, h_1, h_2, rd)

### **Contra KEEL** 🦈🍣

In [36]:
l = []

for i in range(1, 11):

    file_train = "wine/wine-ssl10-10-{}tra.csv".format(i)
    file_test = "wine/wine-ssl10-10-{}tst.csv".format(i)

    # file_train = "iris/{}f_tra.csv".format(i)
    # file_test = "iris/{}f_tst.csv".format(i)

    L, L_tags, U = extract_training_data(file_train)
    X_test, y_test = extract_test_data(file_test)

    t_t_g = TriTraining_graphs(h_0, h_1, h_2, rd)
    t_t_g.fit(L, L_tags, U)
    l.append(t_t_g.score(X_test, y_test))

print(l)
print(np.array(l).mean())

[0.8333333333333334, 0.7777777777777778, 0.9444444444444444, 0.6666666666666666, 0.9411764705882353, 0.8823529411764706, 0.8888888888888888, 0.8333333333333334, 0.6666666666666666, 1.0]
0.8434640522875817
