In [40]:
import numpy as np
import numbers
from copy import deepcopy
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_digits, load_iris, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import math

In [48]:
class DemocraticCo:  

    def __init__(self, base_cls, random_state=None):
        """
        Constructor. Creates the co-training instance.
        
        Parameters
        ----------
        base_cls:
            Classifiers
        random_state:
            Random object or seed
        """
        
        self.n = len(base_cls)
        self.classes = []
        self.rd = self.check_random_state(random_state)
        self.classifiers = {i: base_cls[i] for i in range(self.n)}


    def fit(self, L, y, U):
        """
        Trains the Democratic-Co.

        Parameters
        ----------
        L: np.array
            Labeled data used for training
        y: np.array
            Labeled data tags used for training
        U: np.array
            Unlabeled data used for training
        """

        classes = np.unique(y)
        self.classes = classes

        e = [0.0 for i in range(self.n)]
        q = [0.0 for i in range(self.n)]
        e_prime = [0.0 for i in range(self.n)]
        q_prime = [0.0 for i in range(self.n)]

        cls_updates = [(list(L), list(y)) for i in range(self.n)]
        cls_proposed_updates = [([], []) for i in range(self.n)]
        changes = True

        while changes:

            hyps = deepcopy(self.classifiers)
            cls_changes = np.array([False for i in range(self.n)])

            for i in range(self.n):
                X_train, y_train = cls_updates[i]
                hyps[i] = hyps[i].fit(X_train, y_train)

            matrix = [[set() for j in range(len(self.classes))] for x in range(len(U))]

            for x in range(len(U)):
                for i in range(self.n):
                    j = hyps[i].predict([U[x]])[0]
                    c = matrix[x][j]
                    c.add(i)
                    matrix[x][j] = c

            y_U = []
            for x in matrix:
                x = [len(c) for c in x]
                n_max = np.amax(x)
                y_U.append(x.index(n_max))

            # Choose which exs to propose for labeling
            w = []
            for i in range(self.n):
                li, hi = self.confidence_interval(hyps[i], L, y)
                w.append((li + hi) / 2)
                
            cls_proposed_updates = [([], []) for i in range(self.n)]

            for x in range(len(U)):

                voted_tag = y_U[x]
                index = np.where(classes == voted_tag)[0][0]
                cls_agree_tag = matrix[x][index]

                a1 = 0  #sumatorio de los que coinciden por sus pesos
                for cls in cls_agree_tag:
                    a1 += w[cls]

                a2 = 0
                for tag in classes:
                    if tag != voted_tag:
                        index = np.where(classes == tag)[0][0]

                        weight_tag = 0
                        for cls in matrix[x][index]:
                            weight_tag += w[cls]
                        a2 = max(a2, weight_tag)

                if a1 > a2:
                    for cls in (set(self.classifiers.keys()) - cls_agree_tag):
                        Li_prime, y_Li_prime = cls_proposed_updates[i]
                        Li_prime.append(U[x])
                        y_Li_prime.append(y_U[x])
                        cls_proposed_updates[i] = (Li_prime, y_Li_prime)


            # Estimate if adding this is better
            for i in range(self.n):

                l = []
                for j in range(self.n):
                    Lj, y_Lj = cls_updates[j]
                    lj, hj = self.confidence_interval(hyps[j], Lj, y_Lj)
                    l.append(lj)

                Li, y_Li = cls_updates[i]
                Li_prime, y_Li_prime = cls_proposed_updates[i]

                if len(Li_prime) > 0:
                    Li_union_Li_prime = Li + Li_prime
                else:
                    Li_union_Li_prime = Li

                q[i] = len(Li) * (1 - 2 * (e[i] / len(Li))) ** 2
                e_prime[i] = (1 - np.mean(l)) * len(Li_prime)
                q_prime[i] = len(Li_union_Li_prime) * ( 1 - (2*(e[i] + e_prime[i]) / len(Li_union_Li_prime)) ) ** 2

                if q_prime[i] > q[i]:
                    cls_changes[i] = True
                    cls_updates[i] = (Li_union_Li_prime, y_Li + y_Li_prime)
                    e[i] = e[i] + e_prime[i]

        
            if cls_changes.sum() == 0:
                changes = False

    def confidence_interval(self, cls, L, y):
        
        y_pred = cls.predict(L)
        hits = (y_pred == y)

        n_total = len(y)
        n_hits = hits.sum()
        p_hat = n_hits / n_total
        margin = 1.96 * math.sqrt( p_hat * (1 - p_hat) / n_total)

        return (p_hat - margin, p_hat + margin)


    def check_random_state(self, seed=None):
        """
        Turn seed into a np.random.RandomState instance.
        Source: SkLearn

        Parameters
        ----------
        seed : None, int or instance of RandomState
            If None, return the RandomState singleton.
            If int, return a new RandomState seeded with seed.
            If RandomState instance, return it.

        Returns
        -------
        numpy.random.RandomState
            The random state object based on seed parameter.
        """

        if seed is None or seed is np.random:
            return np.random.mtrand._rand

        if isinstance(seed, numbers.Integral):
            return np.random.RandomState(seed)

        if isinstance(seed, np.random.RandomState):
            return seed

In [30]:
h_0 = DecisionTreeClassifier()
h_1 = GaussianNB()
h_2 = KNeighborsClassifier()

dataset = load_wine()

X = np.array(dataset.data)
y = np.array(dataset.target)

rd = np.random.RandomState(5)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=rd)

for train_index, test_index in skf.split(X, y):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        L_train, U_train, Ly_train, Uy_train = train_test_split(X_train, y_train, test_size=0.8, random_state=rd, stratify=y_train)

In [49]:
democratic_co = DemocraticCo([h_0, h_1, h_2], random_state=5)
democratic_co.fit(L_train, Ly_train, U_train)

KeyboardInterrupt: 