In [1]:
import numpy as np
from scipy.io import arff
import pandas as pd
import time
import skmultilearn.adapt as skmad
from skmultilearn.dataset import load_dataset
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.metrics import hamming_loss,coverage_error,label_ranking_loss,label_ranking_average_precision_score
from scipy.special import softmax
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier

In [6]:
#Load dataset

X, y, feature_names, label_names = load_dataset('yeast', 'undivided')
X = X.toarray()
y = y.toarray()

classifiers = []
metrics = ['Hamming loss','Coverage','Ranking loss','Average precision']
mlknni_hl=[]
mlknni_ce=[]
mlknni_rl=[]
mlknni_ap=[]

mlknn_soft_hl=[]
mlknn_soft_ce=[]
mlknn_soft_rl=[]
mlknn_soft_ap=[]

mlknns_hl=[]
mlknns_ce=[]
mlknns_rl=[]
mlknns_ap=[]

br_hl=[]
br_ce=[]
br_rl=[]
br_ap=[]

nfolds = 10 # Numero de k-folds
kf = KFold(n_splits=nfolds)
val_k = [8,9,10,11,12]

yeast:undivided - exists, not redownloading


In [3]:
class MLkNN:

    def __init__(self,k,s=1.0,smax=False):
        self.k = k
        self.s = s
        self.smax = smax
        
    def fit(self,X_train,Y_train):
        
        N_train,N_labels = Y_train.shape
        
        # Calculo das probabilidades a priori
        prior_proba = np.zeros((2,N_labels))
        
        for l in np.arange(N_labels):
            prior_proba[1,l] = (self.s + sum(Y_train[:,l]))/(self.s*2+N_train)
            prior_proba[0,l] = 1 - prior_proba[1,l]
            
        # Calculo das probabilidades a posteriori
        
        cond_proba = np.zeros((2,self.k+1,N_labels))
        
        # Calculo das distancias
        # dist(i,j) = distancia entre a i-esima amostra de teste e a j-esima amostra de treinamento
        #dist = np.zeros((N_train,N_train))       ## Considera o elemento como vizinho a ele mesmo
        dist = np.ones((N_train,N_train))*np.Inf  ## Desconsidera o  elemento como vizinho a ele mesmo

        for i in np.arange(N_train):
            for j in np.arange(i+1,N_train):
                # Distancia euclidiana
                dist[i,j] = np.sqrt(np.sum((X_train[i] - X_train[j])**2))
                dist[j,i] = dist[i,j]

        ind_neigh = np.argsort(dist,axis=1)[:,:self.k] # indices dos k vizinhos em ordem crescente de distancia
        
        for l in np.arange(N_labels):
            
            c = np.zeros((2,self.k+1))
            
            for i in np.arange(N_train):
                delta = sum(Y_train[ind_neigh[i],l])
                if(Y_train[i,l] == 1):
                    c[1,delta] = c[1,delta] + 1
                else:
                    c[0,delta] = c[0,delta] + 1
            
            if(self.smax == True):
                c = softmax(c,axis=1)
            
            for j in np.arange(self.k+1):
                cond_proba[1,j,l] = (self.s + c[1,j])/(self.s*(self.k+1) + sum(c[1,:]))
                cond_proba[0,j,l] = (self.s + c[0,j])/(self.s*(self.k+1) + sum(c[0,:]))
                
        self.X_train = X_train
        self.Y_train = Y_train
        self.N_labels = N_labels
        self.prior_proba = prior_proba
        self.cond_proba = cond_proba
            
    def predict(self,X_test):
        
        X_train = self.X_train
        Y_train = self.Y_train
        N_labels = self.N_labels
        N_train = X_train.shape[0]
        N_test = X_test.shape[0]
        
        Y_test = np.zeros((N_test,N_labels))
        post_proba = np.zeros((N_test,N_labels))
        
        prior_proba = self.prior_proba
        cond_proba = self.cond_proba
        
         # Calculo das distancias
        # dist(i,j) = distancia entre a i-esima amostra de teste e a j-esima amostra de treinamento
        dist = np.zeros((N_test,N_train))

        for i in np.arange(N_test):
            for j in np.arange(N_train):
                # Distancia euclidiana
                dist[i,j] = np.sqrt(np.sum((X_test[i] - X_train[j])**2))

        ind_neigh = np.argsort(dist,axis=1)[:,:self.k] # indices dos k vizinhos em ordem crescente de distancia
        
        for t in np.arange(N_test):
            
            for l in np.arange(N_labels):
                c = sum(Y_train[ind_neigh[t],l])
                proba_1 = prior_proba[1,l]*cond_proba[1,c,l]
                proba_0 = prior_proba[0,l]*cond_proba[0,c,l]
                if(proba_1 > proba_0):
                    Y_test[t,l] = 1
                post_proba[t,l] = proba_1/(proba_1+proba_0)
        
        return Y_test,post_proba

In [7]:
#ML-kNN implementado

for k in val_k:
    test_hl = np.zeros(nfolds)
    test_ce = np.zeros(nfolds)
    test_rl = np.zeros(nfolds)
    test_ap = np.zeros(nfolds)

    i=0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        classifier = MLkNN(k)

        # train
        classifier.fit(X_train, y_train)

        # predict
        pred,proba = classifier.predict(X_test)

        test_hl[i] = hamming_loss(y_test,pred)
        test_ce[i] = coverage_error(y_test,proba)
        test_rl[i] = label_ranking_loss(y_test,proba)
        test_ap[i] = label_ranking_average_precision_score(y_test,proba)
        i = i+1
    
    classifiers.append('ML-kNN impl.')
    mlknni_hl.append(np.mean(test_hl))
    mlknni_ce.append(np.mean(test_ce))
    mlknni_rl.append(np.mean(test_rl))
    mlknni_ap.append(np.mean(test_ap))


In [15]:
#ML-kNN com softmax

for k in val_k:
    test_hl = np.zeros(nfolds)
    test_ce = np.zeros(nfolds)
    test_rl = np.zeros(nfolds)
    test_ap = np.zeros(nfolds)

    i=0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        classifier2 = MLkNN(k,smax=True)

        # train
        classifier2.fit(X_train, y_train)

        # predict
        pred,proba = classifier2.predict(X_test)

        test_hl[i] = hamming_loss(y_test,pred)
        test_ce[i] = coverage_error(y_test,proba)
        test_rl[i] = label_ranking_loss(y_test,proba)
        test_ap[i] = label_ranking_average_precision_score(y_test,proba)
        i = i+1
    
    classifiers.append('ML-kNN c/ softmax')
    mlknn_soft_hl.append(np.mean(test_hl))
    mlknn_soft_ce.append(np.mean(test_ce))
    mlknn_soft_rl.append(np.mean(test_rl))
    mlknn_soft_ap.append(np.mean(test_ap))

In [17]:
#ML-kNN scikit

for k in val_k:
    test_hl = np.zeros(nfolds)
    test_ce = np.zeros(nfolds)
    test_rl = np.zeros(nfolds)
    test_ap = np.zeros(nfolds)

    i=0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        classifier3 = skmad.MLkNN(k=12)

        # train
        classifier3.fit(X_train, y_train)

        # predict
        pred = classifier3.predict(X_test)
        proba = classifier3.predict_proba(X_test).toarray()

        test_hl[i] = hamming_loss(y_test,pred)
        test_ce[i] = coverage_error(y_test,proba)
        test_rl[i] = label_ranking_loss(y_test,proba)
        test_ap[i] = label_ranking_average_precision_score(y_test,proba)
        i = i+1

    classifiers.append('ML-kNN scikit')
    mlknns_hl.append(np.mean(test_hl))
    mlknns_ce.append(np.mean(test_ce))
    mlknns_rl.append(np.mean(test_rl))
    mlknns_ap.append(np.mean(test_ap))

In [19]:
#Binary Relevance

for k in val_k:
    test_hl = np.zeros(nfolds)
    test_ce = np.zeros(nfolds)
    test_rl = np.zeros(nfolds)
    test_ap = np.zeros(nfolds)

    i=0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        classifier4 = BinaryRelevance(classifier = KNeighborsClassifier(n_neighbors=k))

        # train
        classifier4.fit(X_train, y_train)

        # predict
        pred = classifier4.predict(X_test)
        proba = classifier4.predict_proba(X_test).toarray()

        test_hl[i] = hamming_loss(y_test,pred)
        test_ce[i] = coverage_error(y_test,proba)
        test_rl[i] = label_ranking_loss(y_test,proba)
        test_ap[i] = label_ranking_average_precision_score(y_test,proba)
        i = i+1

    classifiers.append('Binary Relevance')
    br_hl.append(np.mean(test_hl))
    br_ce.append(np.mean(test_ce))
    br_rl.append(np.mean(test_rl))
    br_ap.append(np.mean(test_ap))

In [14]:
# Mostrar os resultados ML-kNN implementado

results = [mlknni_hl, mlknni_ce, mlknni_rl, mlknni_ap]
table = dict([(metrics[i],results[i]) for i in range(4)])
  
# Creates pandas DataFrame. 
df = pd.DataFrame(table, columns=metrics,index=val_k) 
  
# print the data 
df 

Unnamed: 0,Hamming loss,Coverage,Ranking loss,Average precision
8,0.196171,7.285522,0.168446,0.763081
9,0.194783,7.285098,0.167474,0.764379
10,0.194456,7.2959,0.167811,0.764695
11,0.195019,7.261966,0.16722,0.765038
12,0.19499,7.270282,0.168083,0.764757


In [16]:
# Mostrar os resultados ML-kNN com softmax

results = [mlknn_soft_hl, mlknn_soft_ce, mlknn_soft_rl, mlknn_soft_ap]
table = dict([(metrics[i],results[i]) for i in range(4)])
  
# Creates pandas DataFrame. 
df = pd.DataFrame(table, columns=metrics,index=val_k) 
  
# print the data 
df 

Unnamed: 0,Hamming loss,Coverage,Ranking loss,Average precision
8,0.225251,7.699249,0.199005,0.719555
9,0.227027,7.79153,0.202676,0.716133
10,0.225282,7.803474,0.20269,0.716411
11,0.227021,7.792781,0.203564,0.715916
12,0.225991,7.828816,0.204842,0.715033


In [18]:
# Mostrar os resultados ML-kNN scikit

results = [mlknns_hl, mlknns_ce, mlknns_rl, mlknns_ap]
table = dict([(metrics[i],results[i]) for i in range(4)])
  
# Creates pandas DataFrame. 
df = pd.DataFrame(table, columns=metrics,index=val_k) 
  
# print the data 
df 

Unnamed: 0,Hamming loss,Coverage,Ranking loss,Average precision
8,0.198889,8.412174,0.224925,0.701845
9,0.198889,8.412174,0.224925,0.701845
10,0.198889,8.412174,0.224925,0.701845
11,0.198889,8.412174,0.224925,0.701845
12,0.198889,8.412174,0.224925,0.701845


In [20]:
# Mostrar os resultados Binary Relevance

results = [br_hl, br_ce, br_rl, br_ap]
table = dict([(metrics[i],results[i]) for i in range(4)])
  
# Creates pandas DataFrame. 
df = pd.DataFrame(table, columns=metrics,index=val_k) 
  
# print the data 
df 

Unnamed: 0,Hamming loss,Coverage,Ranking loss,Average precision
8,0.199301,8.181741,0.220685,0.735128
9,0.199009,8.091099,0.214814,0.737824
10,0.196908,8.010452,0.2091,0.74168
11,0.196142,7.951296,0.204955,0.744228
12,0.19617,7.878073,0.201139,0.747198
