In [86]:
import numpy as np
from scipy.io import arff
import pandas as pd
import time
from sklearn.metrics import hamming_loss,coverage_error,label_ranking_loss,label_ranking_average_precision_score

In [3]:
# Load dataset
#
# Each ARFF file is read into a DataFrame and converted to one NumPy array.
# The last `Natt` columns are taken as the label matrix Y and everything
# before them as the feature matrix X.

Natt = 14 # Number of trailing columns treated as labels (Y)
data = arff.loadarff('yeast/yeast-train.arff')
df = pd.DataFrame(data[0])
data_train = df.to_numpy()
X_train = data_train[:,:-Natt]
# `np.int` was only an alias of the builtin `int` and no longer exists in
# recent NumPy releases; the builtin yields the same integer dtype.
Y_train = data_train[:,-Natt:].astype(int)

data = arff.loadarff('yeast/yeast-test.arff')
df = pd.DataFrame(data[0])
data_test = df.to_numpy()
X_test = data_test[:,:-Natt]
Y_test = data_test[:,-Natt:].astype(int)

In [8]:
class kNN:
    """Distance-weighted k-nearest-neighbours classifier for one target column.

    The neighbour search (`train`) is separated from the voting (`predict`) so
    that the same neighbour ranking can be reused for several target vectors
    defined over the same training samples.

    Parameters
    ----------
    k : int
        Number of nearest neighbours that take part in the vote.
    X_train : array-like, shape (N_train, n_features)
        Training samples.
    X_test : array-like, shape (N_test, n_features)
        Samples to classify.

    Attributes set by `train`
    -------------------------
    ind_neigh : ndarray, shape (N_test, N_train)
        For every test sample, training indices sorted by increasing distance.
    k_dist : ndarray, shape (N_test, min(k, N_train))
        Distances to the k nearest training samples of every test sample.
    """

    def __init__(self,k,X_train,X_test):
        self.k = k
        self.X_train = X_train
        self.X_test = X_test
        self.N_train = len(X_train)
        self.N_test = len(X_test)

    def train(self):
        """Rank all training samples by Euclidean distance to each test sample."""
        # Cast to float so arithmetic is vectorised even if the caller passed
        # object-dtype arrays (e.g. slices of a mixed-type DataFrame).
        X_train = np.asarray(self.X_train, dtype=float)
        X_test = np.asarray(self.X_test, dtype=float)
        if X_train.ndim == 1:
            X_train = X_train[:, None]
        if X_test.ndim == 1:
            X_test = X_test[:, None]

        # dist[i, j] = distance between the i-th test sample and the j-th
        # training sample; one vectorised row per test sample keeps the
        # temporary memory at O(N_train * n_features).
        dist = np.zeros((self.N_test,self.N_train))
        for i in range(self.N_test):
            dist[i] = np.sqrt(((X_train - X_test[i])**2).sum(axis=1))

        # Neighbour indices in increasing order of distance.
        self.ind_neigh = np.argsort(dist,axis=1)
        # Gather the distances in that same order (also valid when N_test == 0).
        rows = np.arange(self.N_test)[:, None]
        dist_neigh = dist[rows, self.ind_neigh]
        # Use the instance's own k, not a global variable that happens to exist.
        self.k_dist = dist_neigh[:,:self.k]

    def predict(self,Y_train):
        """Return the predicted class of every test sample.

        Parameters
        ----------
        Y_train : ndarray, shape (N_train,)
            Non-negative integer class of each training sample.

        Returns
        -------
        ndarray, shape (N_test,)
            Class with the largest weighted vote, where every one of the k
            nearest neighbours votes with weight 1 / (distance + 1).
        """
        Y_train = np.asarray(Y_train)
        Nclasses = int(Y_train.max()) + 1
        # Classes of the k nearest neighbours of every test sample.
        k_classes = Y_train[self.ind_neigh[:,:self.k]]
        # The +1 keeps the weight finite for zero-distance neighbours.
        weights = 1.0/(self.k_dist+1)

        # votes[i, c] = accumulated weight of class c for test sample i.
        votes = np.zeros((self.N_test,Nclasses))
        for c in range(Nclasses):
            votes[:,c] = ((k_classes == c)*weights).sum(axis=1)

        return votes.argmax(axis=1)

In [9]:
def eval_MLL(pred,Y_test):
    """Compute global (micro-style) multi-label metrics on binary matrices.

    Parameters
    ----------
    pred : array-like, shape (Nsamples, Nlabels)
        Predicted 0/1 label indicators.
    Y_test : array-like, shape (Nsamples, Nlabels)
        True 0/1 label indicators.

    Returns
    -------
    (hl, acc, prec, recall, f1) : tuple of float
        hl     - Hamming loss: fraction of label entries that differ.
        acc    - |true AND pred| / |true OR pred| over all entries.
        prec   - |true AND pred| / |pred|.
        recall - |true AND pred| / |true|.
        f1     - harmonic mean of prec and recall.
        Any ratio whose denominator is zero is reported as 0.0 instead of NaN.
    """
    # Float arrays guarantee true division regardless of the input dtype.
    Y_test = np.asarray(Y_test, dtype=float)
    pred = np.asarray(pred, dtype=float)
    Nsamples,Nlabels = Y_test.shape

    def _ratio(num, den):
        # Guarded division: an empty denominator yields 0.0 rather than NaN.
        return float(num)/float(den) if den else 0.0

    # Number of entries that are positive in both matrices (true positives).
    tp = (Y_test*pred).sum()

    hl = _ratio(np.abs(Y_test - pred).sum(), Nsamples*Nlabels)
    acc = _ratio(tp, np.logical_or(Y_test,pred).sum())
    prec = _ratio(tp, pred.sum())
    recall = _ratio(tp, Y_test.sum())
    f1 = _ratio(2*prec*recall, prec+recall)

    return hl,acc,prec,recall,f1

In [9]:
k = 12  # number of nearest neighbours
Nclasses = 2 # number of classes

j = 0
Nsamples,Nlabels = Y_test.shape
# Prediction matrix with the same shape as the test labels; filled per label later.
pred = np.zeros(Y_test.shape)

# Training: time the neighbour search of the kNN classifier.

print('Iniciando treinamento do classificador...')
start = time.time()
clf = kNN(k,X_train,X_test)
clf.train()
end = time.time()
elapsed = end - start
print('Treinamento do classificador finalizado em %f segundos' % elapsed)

Iniciando treinamento do classificador...
Treinamento do classificador finalizado em 29.495472 segundos


In [10]:
# Predictions: the neighbour ranking computed by clf.train() is reused for every
# label column, treating each label as an independent classification target.

for j in np.arange(Nlabels):
    label_column = Y_train[:,j]
    pred[:,j] = clf.predict(label_column)

metrics = eval_MLL(pred,Y_test)
hl,acc,prec,recall,f1 = metrics

print ('Hamming Loss = %f, F1 = %f' %(hl,f1))

Hamming Loss = 0.202368, F1 = 0.635930


In [83]:
class MLkNN:
    """Multi-label k-nearest-neighbours classifier (ML-kNN style).

    For every label, `fit` estimates a smoothed prior and the smoothed
    probability of observing j positive neighbours (j = 0..k) given that the
    sample does / does not carry the label. `predict` combines both with
    Bayes' rule.

    Parameters
    ----------
    k : int
        Number of nearest neighbours.
    s : float, default 1.0
        Additive (Laplace) smoothing constant. With s == 0 some probabilities
        may become 0/0.
    softmax : bool, default False
        Stored on the instance; not used by `fit` or `predict`.

    Attributes set by `fit`
    -----------------------
    X_train, Y_train : the training data as passed in.
    N_labels : int
    prior_proba : ndarray, shape (2, N_labels)
        prior_proba[b, l] = P(label l == b).
    cond_proba : ndarray, shape (2, k + 1, N_labels)
        cond_proba[b, j, l] = P(j positive neighbours | label l == b).
    """

    def __init__(self,k,s=1.0,softmax=False):
        self.k = k
        self.s = s
        self.softmax = softmax

    @staticmethod
    def _as_float_matrix(X):
        """Return X as a 2-D float array (a 1-D input becomes one feature column)."""
        X = np.asarray(X, dtype=float)
        return X[:, None] if X.ndim == 1 else X

    @staticmethod
    def _euclidean_distances(A, B):
        """Return dist with dist[i, j] = Euclidean distance between A[i] and B[j]."""
        dist = np.empty((A.shape[0], B.shape[0]))
        for i in range(A.shape[0]):
            # One vectorised row at a time: avoids both the per-pair Python
            # loop and a (len(A), len(B), n_features) temporary.
            dist[i] = np.sqrt(((B - A[i])**2).sum(axis=1))
        return dist

    def fit(self,X_train,Y_train):
        """Estimate prior and conditional probabilities from the training set.

        Parameters
        ----------
        X_train : array-like, shape (N_train, n_features)
        Y_train : array-like, shape (N_train, N_labels)
            Binary (0/1) label indicators.
        """
        X = self._as_float_matrix(X_train)
        # Integer labels so that neighbour counts can be used as indices.
        Y = np.asarray(Y_train).astype(int)
        N_train,N_labels = Y.shape
        s = float(self.s)  # float smoothing => true division everywhere
        k = self.k

        # Prior probabilities (smoothed label frequencies).
        prior_proba = np.zeros((2,N_labels))
        prior_proba[1] = (s + Y.sum(axis=0))/(s*2 + N_train)
        prior_proba[0] = 1 - prior_proba[1]

        # dist[i, j] = distance between the i-th and j-th training samples;
        # the diagonal is infinite so that a sample is not its own neighbour.
        dist = self._euclidean_distances(X, X)
        diag = np.arange(N_train)
        dist[diag, diag] = np.inf

        # Indices of the k nearest neighbours, in increasing order of distance.
        ind_neigh = np.argsort(dist,axis=1)[:,:k]

        # neigh_counts[i, l] = number of neighbours of sample i carrying label l.
        neigh_counts = Y[ind_neigh].sum(axis=1)

        # Conditional probabilities of the neighbour counts.
        cond_proba = np.zeros((2,k+1,N_labels))
        for l in range(N_labels):
            has_label = Y[:,l] == 1
            # c1[j] / c0[j]: how many samples with / without label l have
            # exactly j positive neighbours.
            c1 = np.bincount(neigh_counts[has_label,l], minlength=k+1)
            c0 = np.bincount(neigh_counts[~has_label,l], minlength=k+1)
            cond_proba[1,:,l] = (s + c1)/(s*(k+1) + c1.sum())
            cond_proba[0,:,l] = (s + c0)/(s*(k+1) + c0.sum())

        self.X_train = X_train
        self.Y_train = Y_train
        self.N_labels = N_labels
        self.prior_proba = prior_proba
        self.cond_proba = cond_proba

    def predict(self,X_test):
        """Predict the label set of every row of X_test.

        Parameters
        ----------
        X_test : array-like, shape (N_test, n_features)

        Returns
        -------
        Y_test : ndarray of float, shape (N_test, N_labels)
            1.0 where the label is predicted present, 0.0 otherwise.
        post_proba : ndarray of float, shape (N_test, N_labels)
            Posterior probability of each label being present.
        """
        X_tr = self._as_float_matrix(self.X_train)
        Y_tr = np.asarray(self.Y_train).astype(int)
        X_te = self._as_float_matrix(X_test)

        # dist[i, j] = distance between the i-th test sample and the j-th
        # training sample.
        dist = self._euclidean_distances(X_te, X_tr)

        # Indices of the k nearest neighbours, in increasing order of distance.
        ind_neigh = np.argsort(dist,axis=1)[:,:self.k]

        # neigh_counts[t, l] = number of neighbours of test sample t with label l.
        neigh_counts = Y_tr[ind_neigh].sum(axis=1)

        # Look up P(count | label present / absent) for every (sample, label)
        # pair and weight it by the prior (unnormalised posteriors).
        labels = np.arange(self.N_labels)
        proba_1 = self.prior_proba[1]*self.cond_proba[1][neigh_counts,labels]
        proba_0 = self.prior_proba[0]*self.cond_proba[0][neigh_counts,labels]

        Y_test = (proba_1 > proba_0).astype(float)
        post_proba = proba_1/(proba_1+proba_0)

        return Y_test,post_proba

In [84]:
# Fit ML-kNN on the training split, then obtain hard predictions and
# per-label posterior probabilities for the test split.
k = 12
clf = MLkNN(k=k)
clf.fit(X_train=X_train, Y_train=Y_train)
pred,post_proba = clf.predict(X_test=X_test)

In [87]:
# Hamming loss is computed on the hard predictions; the three ranking metrics
# are computed on the posterior probabilities.
hl = hamming_loss(y_true=Y_test, y_pred=pred)
ce = coverage_error(y_true=Y_test, y_score=post_proba)
rl = label_ranking_loss(y_true=Y_test, y_score=post_proba)
lr = label_ranking_average_precision_score(y_true=Y_test, y_score=post_proba)

print ('Hamming loss: ', hl)
print ('Coverage error: ', ce)
print ('Ranking Loss: ', rl)
print ('Label Ranking Average Precision: ', lr)

('Hamming loss: ', 0.19621436360803862)
('Coverage error: ', 7.408942202835332)
('Ranking Loss: ', 0.17260819856434437)
('Label Ranking Average Precision: ', 0.758828760331851)
