In [1]:
import numpy as np
from scipy.io import arff
import pandas as pd
import time
from sklearn.metrics import hamming_loss,coverage_error,label_ranking_loss,label_ranking_average_precision_score
from skmultilearn.adapt import MLkNN
from scipy.special import softmax

In [2]:
#Load dataset

Natt = 14 # Numero de atributos
data = arff.loadarff('yeast/yeast-train.arff')
df = pd.DataFrame(data[0])
data_train = df.to_numpy()
X_train = data_train[:,:-Natt]
Y_train = data_train[:,-Natt:].astype(np.int)

data = arff.loadarff('yeast/yeast-test.arff')
df = pd.DataFrame(data[0])
data_test = df.to_numpy()
X_test = data_test[:,:-Natt]
Y_test = data_test[:,-Natt:].astype(np.int)

# Metricas
hl = ['Hamming loss']
ce = ['Coverage']
rl = ['Ranking loss']
ap = ['Average precision']

In [3]:
class kNN:

    def __init__(self,k,X_train,X_test):
        self.k = k
        self.X_train = X_train
        self.X_test = X_test
        self.N_train = len(X_train)
        self.N_test = len(X_test)
    
    def train(self):
    
        # Calculo das distancias
        # dist(i,j) = distancia entre a i-esima amostra de teste e a j-esima amostra de treinamento
        dist = np.zeros((self.N_test,self.N_train)) 

        for i in np.arange(self.N_test):
            for j in np.arange(self.N_train):
                # Distancia euclidiana
                dist[i,j] = np.sqrt(np.sum((self.X_test[i] - self.X_train[j])**2))

        self.ind_neigh = np.argsort(dist,axis=1) # indices dos vizinhos em ordem crescente de distancia
        dist_neigh = np.array([dist[i,self.ind_neigh[i]] for i in range(self.N_test)]) # distancias ordenadas
        self.k_dist = dist_neigh[:,:k] # distancias dos k-vizinhos mais proximos a cada amostra

    def predict(self,Y_train):
        
        Nclasses = int(Y_train.max()) + 1
        classes_neigh = Y_train[self.ind_neigh] # classes dos vizinhos mais proximos
        k_classes = classes_neigh[:,:k] # classes dos k-vizinhos mais proximos a cada amostra

        votes = np.zeros((self.N_test,Nclasses)) # votos das classes para cada amostra
        pert = np.zeros(self.N_test) # pertinencia da classe escolhida

        # soma de votos para cada classe
        for c in np.arange(Nclasses):
            votes[:,c] = votes[:,c] + ((k_classes == c)/(self.k_dist+1)).sum(axis=1) # votos ponderados pelo inverso da distancia+1

        result = votes.argmax(axis=1)
        pert = votes[:,1]/votes.sum(axis=1)
        
        # retorna classe com maior voto para cada amostra
        return result, pert

In [4]:
# Transformacao binaria

k = 12
Nclasses = 2 #Numero de classes

j = 0
Nsamples,Nlabels = Y_test.shape
pred = np.zeros((Nsamples,Nlabels)) # Predicoes
proba = np.zeros((Nsamples,Nlabels)) # Pertinencias

# Treinamento

print('Iniciando treinamento do classificador...')
start = time.time()
clf = kNN(k,X_train,X_test)
clf.train()
end = time.time()
print('Treinamento do classificador finalizado em %f segundos' %((end-start)))

Iniciando treinamento do classificador...
Treinamento do classificador finalizado em 44.494593 segundos


In [5]:
# Predicoes

for j in np.arange(Nlabels):
    
    pred[:,j],proba[:,j] = clf.predict(Y_train[:,j])
    
hl.append(hamming_loss(Y_test,pred))
ce.append(coverage_error(Y_test,proba))
rl.append(label_ranking_loss(Y_test,proba))
ap.append(label_ranking_average_precision_score(Y_test,proba))
print 'Transformacao binaria:'
print ('Hamming loss: ', hl[-1])
print ('Coverage error: ', ce[-1])
print ('Ranking Loss: ', rl[-1])
print ('Label Ranking Average Precision: ', ap[-1])

Transformacao binaria:
('Hamming loss: ', 0.20236797008879887)
('Coverage error: ', 7.609596510359869)
('Ranking Loss: ', 0.18295979024869818)
('Label Ranking Average Precision: ', 0.7586383799661772)


In [6]:
class MLkNN:

    def __init__(self,k,s=1.0,smax=False):
        self.k = k
        self.s = s
        self.smax = smax
        
    def fit(self,X_train,Y_train):
        
        N_train,N_labels = Y_train.shape
        
        # Calculo das probabilidades a priori
        prior_proba = np.zeros((2,N_labels))
        
        for l in np.arange(N_labels):
            prior_proba[1,l] = (self.s + sum(Y_train[:,l]))/(self.s*2+N_train)
            prior_proba[0,l] = 1 - prior_proba[1,l]
            
        # Calculo das probabilidades a posteriori
        
        cond_proba = np.zeros((2,self.k+1,N_labels))
        
        # Calculo das distancias
        # dist(i,j) = distancia entre a i-esima amostra de teste e a j-esima amostra de treinamento
        #dist = np.zeros((N_train,N_train))       ## Considera o elemento como vizinho a ele mesmo
        dist = np.ones((N_train,N_train))*np.Inf  ## Desconsidera o  elemento como vizinho a ele mesmo

        for i in np.arange(N_train):
            for j in np.arange(i+1,N_train):
                # Distancia euclidiana
                dist[i,j] = np.sqrt(np.sum((X_train[i] - X_train[j])**2))
                dist[j,i] = dist[i,j]

        ind_neigh = np.argsort(dist,axis=1)[:,:self.k] # indices dos k vizinhos em ordem crescente de distancia
        
        for l in np.arange(N_labels):
            
            c = np.zeros((2,self.k+1))
            
            for i in np.arange(N_train):
                delta = sum(Y_train[ind_neigh[i],l])
                if(Y_train[i,l] == 1):
                    c[1,delta] = c[1,delta] + 1
                else:
                    c[0,delta] = c[0,delta] + 1
            
            if(self.smax == True):
                c = softmax(c,axis=1)
            
            for j in np.arange(self.k+1):
                cond_proba[1,j,l] = (self.s + c[1,j])/(self.s*(self.k+1) + sum(c[1,:]))
                cond_proba[0,j,l] = (self.s + c[0,j])/(self.s*(self.k+1) + sum(c[0,:]))
                
        self.X_train = X_train
        self.Y_train = Y_train
        self.N_labels = N_labels
        self.prior_proba = prior_proba
        self.cond_proba = cond_proba
            
    def predict(self,X_test):
        
        X_train = self.X_train
        Y_train = self.Y_train
        N_labels = self.N_labels
        N_train = X_train.shape[0]
        N_test = X_test.shape[0]
        
        Y_test = np.zeros((N_test,N_labels))
        post_proba = np.zeros((N_test,N_labels))
        
        prior_proba = self.prior_proba
        cond_proba = self.cond_proba
        
         # Calculo das distancias
        # dist(i,j) = distancia entre a i-esima amostra de teste e a j-esima amostra de treinamento
        dist = np.zeros((N_test,N_train))

        for i in np.arange(N_test):
            for j in np.arange(N_train):
                # Distancia euclidiana
                dist[i,j] = np.sqrt(np.sum((X_test[i] - X_train[j])**2))

        ind_neigh = np.argsort(dist,axis=1)[:,:self.k] # indices dos k vizinhos em ordem crescente de distancia
        
        for t in np.arange(N_test):
            
            for l in np.arange(N_labels):
                c = sum(Y_train[ind_neigh[t],l])
                proba_1 = prior_proba[1,l]*cond_proba[1,c,l]
                proba_0 = prior_proba[0,l]*cond_proba[0,c,l]
                if(proba_1 > proba_0):
                    Y_test[t,l] = 1
                post_proba[t,l] = proba_1/(proba_1+proba_0)
        
        return Y_test,post_proba

In [7]:
#ML-kNN implementado

k = 12
clf = MLkNN(k)
clf.fit(X_train,Y_train)
pred,proba = clf.predict(X_test)

In [8]:
hl.append(hamming_loss(Y_test,pred))
ce.append(coverage_error(Y_test,proba))
rl.append(label_ranking_loss(Y_test,proba))
ap.append(label_ranking_average_precision_score(Y_test,proba))
print 'ML-kNN implementado:'
print ('Hamming loss: ', hl[-1])
print ('Coverage error: ', ce[-1])
print ('Ranking Loss: ', rl[-1])
print ('Label Ranking Average Precision: ', ap[-1])

ML-kNN implementado:
('Hamming loss: ', 0.19621436360803862)
('Coverage error: ', 7.408942202835332)
('Ranking Loss: ', 0.17260819856434437)
('Label Ranking Average Precision: ', 0.758828760331851)


In [9]:
#ML-kNN com softmax

k = 12
clf = MLkNN(k,smax=True)
clf.fit(X_train,Y_train)
pred,proba = clf.predict(X_test)

In [10]:
hl.append(hamming_loss(Y_test,pred))
ce.append(coverage_error(Y_test,proba))
rl.append(label_ranking_loss(Y_test,proba))
ap.append(label_ranking_average_precision_score(Y_test,proba))
print 'ML-kNN com softmax:'
print ('Hamming loss: ', hl[-1])
print ('Coverage error: ', ce[-1])
print ('Ranking Loss: ', rl[-1])
print ('Label Ranking Average Precision: ', ap[-1])

ML-kNN com softmax:
('Hamming loss: ', 0.22410032715376227)
('Coverage error: ', 7.79062159214831)
('Ranking Loss: ', 0.20029567116340757)
('Label Ranking Average Precision: ', 0.718858481170692)


In [17]:
#ML-kNN scikit

classifier = MLkNN(k=12)

# train
classifier.fit(X_train, Y_train)

# predict
pred = classifier.predict(X_test)
proba = classifier.predict_proba(X_test)

hl.append(hamming_loss(Y_test,pred))
ce.append(coverage_error(Y_test,proba))
rl.append(label_ranking_loss(Y_test,proba))
ap.append(label_ranking_average_precision_score(Y_test,proba))
print 'ML-kNN scikit:'
print ('Hamming loss: ', hl[-1])
print ('Coverage error: ', ce[-1])
print ('Ranking Loss: ', rl[-1])
print ('Label Ranking Average Precision: ', ap[-1])

AttributeError: MLkNN instance has no attribute 'predict_proba'