In [2]:
### IMPORTS ###
import scipy.sparse
import scipy.stats
import numpy as np
import matplotlib.pyplot as pyplot
import sklearn.metrics as mtc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain
from pyemd import emd
import valores
import pickle


In [3]:
## FUNCIONES ##


class TFM:

    def __init__(self,path):
        self.X_tr = scipy.sparse.load_npz(path+'X_tr.npz')
        self.X_tst = scipy.sparse.load_npz(path+'X_tst.npz')
        self.y_tr = np.load(path+'y_tr.npy')
        self.y_tst = np.load(path+'y_tst.npy')
        # Se binarizan las caracteristicas 
        self.y_tr=MultiLabelBinarizer().fit_transform(self.y_tr)
        self.y_tst=MultiLabelBinarizer().fit_transform(self.y_tst)
        # Escalado 
        scaler = MaxAbsScaler().fit(self.X_tr)
        self.X_tr=scaler.transform(self.X_tr)
        self.X_tst=scaler.transform(self.X_tst)
        self.X_tr_simpli= None
        self.simplificador = False
        self.modelo = None
        self.clf= None
        self.columns= None
        print ('Dataset cargado y preprocesado')

    def reduce_features(self,num_data):
        columns=np.array([])
        for i in range(np.shape(self.X_tr)[1]):
            if self.X_tr[:,i].count_nonzero()> num_data:
                columns=np.append(columns,i)
        np.savetxt('features'+num_data+'.txt',columns,delimiter=',')
        print('Las características han sido reducidas')

    def simplify_dataset(self,features,tr_size):
        self.X_tr_simpli=self.X_tr
        self.y_tr_simpli=self.y_tr
        
        self.columns=np.loadtxt('features'+features+'.txt',delimiter=',')
        self.X_tr_simpli=self.X_tr[:,self.columns]
        self.X_tr_simpli, __, self.y_tr_simpli, __ = train_test_split(self.X_tr_simpli, self.y_tr_simpli, test_size=(1-tr_size), random_state=42)
        print('Simplificación finalizada')
        print(np.shape(self.X_tr_simpli))

    def use_simplify(self,boolean):
        self.simplificador= boolean
        
    def grid_search_cv(self,classifier,parameters,metodo):
        if (metodo == 0):
            classif = OneVsRestClassifier(classifier)
        elif (metodo==1):
            classif = ClassifierChain(classifier)
        model_tunning = GridSearchCV(classif, param_grid=parameters,cv=4,verbose=50, n_jobs=-1,scoring='accuracy')
        if (self.simplificador):
            model_tunning.fit(self.X_tr_simpli, self.y_tr_simpli)
        else :
            model_tunning.fit(self.X_tr, self.y_tr)
        print (model_tunning.best_score_)
        print (model_tunning.best_params_)
        self.modelo = model_tunning.best_estimator_

    def fitting_classifier(self):
        if (self.simplificador):
            self.clf=self.modelo.fit(self.X_tr,self.y_tr)
        else :
            self.clf=self.modelo

    def metrics(self):
        if (self.simplificador):
            self.y_pred=self.clf.predict(self.X_tst)
        else :
            self.y_pred=self.clf.predict(self.X_tst)
        accuracy=mtc.accuracy_score(self.y_tst,self.y_pred)
        hamming=mtc.hamming_loss(self.y_tst,self.y_pred)
        precision=mtc.precision_score(self.y_tst,self.y_pred,average='micro')
        print("Total accuracy: ", accuracy)
        print("Hamming loss: ", hamming)
        print("Precision: ", precision)
        print(mtc.classification_report(self.y_tst,self.y_pred))
        #print("Accuracy per class:")
        aux=0
        for i in range(np.shape(self.y_pred)[1]):
            #print ("Class " ,i,": " ,mtc.accuracy_score(self.y_tst[:,i],y_pred[:,i]))
            aux=aux+mtc.accuracy_score(self.y_tst[:,i],self.y_pred[:,i])
        print("Accuracy media: ",aux/37)
        distance_matrix=np.ones((37,37))*(1/37)
        np.fill_diagonal(distance_matrix,0)
        emd_aux=0
        for i in range (np.shape(self.y_tst)[0]):
            emd_aux= emd_aux + emd(self.y_tst[i,:].astype(float),self.y_pred[i,:].astype(float),distance_matrix)
        
        print("EMD: ", emd_aux/(np.shape(self.y_tst)[0]))
        print("METRICAS DE LABEL RANKING: ")
        spearman=scipy.stats.spearmanr(self.y_tst, self.y_pred)[0]
        print("Spearmans Rank Correlation Coefficient: ", spearman)
        kendall=scipy.stats.kendalltau(self.y_tst, self.y_pred)[0]
        print("Kendall’s tau Correlation Coefficient: ",kendall)
        
        



In [5]:
path_string=input('Introduce el directorio del dataset')
clasificacion=TFM(path_string)
print('Inicio del programa clasificador')

while True:
    
    ask2=int(input('Que clasificador vas a usar?: '))
    clasificador=valores.clasificadores_dict[ask2]
    metod=int(input('Que metodo vas a usar?: Introduce 0 para One vs Rest, o 1 para Classifier Chain'))
        
    while True:
        ask1=input(('Quieres usar simplificacion para entrenar el algoritmo?'))
        if (ask1.lower()=='si'):
            non_zero_data=input(('Introduce el numero de elementos no nulos por categoria que consideras aceptable'))
            training_size=float(input('Introduce el porcentaje de tamano de datos de entrenamiento que quieres usar para simplificar'))
            print('Realizando simplificacion del dataset...')
            clasificacion.simplify_dataset(non_zero_data,training_size)
            clasificacion.use_simplify(True)
        elif (ask1.lower()=='no'):
            clasificacion.use_simplify(False)
        print("Iniciando tunning de los parámetros:")
        clasificacion.grid_search_cv(clasificador,valores.parametros_dict[(clasificador,metod)],metod)    
        print("Tunning finalizado:")  
        ask5=input('Quieres continuar con el entrenamiento?')
        if (ask5.lower()=='no'):
            break
        
        print("Entrenando el algoritmo con los mejores parametros...")
        clasificacion.fitting_classifier()
        print("Entrenamiento finalizado, se procede a predecir las etiquetas")
        print("Mostrando metricas obtenidas")
        clasificacion.metrics()
        ask3=input('Quieres probar a variar la simplificacion?')
        if (ask3.lower()=='si'):
            continue
        elif (ask3.lower()=='no'):
            break
            
    ask4=input('Quieres probar con otro clasificador o finalizamos el programa?')
    if (ask4.lower()=='si'):
        continue
    elif (ask4.lower()=='no'):
        break

print("Fin del programa clasificador")

Introduce el directorio del dataset../../Datasets/dataset/
Dataset cargado y preprocesado
Inicio del programa clasificador
Que clasificador vas a usar?: 2
Que metodo vas a usar?: Introduce 0 para One vs Rest, o 1 para Classifier Chain1
Quieres usar simplificacion para entrenar el algoritmo?si
Introduce el numero de elementos no nulos por categoria que consideras aceptable500
Introduce el porcentaje de tamano de datos de entrenamiento que quieres usar para simplificar0.4
Realizando simplificacion del dataset...
Simplificación finalizada
(37579, 2823)
Iniciando tunning de los parámetros:
Fitting 4 folds for each of 336 candidates, totalling 1344 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1

[Parallel(n_jobs=-1)]: Done 122 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done 123 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 126 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 127 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 131 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 13.0min
[Paralle

[Parallel(n_jobs=-1)]: Done 255 tasks      | elapsed: 24.1min
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed: 24.3min
[Parallel(n_jobs=-1)]: Done 257 tasks      | elapsed: 24.6min
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed: 24.7min
[Parallel(n_jobs=-1)]: Done 259 tasks      | elapsed: 24.7min
[Parallel(n_jobs=-1)]: Done 260 tasks      | elapsed: 24.9min
[Parallel(n_jobs=-1)]: Done 261 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done 262 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done 263 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed: 25.4min
[Parallel(n_jobs=-1)]: Done 265 tasks      | elapsed: 25.4min
[Parallel(n_jobs=-1)]: Done 266 tasks      | elapsed: 25.4min
[Parallel(n_jobs=-1)]: Done 267 tasks      | elapsed: 25.5min
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed: 25.6min
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed: 25.7min
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed: 25.7min
[Paralle

[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed: 37.2min
[Parallel(n_jobs=-1)]: Done 389 tasks      | elapsed: 37.3min
[Parallel(n_jobs=-1)]: Done 390 tasks      | elapsed: 37.3min
[Parallel(n_jobs=-1)]: Done 391 tasks      | elapsed: 37.3min
[Parallel(n_jobs=-1)]: Done 392 tasks      | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done 393 tasks      | elapsed: 37.6min
[Parallel(n_jobs=-1)]: Done 394 tasks      | elapsed: 37.7min
[Parallel(n_jobs=-1)]: Done 395 tasks      | elapsed: 37.7min
[Parallel(n_jobs=-1)]: Done 396 tasks      | elapsed: 37.9min
[Parallel(n_jobs=-1)]: Done 397 tasks      | elapsed: 38.0min
[Parallel(n_jobs=-1)]: Done 398 tasks      | elapsed: 38.0min
[Parallel(n_jobs=-1)]: Done 399 tasks      | elapsed: 38.1min
[Parallel(n_jobs=-1)]: Done 400 tasks      | elapsed: 38.2min
[Parallel(n_jobs=-1)]: Done 401 tasks      | elapsed: 38.6min
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed: 38.6min
[Parallel(n_jobs=-1)]: Done 403 tasks      | elapsed: 38.6min
[Paralle

[Parallel(n_jobs=-1)]: Done 521 tasks      | elapsed: 49.7min
[Parallel(n_jobs=-1)]: Done 522 tasks      | elapsed: 49.7min
[Parallel(n_jobs=-1)]: Done 523 tasks      | elapsed: 49.7min
[Parallel(n_jobs=-1)]: Done 524 tasks      | elapsed: 49.9min
[Parallel(n_jobs=-1)]: Done 525 tasks      | elapsed: 50.1min
[Parallel(n_jobs=-1)]: Done 526 tasks      | elapsed: 50.1min
[Parallel(n_jobs=-1)]: Done 527 tasks      | elapsed: 50.1min
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed: 50.3min
[Parallel(n_jobs=-1)]: Done 529 tasks      | elapsed: 50.4min
[Parallel(n_jobs=-1)]: Done 530 tasks      | elapsed: 50.5min
[Parallel(n_jobs=-1)]: Done 531 tasks      | elapsed: 50.5min
[Parallel(n_jobs=-1)]: Done 532 tasks      | elapsed: 50.6min
[Parallel(n_jobs=-1)]: Done 533 tasks      | elapsed: 50.8min
[Parallel(n_jobs=-1)]: Done 534 tasks      | elapsed: 50.8min
[Parallel(n_jobs=-1)]: Done 535 tasks      | elapsed: 50.8min
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed: 51.0min
[Paralle

[Parallel(n_jobs=-1)]: Done 655 tasks      | elapsed: 62.4min
[Parallel(n_jobs=-1)]: Done 656 tasks      | elapsed: 62.5min
[Parallel(n_jobs=-1)]: Done 657 tasks      | elapsed: 62.6min
[Parallel(n_jobs=-1)]: Done 658 tasks      | elapsed: 62.6min
[Parallel(n_jobs=-1)]: Done 659 tasks      | elapsed: 62.7min
[Parallel(n_jobs=-1)]: Done 660 tasks      | elapsed: 62.8min
[Parallel(n_jobs=-1)]: Done 661 tasks      | elapsed: 63.0min
[Parallel(n_jobs=-1)]: Done 662 tasks      | elapsed: 63.0min
[Parallel(n_jobs=-1)]: Done 663 tasks      | elapsed: 63.0min
[Parallel(n_jobs=-1)]: Done 664 tasks      | elapsed: 63.2min
[Parallel(n_jobs=-1)]: Done 665 tasks      | elapsed: 63.3min
[Parallel(n_jobs=-1)]: Done 666 tasks      | elapsed: 63.4min
[Parallel(n_jobs=-1)]: Done 667 tasks      | elapsed: 63.4min
[Parallel(n_jobs=-1)]: Done 668 tasks      | elapsed: 63.5min
[Parallel(n_jobs=-1)]: Done 669 tasks      | elapsed: 63.7min
[Parallel(n_jobs=-1)]: Done 670 tasks      | elapsed: 63.7min
[Paralle

[Parallel(n_jobs=-1)]: Done 788 tasks      | elapsed: 75.5min
[Parallel(n_jobs=-1)]: Done 789 tasks      | elapsed: 75.8min
[Parallel(n_jobs=-1)]: Done 790 tasks      | elapsed: 75.8min
[Parallel(n_jobs=-1)]: Done 791 tasks      | elapsed: 75.8min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 76.0min
[Parallel(n_jobs=-1)]: Done 793 tasks      | elapsed: 76.0min
[Parallel(n_jobs=-1)]: Done 794 tasks      | elapsed: 76.0min
[Parallel(n_jobs=-1)]: Done 795 tasks      | elapsed: 76.1min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 76.2min
[Parallel(n_jobs=-1)]: Done 797 tasks      | elapsed: 76.2min
[Parallel(n_jobs=-1)]: Done 798 tasks      | elapsed: 76.2min
[Parallel(n_jobs=-1)]: Done 799 tasks      | elapsed: 76.3min
[Parallel(n_jobs=-1)]: Done 800 tasks      | elapsed: 76.4min
[Parallel(n_jobs=-1)]: Done 801 tasks      | elapsed: 76.5min
[Parallel(n_jobs=-1)]: Done 802 tasks      | elapsed: 76.5min
[Parallel(n_jobs=-1)]: Done 803 tasks      | elapsed: 76.6min
[Paralle

[Parallel(n_jobs=-1)]: Done 921 tasks      | elapsed: 88.0min
[Parallel(n_jobs=-1)]: Done 922 tasks      | elapsed: 88.0min
[Parallel(n_jobs=-1)]: Done 923 tasks      | elapsed: 88.0min
[Parallel(n_jobs=-1)]: Done 924 tasks      | elapsed: 88.3min
[Parallel(n_jobs=-1)]: Done 925 tasks      | elapsed: 88.5min
[Parallel(n_jobs=-1)]: Done 926 tasks      | elapsed: 88.6min
[Parallel(n_jobs=-1)]: Done 927 tasks      | elapsed: 88.6min
[Parallel(n_jobs=-1)]: Done 928 tasks      | elapsed: 88.8min
[Parallel(n_jobs=-1)]: Done 929 tasks      | elapsed: 89.1min
[Parallel(n_jobs=-1)]: Done 930 tasks      | elapsed: 89.2min
[Parallel(n_jobs=-1)]: Done 931 tasks      | elapsed: 89.2min
[Parallel(n_jobs=-1)]: Done 932 tasks      | elapsed: 89.4min
[Parallel(n_jobs=-1)]: Done 933 tasks      | elapsed: 89.7min
[Parallel(n_jobs=-1)]: Done 934 tasks      | elapsed: 89.7min
[Parallel(n_jobs=-1)]: Done 935 tasks      | elapsed: 89.7min
[Parallel(n_jobs=-1)]: Done 936 tasks      | elapsed: 90.0min
[Paralle

[Parallel(n_jobs=-1)]: Done 1053 tasks      | elapsed: 100.3min
[Parallel(n_jobs=-1)]: Done 1054 tasks      | elapsed: 100.3min
[Parallel(n_jobs=-1)]: Done 1055 tasks      | elapsed: 100.4min
[Parallel(n_jobs=-1)]: Done 1056 tasks      | elapsed: 100.5min
[Parallel(n_jobs=-1)]: Done 1057 tasks      | elapsed: 100.6min
[Parallel(n_jobs=-1)]: Done 1058 tasks      | elapsed: 100.7min
[Parallel(n_jobs=-1)]: Done 1059 tasks      | elapsed: 100.7min
[Parallel(n_jobs=-1)]: Done 1060 tasks      | elapsed: 100.8min
[Parallel(n_jobs=-1)]: Done 1061 tasks      | elapsed: 100.9min
[Parallel(n_jobs=-1)]: Done 1062 tasks      | elapsed: 101.0min
[Parallel(n_jobs=-1)]: Done 1063 tasks      | elapsed: 101.0min
[Parallel(n_jobs=-1)]: Done 1064 tasks      | elapsed: 101.1min
[Parallel(n_jobs=-1)]: Done 1065 tasks      | elapsed: 101.5min
[Parallel(n_jobs=-1)]: Done 1066 tasks      | elapsed: 101.5min
[Parallel(n_jobs=-1)]: Done 1067 tasks      | elapsed: 101.5min
[Parallel(n_jobs=-1)]: Done 1068 tasks  

[Parallel(n_jobs=-1)]: Done 1182 tasks      | elapsed: 112.9min
[Parallel(n_jobs=-1)]: Done 1183 tasks      | elapsed: 112.9min
[Parallel(n_jobs=-1)]: Done 1184 tasks      | elapsed: 113.1min
[Parallel(n_jobs=-1)]: Done 1185 tasks      | elapsed: 113.1min
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed: 113.1min
[Parallel(n_jobs=-1)]: Done 1187 tasks      | elapsed: 113.2min
[Parallel(n_jobs=-1)]: Done 1188 tasks      | elapsed: 113.3min
[Parallel(n_jobs=-1)]: Done 1189 tasks      | elapsed: 113.4min
[Parallel(n_jobs=-1)]: Done 1190 tasks      | elapsed: 113.4min
[Parallel(n_jobs=-1)]: Done 1191 tasks      | elapsed: 113.5min
[Parallel(n_jobs=-1)]: Done 1192 tasks      | elapsed: 113.6min
[Parallel(n_jobs=-1)]: Done 1193 tasks      | elapsed: 113.8min
[Parallel(n_jobs=-1)]: Done 1194 tasks      | elapsed: 113.8min
[Parallel(n_jobs=-1)]: Done 1195 tasks      | elapsed: 113.8min
[Parallel(n_jobs=-1)]: Done 1196 tasks      | elapsed: 114.0min
[Parallel(n_jobs=-1)]: Done 1197 tasks  

[Parallel(n_jobs=-1)]: Done 1311 tasks      | elapsed: 125.2min
[Parallel(n_jobs=-1)]: Done 1312 tasks      | elapsed: 125.5min
[Parallel(n_jobs=-1)]: Done 1313 tasks      | elapsed: 125.8min
[Parallel(n_jobs=-1)]: Done 1314 tasks      | elapsed: 125.8min
[Parallel(n_jobs=-1)]: Done 1315 tasks      | elapsed: 125.8min
[Parallel(n_jobs=-1)]: Done 1316 tasks      | elapsed: 126.1min
[Parallel(n_jobs=-1)]: Done 1317 tasks      | elapsed: 126.4min
[Parallel(n_jobs=-1)]: Done 1318 tasks      | elapsed: 126.4min
[Parallel(n_jobs=-1)]: Done 1319 tasks      | elapsed: 126.4min
[Parallel(n_jobs=-1)]: Done 1320 tasks      | elapsed: 126.6min
[Parallel(n_jobs=-1)]: Done 1321 tasks      | elapsed: 126.6min
[Parallel(n_jobs=-1)]: Done 1322 tasks      | elapsed: 126.6min
[Parallel(n_jobs=-1)]: Done 1323 tasks      | elapsed: 126.7min
[Parallel(n_jobs=-1)]: Done 1324 tasks      | elapsed: 126.9min
[Parallel(n_jobs=-1)]: Done 1325 tasks      | elapsed: 126.9min
[Parallel(n_jobs=-1)]: Done 1326 tasks  

  'precision', 'predicted', average, warn_for)


EMD:  0.030304868097181675
METRICAS DE LABEL RANKING: 
Spearmans Rank Correlation Coefficient:  [[ 1.          0.24141361 -0.04033071 ... -0.02979356  0.00379775
  -0.02721012]
 [ 0.24141361  1.         -0.04659196 ... -0.01198404  0.04554504
  -0.0308856 ]
 [-0.04033071 -0.04659196  1.         ... -0.0651241  -0.03376179
  -0.00828285]
 ...
 [-0.02979356 -0.01198404 -0.0651241  ...  1.         -0.01528297
   0.04560083]
 [ 0.00379775  0.04554504 -0.03376179 ... -0.01528297  1.
   0.05085149]
 [-0.02721012 -0.0308856  -0.00828285 ...  0.04560083  0.05085149
   1.        ]]
Kendall’s tau Correlation Coefficient:  0.5897884135410598
Quieres probar a variar la simplificacion?no
Quieres probar con otro clasificador o finalizamos el programa?no
Fin del programa clasificador


In [None]:
path_string="../../Datasets/dataset/"
metod=0
clasificador=valores.LogisticRegression()
parameters={
    "estimator__C": [1],
    "estimator__solver": ["liblinear"],
    "estimator__tol": [0.01]
}
clasificacion=TFM(path_string)
clasificacion.use_simplify(False)
clasificacion.grid_search_cv(clasificador,parameters,metod)
clasificacion.fitting_classifier()
clasificacion.metrics()