In [17]:
### IMPORTS ###
import scipy.sparse
import scipy.stats
import numpy as np
import matplotlib.pyplot as pyplot
import sklearn.metrics as mtc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain
from pyemd import emd
import valores
import pickle


In [18]:
from __future__ import print_function
import keras
from keras.datasets import fashion_mnist  # new with Keras 2.1.2.  Yah!!
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv1D, MaxPooling1D
from keras import backend as K
import sparse
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [26]:
## FUNCIONES ##


class TFM:
    """Multi-label classification workbench.

    Loads a pre-split dataset (sparse features, label arrays), binarizes the
    labels, scales the features by their maximum absolute value and offers:

    * hyper-parameter tuning of a One-vs-Rest or Classifier-Chain wrapper,
      optionally on a reduced ("simplified") copy of the training set,
    * training/prediction with the tuned estimator,
    * a small dense Keras network,
    * a report of classification and ranking metrics on the test split.
    """

    def __init__(self,path):
        """Load and preprocess the dataset stored under ``path``.

        ``path`` is concatenated directly with the file names ``X_tr.npz``,
        ``X_tst.npz``, ``y_tr.npy`` and ``y_tst.npy``, so it must already end
        with a path separator.
        """
        self.X_tr = scipy.sparse.load_npz(path+'X_tr.npz')
        self.X_tst = scipy.sparse.load_npz(path+'X_tst.npz')
        self.y_tr = np.load(path+'y_tr.npy')
        self.y_tst = np.load(path+'y_tst.npy')
        # Binarize the labels. The binarizer is fitted on the training labels
        # only and reused for the test labels so that column k means the same
        # label in both matrices (two independent fits may order or size the
        # columns differently when the label sets differ).
        self.binarizer = MultiLabelBinarizer().fit(self.y_tr)
        self.y_tr = self.binarizer.transform(self.y_tr)
        self.y_tst = self.binarizer.transform(self.y_tst)
        # Scaling: statistics come from the training split only.
        scaler = MaxAbsScaler().fit(self.X_tr)
        self.X_tr = scaler.transform(self.X_tr)
        self.X_tst = scaler.transform(self.X_tst)
        # State filled in by the other methods.
        self.X_tr_simpli = None   # reduced training features (simplify_dataset)
        self.y_tr_simpli = None   # labels matching X_tr_simpli
        self.simplificador = False  # whether tuning uses the reduced set
        self.modelo = None        # best estimator found by grid_search_cv
        self.clf = None           # estimator used for prediction
        self.columns = None       # feature indices kept by simplify_dataset
        self.y_pred = None        # last predictions on the test split
        self.model = None         # Keras network built by dnn
        self.history = None       # Keras History returned by the last dnn fit
        print ('Dataset cargado y preprocesado')

    def reduce_features(self,num_data):
        """Save the indices of the features with more than ``num_data`` non-zeros.

        The indices are written as integers, one per line, to
        ``features<num_data>.txt`` in the current working directory.
        ``num_data`` may be a number or its string representation.
        """
        threshold = float(num_data)
        # Count non-zero entries of every column in one pass instead of
        # slicing the sparse matrix column by column.
        nonzero_per_column = np.asarray((self.X_tr != 0).sum(axis=0)).ravel()
        columns = np.flatnonzero(nonzero_per_column > threshold)
        # fmt='%d' keeps the file as plain integer indices.
        np.savetxt('features'+str(num_data)+'.txt',columns,fmt='%d',delimiter=',')
        print('Las características han sido reducidas')

    def simplify_dataset(self,features,tr_size):
        """Build the reduced training set used for hyper-parameter tuning.

        ``features`` selects the file ``features<features>.txt`` holding the
        column indices to keep; ``tr_size`` is the fraction of training rows
        to keep (``1`` keeps every row). Results are stored in
        ``X_tr_simpli`` / ``y_tr_simpli`` and the indices in ``columns``.
        """
        loaded = np.loadtxt('features'+str(features)+'.txt',delimiter=',')
        # loadtxt yields floats (and a 0-d array for a single value); indexing
        # needs a 1-D integer array.
        self.columns = np.atleast_1d(loaded).astype(int)
        reduced = self.X_tr[:,self.columns]
        if tr_size == 1:
            # train_test_split rejects an empty complementary split, so the
            # "keep everything" case bypasses it.
            self.X_tr_simpli, self.y_tr_simpli = reduced, self.y_tr
        else:
            self.X_tr_simpli, __, self.y_tr_simpli, __ = train_test_split(reduced, self.y_tr, test_size=(1-tr_size), random_state=42)
        print('Simplificación finalizada')
        print(np.shape(self.X_tr_simpli))

    def use_simplify(self,boolean):
        """Choose whether tuning runs on the reduced set (True) or the full one."""
        self.simplificador = boolean

    def grid_search_cv(self,classifier,parameters,metodo):
        """Tune ``classifier`` with a 4-fold grid search scored by accuracy.

        ``metodo`` picks the multi-label wrapper: 0 for One-vs-Rest, 1 for
        Classifier Chain. The best estimator is stored in ``self.modelo``.

        Raises ValueError for an unknown ``metodo`` or when the reduced set
        is requested but has not been built yet.
        """
        if metodo == 0:
            classif = OneVsRestClassifier(classifier)
        elif metodo == 1:
            classif = ClassifierChain(classifier)
        else:
            raise ValueError("metodo must be 0 (One vs Rest) or 1 (Classifier Chain), got %r" % (metodo,))
        if self.simplificador and self.X_tr_simpli is None:
            raise ValueError("simplify_dataset must be called before tuning on the simplified dataset")
        model_tunning = GridSearchCV(classif, param_grid=parameters,cv=4,verbose=50, n_jobs=-1,scoring='accuracy')
        if self.simplificador:
            model_tunning.fit(self.X_tr_simpli, self.y_tr_simpli)
        else:
            model_tunning.fit(self.X_tr, self.y_tr)
        print (model_tunning.best_score_)
        print (model_tunning.best_params_)
        self.modelo = model_tunning.best_estimator_

    def fitting_classifier(self):
        """Prepare ``self.clf`` for prediction on the full feature set.

        After tuning on the reduced set the best estimator is refitted on the
        complete training data; otherwise the estimator stored by the grid
        search is used as it is.

        Raises RuntimeError when no tuned estimator is available.
        """
        if self.modelo is None:
            raise RuntimeError("grid_search_cv must be run before fitting_classifier")
        if self.simplificador:
            self.clf = self.modelo.fit(self.X_tr,self.y_tr)
        else:
            self.clf = self.modelo

    def pred_metr(self):
        """Predict the test split with ``self.clf`` and print the metrics."""
        # Prediction always uses the full test features, regardless of
        # whether tuning used the reduced set.
        self.y_pred = self.clf.predict(self.X_tst)
        self.metrics()

    def metrics(self):
        """Print evaluation metrics comparing ``self.y_pred`` with ``self.y_tst``."""
        y_true = self.y_tst
        y_pred = self.y_pred
        n_samples = np.shape(y_true)[0]
        # Number of labels taken from the data instead of a fixed constant.
        n_classes = np.shape(y_pred)[1]

        accuracy = mtc.accuracy_score(y_true,y_pred)
        hamming = mtc.hamming_loss(y_true,y_pred)
        precision = mtc.precision_score(y_true,y_pred,average='micro')
        print("Total accuracy: ", accuracy)
        print("Hamming loss: ", hamming)
        print("Precision: ", precision)
        print(mtc.classification_report(y_true,y_pred))

        # Mean of the per-label accuracies.
        per_class = [mtc.accuracy_score(y_true[:,i],y_pred[:,i]) for i in range(n_classes)]
        print("Accuracy media: ",sum(per_class)/n_classes)

        # Earth mover's distance with a uniform ground distance between
        # distinct labels (1/n_classes) and zero on the diagonal.
        distance_matrix = np.full((n_classes,n_classes),1.0/n_classes)
        np.fill_diagonal(distance_matrix,0)
        emd_total = 0
        for i in range(n_samples):
            emd_total = emd_total + emd(y_true[i,:].astype(float),y_pred[i,:].astype(float),distance_matrix)
        print("EMD: ", emd_total/n_samples)

        print("METRICAS DE LABEL RANKING: ")
        # Both coefficients are computed over the flattened label matrices.
        # Passing the 2-D arrays straight to spearmanr would treat every
        # column as a variable and return a correlation matrix rather than a
        # single coefficient; kendalltau already flattens its inputs.
        flat_true = np.ravel(y_true)
        flat_pred = np.ravel(y_pred)
        spearman = scipy.stats.spearmanr(flat_true, flat_pred)[0]
        print("Spearmans Rank Correlation Coefficient: ", spearman)
        kendall = scipy.stats.kendalltau(flat_true, flat_pred)[0]
        print("Kendall’s tau Correlation Coefficient: ",kendall)

    def dnn(self,batch,epochs):
        """Train a dense network on the full training set and print metrics.

        ``batch`` is the mini-batch size and ``epochs`` the maximum number of
        epochs; training stops early after 7 epochs without improvement of
        the validation loss. Predictions are thresholded at 0.5.
        """
        # One sigmoid output per label column.
        num_classes = np.shape(self.y_tr)[1]
        input_shape = (np.shape(self.X_tr)[1],)
        early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience= 7)

        self.model = Sequential()
        self.model.add(Dense(200, activation='relu',input_shape=input_shape))
        self.model.add(Dropout(0.3))
        self.model.add(Dense(100, activation='relu'))
        self.model.add(Dropout(0.3))
        self.model.add(Dense(100, activation='relu'))
        self.model.add(Dropout(0.3))
        self.model.add(Dense(num_classes, activation='sigmoid'))
        # summary() prints by itself; wrapping it in print() adds a stray "None".
        self.model.summary()
        self.model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])
        # NOTE(review): the test split doubles as validation data, so early
        # stopping is driven by the same data the metrics are reported on.
        self.history = self.model.fit(self.X_tr, self.y_tr,batch_size=batch,epochs=epochs,verbose=1,validation_data=(self.X_tst, self.y_tst),callbacks=[early_stop])
        probabilities = self.model.predict(self.X_tst)
        # Probabilities >= 0.5 become 1, the rest 0 (dtype is preserved).
        self.y_pred = (probabilities >= 0.5).astype(probabilities.dtype)
        self.metrics()
        
        


In [27]:
def _ask_number(prompt, cast=int, choices=None):
    """Prompt until the answer converts with ``cast`` (and is in ``choices``, if given)."""
    while True:
        try:
            value = cast(input(prompt))
        except ValueError:
            # A typo must not abort the whole interactive session.
            print('Valor no valido, intentalo de nuevo')
            continue
        if choices is not None and value not in choices:
            print('Opcion no valida, intentalo de nuevo')
            continue
        return value


# Interactive driver: load the dataset once, then loop over experiments until
# the user asks to stop.
path_string=input('Introduce el directorio del dataset')
clasificacion=TFM(path_string)
print('Inicio del programa clasificador')

while True:
    # 0 -> classical classifier with grid search, 1 -> dense neural network.
    ask6=_ask_number('Vas a usar clasificación normal o redes neuronales? (0 o 1) :', choices=(0, 1))
    if (ask6==0):
        ask2=_ask_number('Que clasificador vas a usar?: ')
        clasificador=valores.clasificadores_dict[ask2]
        metod=_ask_number('Que metodo vas a usar?: Introduce 0 para One vs Rest, o 1 para Classifier Chain', choices=(0, 1))

        while True:
            ask1=input(('Quieres usar simplificacion para entrenar el algoritmo?')).lower()
            if (ask1=='si'):
                # Kept as text: it only selects the features<N>.txt file to load.
                non_zero_data=input(('Introduce el numero de elementos no nulos por categoria que consideras aceptable'))
                training_size=_ask_number('Introduce el porcentaje de tamano de datos de entrenamiento que quieres usar para simplificar', cast=float)
                print('Realizando simplificacion del dataset...')
                clasificacion.simplify_dataset(non_zero_data,training_size)
                clasificacion.use_simplify(True)
            elif (ask1=='no'):
                clasificacion.use_simplify(False)
            else:
                # Ask again instead of silently tuning with whatever
                # simplification setting was left from a previous round.
                print("Respuesta no reconocida, responde 'si' o 'no'")
                continue
            print("Iniciando tunning de los parámetros:")
            clasificacion.grid_search_cv(clasificador,valores.parametros_dict[(clasificador,metod)],metod)    
            print("Tunning finalizado:")  
            ask5=input('Quieres continuar con el entrenamiento?')
            if (ask5.lower()=='no'):
                break

            print("Entrenando el algoritmo con los mejores parametros...")
            clasificacion.fitting_classifier()
            print("Entrenamiento finalizado, se procede a predecir las etiquetas")
            print("Mostrando metricas obtenidas")
            clasificacion.pred_metr()
            # Only an explicit 'no' leaves the simplification loop.
            ask3=input('Quieres probar a variar la simplificacion?')
            if (ask3.lower()=='no'):
                break

        # Only an explicit 'no' ends the program; any other answer starts over.
        ask4=input('Quieres probar con otro clasificador o finalizamos el programa?')
        if (ask4.lower()=='no'):
            break
    elif (ask6==1):
        ask7 =_ask_number('Introduce el batch size: ')
        ask8 =_ask_number('Introduce epochs: ')
        clasificacion.dnn(ask7,ask8)
        ask9 = input('Quieres probar a entrenar con algo diferente o salir? :')
        if (ask9.lower() =='no'):
            break
        
print("Fin del programa clasificador")

Introduce el directorio del dataset../../Datasets/dataset/
Dataset cargado y preprocesado
Inicio del programa clasificador
Vas a usar clasificación normal o redes neuronales? (0 o 1) :1
Introduce el batch size: 128
Introduce epochs: 20
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_31 (Dense)             (None, 200)               8901600   
_________________________________________________________________
dropout_23 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_32 (Dense)             (None, 100)               20100     
_________________________________________________________________
dropout_24 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_33 (Dense)             (None, 100)               10100     
______________________________________