In [28]:
### IMPORTS ###
import scipy.sparse
import numpy as np
import matplotlib.pyplot as pyplot
import sklearn.metrics as mtc
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [29]:
## FUNCIONES ##


class TFM:

    def __init__(self,path):
        self.X_tr = scipy.sparse.load_npz(path+'X_tr.npz')
        self.X_tst = scipy.sparse.load_npz(path+'X_tst.npz')
        self.y_tr = np.load(path+'y_tr.npy')
        self.y_tst = np.load(path+'y_tst.npy')
        # Se binarizan las caracteristicas 
        self.y_tr=MultiLabelBinarizer().fit_transform(self.y_tr)
        self.y_tst=MultiLabelBinarizer().fit_transform(self.y_tst)
        # Escalado 
        scaler = MaxAbsScaler().fit(self.X_tr)
        self.X_tr=scaler.transform(self.X_tr)
        self.X_tst=scaler.transform(self.X_tst)
        self.X_tr_simpli= None
        self.simplificador = False
        self.modelo = None
        self.clf= None
        print ('Dataset cargado y preprocesado')

    def reduce_features(self,num_data):
        columns=np.array([])
        for i in range(np.shape(self.X_tr)[1]):
            if self.X_tr[:,i].count_nonzero()> num_data:
                columns=np.append(columns,i)
        np.savetxt('features'+num_data+'.txt',columns,delimiter=',')
        print('Las características han sido reducidas')

    def simplify_dataset(self,features,tr_size):
        self.X_tr_simpli=self.X_tr
        self.y_tr_simpli=self.y_tr
        columns=np.loadtxt('features'+features+'.txt',delimiter=',')
        self.X_tr_simpli=self.X_tr[:,columns]
        self.X_tr_simpli, __, self.y_tr_simpli, __ = train_test_split(self.X_tr_simpli, self.y_tr_simpli, test_size=(1-tr_size), random_state=42)
        print('Simplificación finalizada')

    def use_simplify(self,boolean):
        self.simplificador= boolean
        
    def grid_search_cv(self,classifier,parameters,metodo):
        if (metodo == 0):
            classif = OneVsRestClassifier(classifier)
        elif (metodo==1):
            classif = ClassifierChain(classifier)
        model_tunning = GridSearchCV(classif, param_grid=parameters,cv=4,verbose=100)
        if (self.simplificador):
            model_tunning.fit(self.X_tr_simpli, self.y_tr_simpli)
        else :
            model_tunning.fit(self.X_tr, self.y_tr)
        print (model_tunning.best_score_)
        print (model_tunning.best_params_)
        self.modelo = model_tunning.best_estimator_

    def fitting_classifier(self):
        self.clf=self.modelo
        self.clf.fit(self.X_tr,self.y_tr)
    
    def metrics(self):
        y_pred=self.clf.predict(self.X_tst)
        accuracy=mtc.accuracy_score(self.y_tst,y_pred)
        hamming=mtc.hamming_loss(self.y_tst,y_pred)
        precision=mtc.precision_score(self.y_tst,y_pred,average='micro')
        print("Total accuracy: ", accuracy)
        print("Hamming loss: ", hamming)
        print("Precision: ", precision)
        print(mtc.classification_report(self.y_tst,y_pred))
        print("Accuracy per class:")
        aux=0
        for i in range(np.shape(y_pred)[1]):
            print ("Class " ,i,": " ,mtc.accuracy_score(self.y_tst[:,i],y_pred[:,i]))
            aux=aux+mtc.accuracy_score(self.y_tst[:,i],y_pred[:,i])
        print("Accuracy media: ",aux/37)



In [None]:

parameters0 = {
    "estimator__C": [1,10,100,1000],
    "estimator__solver": ["liblinear"],
    "estimator__tol": [0.1,0.01,0.001,0.0001],
    "estimator__class_weight":[None]
}
parameters1 = {
    "base_estimator__C": [1,10,100,1000],
    "base_estimator__solver": ["liblinear"],
    "base_estimator__tol": [0.1,0.01,0.001,0.0001],
    "base_estimator__max_iter":[10000]
}
parameters2 = {
    "estimator__C": [0.1, 1, 10],
    "estimator__kernel": ["linear"],
    "estimator__gamma": [0.01, 0.1, 1]
}
parameters3 = {
    "base_estimator__C": [4],
    "base_estimator__solver": ["liblinear"],
    "base_estimator__tol": [0.01],
    "base_estimator__max_iter":[10000]
}

"""
parameters2 = {
    "estimator__C": [0.1,1,10,100,1000],
    "estimator__kernel": ["linear","rbf","poly"],
    "estimator__gamma": [0.1,1,10,100],
    "estimator__degree":[2,3,4,5]
}
"""
path_string=raw_input('Introduce el directorio del dataset')
clasificacion=TFM(path_string)
print('Inicio del programa clasificador')

while True:
    
    ask2=int(raw_input('Que clasificador vas a usar?: Introduce 0 para Logistic Regression, o 1 para SVM'))
    if (ask2==0):
        clasificador=LogisticRegression()
    elif (ask2==1):
        clasificador=SVC()
    metod=int(raw_input('Que metodo vas a usar?: Introduce 0 para One vs Rest, o 1 para Classifier Chain'))
        
    while True:
        ask1=raw_input(('Quieres usar simplificacion para entrenar el algoritmo?'))
        if (ask1.lower()=='si'):
            non_zero_data=raw_input(('Introduce el numero de elementos no nulos por categoria que consideras aceptable'))
            training_size=float(raw_input('Introduce el porcentaje de tamano de datos de entrenamiento que quieres usar para simplificar'))
            print('Realizando simplificacion del dataset...')
            clasificacion.simplify_dataset(non_zero_data,training_size)
            clasificacion.use_simplify(True)
        elif (ask1.lower()=='no'):
            clasificacion.use_simplify(False)
        print("Iniciando tunning de los parámetros:")
        if (ask2==0 and metod == 0):
            clasificacion.grid_search_cv(clasificador,parameters0,metod)
        elif (ask2==0 and metod == 1):
            clasificacion.grid_search_cv(clasificador,parameters1,metod)
        elif (ask2==1 and metod == 0):
            clasificacion.grid_search_cv(clasificador,parameters2,metod)
        elif (ask2==1 and metod == 1):
            clasificacion.grid_search_cv(clasificador,parameters3,metod)
        print("Tunning finalizado:")  
        print("Entrenando el algoritmo con los mejores parametros...")
        clasificacion.fitting_classifier()
        print("Entrenamiento finalizado, se procede a predecir las etiquetas")
        print("Mostrando metricas obtenidas")
        clasificacion.metrics()
        ask3=raw_input('Quieres probar a variar la simplificacion?')
        if (ask3.lower()=='si'):
            continue
        elif (ask3.lower()=='no'):
            break
            
    ask4=raw_input('Quieres probar con otro clasificador o finalizamos el programa?')
    if (ask4.lower()=='si'):
        continue
    elif (ask4.lower()=='no'):
        break

print("Fin del programa clasificador")
    



Introduce el directorio del dataset../../Datasets/dataset/
Dataset cargado y preprocesado
Inicio del programa clasificador
Que clasificador vas a usar?: Introduce 0 para Logistic Regression, o 1 para SVM1
Que metodo vas a usar?: Introduce 0 para One vs Rest, o 1 para Classifier Chain0
Quieres usar simplificacion para entrenar el algoritmo?si
Introduce el numero de elementos no nulos por categoria que consideras aceptable1000
Introduce el porcentaje de tamano de datos de entrenamiento que quieres usar para simplificar0.1
Realizando simplificacion del dataset...
Simplificación finalizada
Iniciando tunning de los parámetros:
Fitting 4 folds for each of 9 candidates, totalling 36 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] estimator__kernel=linear, estimator__C=0.1, estimator__gamma=0.01 


In [10]:
#path_string='../../Datasets/dataset/'
#mlp=MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(300,200,100,50), random_state=0)