In [2]:
## IMPORTS ##
import scipy.sparse
import numpy as np
import matplotlib.pyplot as pyplot
import sklearn.metrics as mtc
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.neural_network import MLPClassifier

In [50]:
## FUNCIONES ##

class TFM:
    """Multi-label classification workflow: load, preprocess, tune, fit and evaluate.

    Typical call order, as used by the cells of this notebook:
    ``TFM(path)`` -> (optional) ``reduce_features`` / ``simplify_dataset`` /
    ``use_simplify`` -> ``grid_search_cv`` -> ``fitting_classifier`` -> ``metrics``.
    """

    def __init__(self,path):
        """Load the train/test splits found under ``path`` and preprocess them.

        Parameters
        ----------
        path : str
            Prefix that is concatenated directly with the file names
            ``X_tr.npz``, ``X_tst.npz``, ``y_tr.npy`` and ``y_tst.npy``, so a
            directory must include its trailing separator.
        """
        self.X_tr = scipy.sparse.load_npz(path+'X_tr.npz')
        self.X_tst = scipy.sparse.load_npz(path+'X_tst.npz')
        self.y_tr = np.load(path+'y_tr.npy')
        self.y_tst = np.load(path+'y_tst.npy')
        # Binarize the labels. The binarizer is fitted on the training labels
        # only and then reused for the test labels, so both indicator matrices
        # share the same columns in the same order. Fitting a second binarizer
        # on the test labels can yield a different column set/order.
        self.binarizer = MultiLabelBinarizer().fit(self.y_tr)
        self.y_tr = self.binarizer.transform(self.y_tr)
        self.y_tst = self.binarizer.transform(self.y_tst)
        # Scaling: fitted on the training features, applied to both splits.
        scaler = MaxAbsScaler().fit(self.X_tr)
        self.X_tr=scaler.transform(self.X_tr)
        self.X_tst=scaler.transform(self.X_tst)
        # Reduced training subset produced by simplify_dataset().
        self.X_tr_simpli = None
        self.y_tr_simpli = None
        # When True, grid_search_cv() tunes on the reduced subset.
        self.simplificador = False
        # Best estimator found by grid_search_cv().
        self.modelo = None
        # Estimator fitted by fitting_classifier() and used by metrics().
        self.clf = None
        print ('Dataset cargado y preprocesado')

    @staticmethod
    def _features_path(tag):
        """Return the name of the feature-index file identified by ``tag``."""
        return 'features' + str(tag) + '.txt'

    def reduce_features(self,num_data):
        """Save the indices of the columns of ``X_tr`` with enough non-zero values.

        A column is kept when its number of non-zero entries is strictly
        greater than ``num_data``. The indices are written, one integer per
        line, to ``'features' + str(num_data) + '.txt'`` in the current
        working directory.

        Parameters
        ----------
        num_data : int or str
            Threshold; it is converted with ``int()`` for the comparison and
            with ``str()`` for the file name, so ``250`` and ``'250'`` behave
            the same.
        """
        threshold = int(num_data)
        # One vectorized count per column instead of slicing column by column.
        nonzero_per_column = np.asarray((self.X_tr != 0).sum(axis=0)).ravel()
        columns = np.flatnonzero(nonzero_per_column > threshold)
        np.savetxt(self._features_path(num_data), columns, fmt='%d', delimiter=',')
        print('Las características han sido reducidas')

    def simplify_dataset(self,features,tr_size):
        """Build the reduced training subset used for hyper-parameter search.

        Keeps only the columns listed in ``'features' + str(features) + '.txt'``
        and a ``tr_size`` fraction of the training rows. The result is stored
        in ``self.X_tr_simpli`` / ``self.y_tr_simpli``; ``self.X_tr`` and
        ``self.y_tr`` are not modified.

        Parameters
        ----------
        features : int or str
            Tag of the feature-index file written by ``reduce_features``.
        tr_size : float
            Fraction of the training rows to keep. ``1`` keeps every row.
        """
        # Load as float first (files written by earlier versions store the
        # indices in float notation) and cast: indexing needs integer indices.
        # atleast_1d covers a file that holds a single index.
        columns = np.atleast_1d(
            np.loadtxt(self._features_path(features), delimiter=',')
        ).astype(int)
        X_reduced = self.X_tr[:, columns]
        if tr_size == 1:
            # train_test_split rejects an empty test split, so keep everything.
            self.X_tr_simpli, self.y_tr_simpli = X_reduced, self.y_tr
        else:
            self.X_tr_simpli, _x_rest, self.y_tr_simpli, _y_rest = train_test_split(
                X_reduced, self.y_tr, test_size=(1-tr_size), random_state=42)
        print('Simplificación finalizada')

    def use_simplify(self,boolean):
        """Select whether ``grid_search_cv`` tunes on the reduced subset."""
        self.simplificador= boolean

    def grid_search_cv(self,classifier,parameters,metodo):
        """Tune ``classifier`` with a 4-fold grid search and keep the best model.

        Parameters
        ----------
        classifier : estimator
            Base estimator to wrap.
        parameters : dict
            Parameter grid passed to ``GridSearchCV``; the keys must address
            the wrapped estimator.
        metodo : int
            ``0`` wraps the classifier in ``OneVsRestClassifier``, ``1`` in
            ``ClassifierChain``.

        Raises
        ------
        ValueError
            If ``metodo`` is neither 0 nor 1, or if the reduced subset was
            requested through ``use_simplify(True)`` but ``simplify_dataset``
            has not been run.
        """
        if (metodo == 0):
            classif = OneVsRestClassifier(classifier)
        elif (metodo==1):
            classif = ClassifierChain(classifier)
        else:
            raise ValueError(
                "metodo must be 0 (OneVsRestClassifier) or 1 (ClassifierChain), "
                "got %r" % (metodo,))
        if (self.simplificador):
            if self.X_tr_simpli is None or self.y_tr_simpli is None:
                raise ValueError(
                    "use_simplify(True) was requested but simplify_dataset() "
                    "has not been run")
            X_fit, y_fit = self.X_tr_simpli, self.y_tr_simpli
        else :
            X_fit, y_fit = self.X_tr, self.y_tr
        model_tunning = GridSearchCV(classif, param_grid=parameters,cv=4)
        model_tunning.fit(X_fit, y_fit)
        print (model_tunning.best_score_)
        print (model_tunning.best_params_)
        self.modelo = model_tunning.best_estimator_

    def fitting_classifier(self):
        """Fit the tuned model on the complete training set.

        The estimator selected by ``grid_search_cv`` is refitted on
        ``self.X_tr`` / ``self.y_tr`` (all rows, all columns), regardless of
        whether the search itself ran on the reduced subset. ``self.clf`` and
        ``self.modelo`` refer to the same estimator object.

        Raises
        ------
        ValueError
            If ``grid_search_cv`` has not been run yet.
        """
        if self.modelo is None:
            raise ValueError("grid_search_cv() must be run before fitting_classifier()")
        self.clf=self.modelo
        self.clf.fit(self.X_tr,self.y_tr)

    def metrics(self):
        """Print evaluation metrics of the fitted classifier on the test split.

        Prints subset accuracy, Hamming loss, micro-averaged precision, the
        classification report, the accuracy of every label column and the
        mean of those per-label accuracies.

        Raises
        ------
        ValueError
            If ``fitting_classifier`` has not been run yet.
        """
        if self.clf is None:
            raise ValueError("fitting_classifier() must be run before metrics()")
        y_pred=self.clf.predict(self.X_tst)
        accuracy=mtc.accuracy_score(self.y_tst,y_pred)
        hamming=mtc.hamming_loss(self.y_tst,y_pred)
        precision=mtc.precision_score(self.y_tst,y_pred,average='micro')
        print("Total accuracy: ", accuracy)
        print("Hamming loss: ", hamming)
        print("Precision: ", precision)
        print(mtc.classification_report(self.y_tst,y_pred))
        print("Accuracy per class:")
        n_classes = np.shape(y_pred)[1]
        per_class = []
        for i in range(n_classes):
            # Computed once and reused for both the printout and the mean.
            class_accuracy = mtc.accuracy_score(self.y_tst[:,i],y_pred[:,i])
            print ("Class " ,i,": " ,class_accuracy)
            per_class.append(class_accuracy)
        # Average over the actual number of label columns rather than a fixed count.
        mean_accuracy = sum(per_class)/float(n_classes) if n_classes else float('nan')
        print("Accuracy media: ",mean_accuracy)


In [51]:
# Experiment configuration.
path_string = '../../Datasets/dataset/'
non_zero_data = '250'
training_size = 0.1

# Multi-label strategy passed to TFM.grid_search_cv:
# 0 -> OneVsRestClassifier, 1 -> ClassifierChain.
metod = 0
clasificador = LogisticRegression()
#mlp=MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(300,200,100,50), random_state=0)

# Parameter grid whose keys use the `estimator__` prefix.
parameters0 = dict(
    estimator__C=[1],
    estimator__solver=["liblinear"],
    estimator__tol=[0.01],
    estimator__class_weight=[None],
)

# Parameter grid whose keys use the `base_estimator__` prefix.
parameters1 = dict(
    base_estimator__C=[4],
    base_estimator__solver=["liblinear"],
    base_estimator__tol=[0.01],
    base_estimator__max_iter=[10000],
)

# Load and preprocess the dataset found under `path_string`.
clasificacion = TFM(path_string)

Dataset cargado y preprocesado


In [52]:
# Build the reduced training subset: keep the columns listed in
# 'features500.txt' and a `training_size` fraction of the training rows, then
# make grid_search_cv use that subset.
# NOTE(review): the configuration cell defines non_zero_data = '250' but the
# literal '500' is used here -- confirm which threshold is intended.
# NOTE(review): 'features500.txt' must already exist; presumably it was written
# by an earlier reduce_features run -- verify before a fresh Run All.
#clasificacion.reduce_features(10000)
clasificacion.simplify_dataset('500',training_size)
clasificacion.use_simplify(True)

Simplificación finalizada


In [53]:
# Pick the parameter grid that matches the selected multi-label strategy:
# parameters0 addresses the wrapped estimator as `estimator__...` (metod 0),
# parameters1 as `base_estimator__...` (metod 1). Passing parameters0 with
# metod 1 would hand GridSearchCV keys that do not match the wrapper.
clasificacion.grid_search_cv(clasificador, parameters0 if metod == 0 else parameters1, metod)

0.2597402597402597
{'estimator__class_weight': None, 'estimator__C': 1, 'estimator__solver': 'liblinear', 'estimator__tol': 0.01}


In [54]:
# Refit the estimator selected by the grid search on the full training data
# (fitting_classifier fits on clasificacion.X_tr / clasificacion.y_tr).
clasificacion.fitting_classifier()

In [55]:
# Predict on the test split and print overall and per-class metrics.
clasificacion.metrics()

('Total accuracy: ', 0.39135317237507017)
('Hamming loss: ', 0.02901497792008741)
('Precision: ', 0.7681099084096586)
              precision    recall  f1-score   support

           0       0.86      0.70      0.77       109
           1       0.78      0.46      0.58       138
           2       0.77      0.64      0.70       224
           3       0.76      0.55      0.64       180
           4       0.82      0.69      0.75       177
           5       0.75      0.63      0.69       262
           6       0.65      0.47      0.55        55
           7       0.79      0.67      0.73       341
           8       0.64      0.52      0.57        91
           9       0.60      0.38      0.47        65
          10       0.72      0.66      0.69       181
          11       0.57      0.16      0.25        25
          12       0.69      0.51      0.58       172
          13       0.67      0.20      0.31        10
          14       0.91      0.68      0.78       173
          15     

In [None]:
# SECOND SOLUTION
# Hyper-parameter tuning
# NOTE(review): `classifier_chain_tunning`, `X_aux` and `y_aux` are not defined
# in any visible cell, so this cell fails on a fresh kernel unless they are
# defined elsewhere -- confirm, or port this cell to TFM.grid_search_cv.
parameters = {
    "base_estimator__C": [1,10,100,1000],
    "base_estimator__solver": ["liblinear"],
    "base_estimator__tol": [0.1,0.01,0.001,0.0001],
    "base_estimator__max_iter":[10000]
}

modelo2=classifier_chain_tunning(X_aux,y_aux,LogisticRegression(),parameters)