In [1]:
## IMPORTS ##
import scipy.sparse
import numpy as np
import matplotlib.pyplot as pyplot
import sklearn.metrics as mtc
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain

In [2]:
X_tr = scipy.sparse.load_npz('../../Datasets/dataset/X_tr.npz')
X_tst = scipy.sparse.load_npz('../../Datasets/dataset/X_tst.npz')
y_tr = np.load('../../Datasets/dataset/y_tr.npy')
y_tst = np.load('../../Datasets/dataset/y_tst.npy')
# Se binarizan las caracteristicas 
y_tr=MultiLabelBinarizer().fit_transform(y_tr)
y_tst=MultiLabelBinarizer().fit_transform(y_tst)
# Escalado 
scaler = MaxAbsScaler().fit(X_tr)
X_tr=scaler.transform(X_tr)
X_tst=scaler.transform(X_tst)


In [73]:
# REDUCCIÓN DE DIMENSIÓN PRIMERA TÉCNICA (al final no se usó)
svd = TruncatedSVD(n_components=17, n_iter=10, random_state=42)
X_tr = svd.fit_transform(X_tr)  
X_tst = svd.transform(X_tst)  
print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())
print(svd.singular_values_)



ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [26]:
#REDUCCION TAMAÑO TRAINING Y ELIMINAR CARACTERISTICAS INSERVIBLES

columns=np.array([])
for i in range(np.shape(X_tr)[1]):
    if X_tr[:,i].count_nonzero()> 250:
        columns=np.append(columns,i)


In [27]:
#Se guardan las caracteristicas mas importantes para evitar repetir este proceso siempre
np.savetxt('features250.txt',columns,delimiter=',')

print(np.shape(columns))


(4295,)


In [16]:
#Se cargan las caracteristicas guardadas
columns=np.loadtxt('features500.txt',delimiter=',')
X_tr2=X_tr[:,columns]

In [30]:
#Reducir tamaño de muestra para un tunning mas rapido
X_aux, __, y_aux, __ = train_test_split(X_tr2, y_tr, test_size=0.1, random_state=42)


In [8]:
## Clasificador de regresión logística 
classif = OneVsRestClassifier(LogisticRegression())
parameters = {
    "estimator__C": [10],
    "estimator__solver": ["liblinear"],
    "estimator__tol": [0.1]
}

model_tunning = GridSearchCV(classif, param_grid=parameters,cv=4)

model_tunning.fit(X_aux, y_aux)

print (model_tunning.best_score_)
print (model_tunning.best_params_)


NameError: name 'X_aux' is not defined

In [6]:
#PRIMERA SOLUCIÓN 
classif = OneVsRestClassifier(LogisticRegression())
parameters = {
    "estimator__C": [1],
    "estimator__solver": ["liblinear"],
    "estimator__tol": [0.01],
    "estimator__class_weight":[None]
}

model_tunning = GridSearchCV(classif, param_grid=parameters,cv=4)

model_tunning.fit(X_tr, y_tr)

print (model_tunning.best_score_)
print (model_tunning.best_params_)

0.3978115786224441
{'estimator__C': 1, 'estimator__class_weight': None, 'estimator__solver': 'liblinear', 'estimator__tol': 0.01}


In [6]:
classif=OneVsRestClassifier(LogisticRegression(C=1,tol=0.01,solver='liblinear'))
classif.fit(X_tr,y_tr)



OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.01, verbose=0, warm_start=False),
          n_jobs=None)

In [14]:
y_pred=classif.predict(X_tst)
accuracy=mtc.accuracy_score(y_tst,y_pred)
hamming=mtc.hamming_loss(y_tst,y_pred)
precision=mtc.precision_score(y_tst,y_pred,average='micro')
print("Total accuracy: ", accuracy)
print("Hamming loss: ", hamming)
print("Precision: ", precision)
print(mtc.classification_report(y_tst,y_pred))
print("Accuracy per class:")
aux=0
for i in range(np.shape(y_pred)[1]):
    print ("Class " ,i,": " ,mtc.accuracy_score(y_tst[:,i],y_pred[:,i]))
    aux=aux+mtc.accuracy_score(y_tst[:,i],y_pred[:,i])
print("Accuracy media: ",aux/37)


Total accuracy:  0.39247613700168443
Hamming loss:  0.02899980272243046
Precision:  0.7686536056690287
              precision    recall  f1-score   support

           0       0.86      0.70      0.77       109
           1       0.78      0.46      0.58       138
           2       0.78      0.64      0.70       224
           3       0.76      0.55      0.64       180
           4       0.82      0.69      0.75       177
           5       0.75      0.63      0.69       262
           6       0.65      0.47      0.55        55
           7       0.79      0.67      0.72       341
           8       0.64      0.52      0.57        91
           9       0.60      0.38      0.47        65
          10       0.72      0.66      0.69       181
          11       0.57      0.16      0.25        25
          12       0.69      0.51      0.58       172
          13       0.67      0.20      0.31        10
          14       0.91      0.68      0.78       173
          15       0.65      0.3

In [36]:
#SEGUNDA SOLUCIÓN
chain = ClassifierChain(LogisticRegression(C=10,tol=0.001,solver='liblinear', max_iter= 100))
chain.fit(X_tr, y_tr)
y_pred2=chain.predict(X_tst)


In [5]:
#tunning
#Se cargan las caracteristicas guardadas
columns=np.loadtxt('features250.txt',delimiter=',')
X_tr2=X_tr[:,columns]
#Reducir tamaño de muestra para un tunning mas rapido
X_aux, __, y_aux, __ = train_test_split(X_tr2, y_tr, test_size=0.1, random_state=42)
classif =  ClassifierChain(LogisticRegression())
parameters = {
    "base_estimator__C": [1,10,100,1000],
    "base_estimator__solver": ["liblinear"],
    "base_estimator__tol": [0.1,0.01,0.001,0.0001],
    "base_estimator__max_iter":[10000]
}

model_tunning = GridSearchCV(classif, param_grid=parameters,cv=4,scoring='accuracy')
#print(model_tunning.get_params().keys())

model_tunning.fit(X_aux, y_aux)

print (model_tunning.best_score_)
print (model_tunning.best_params_)


KeyboardInterrupt: 

In [37]:
accuracy=mtc.accuracy_score(y_tst,y_pred2)
hamming=mtc.hamming_loss(y_tst,y_pred2)
precision=mtc.precision_score(y_tst,y_pred2,average='micro')
print("Total accuracy: ", accuracy)
print("Hamming loss: ", hamming)
print("Precision: ", precision)
print(mtc.classification_report(y_tst,y_pred))
print("Accuracy per class:")

aux=0
for i in range(np.shape(y_pred2)[1]):
    print ("Class " ,i,": " ,mtc.accuracy_score(y_tst[:,i],y_pred2[:,i]))
    aux=aux+mtc.accuracy_score(y_tst[:,i],y_pred2[:,i])
print("Accuracy media: ",aux/37)

Total accuracy:  0.39135317237507017
Hamming loss:  0.03041109610452676
Precision:  0.7195301027900147
              precision    recall  f1-score   support

           0       0.86      0.70      0.77       109
           1       0.78      0.46      0.58       138
           2       0.78      0.64      0.70       224
           3       0.76      0.55      0.64       180
           4       0.82      0.69      0.75       177
           5       0.75      0.63      0.69       262
           6       0.65      0.47      0.55        55
           7       0.79      0.67      0.72       341
           8       0.64      0.52      0.57        91
           9       0.60      0.38      0.47        65
          10       0.72      0.66      0.69       181
          11       0.57      0.16      0.25        25
          12       0.69      0.51      0.58       172
          13       0.67      0.20      0.31        10
          14       0.91      0.68      0.78       173
          15       0.65      0.3

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
#RBM -- SE IMPLEMENTARÁ CON UN PIPELINE PERO AÚN FALTA ENTENDER MEJOR EL FUNCIONAMIENTO DE BERNOULLIRBM -- 
#-- NO EJECUTAR-- #
logistic = OneVsRestClassifier(LogisticRegression())
rbm = BernoulliRBM(random_state=0, verbose=True)
rbm_features_classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
rbm.learning_rate = 0.06
rbm.n_iter = 20
rbm.n_components = 100
rbm_features_classifier.fit(X_tr, y_tr)
Y_pred2 = rbm_features_classifier.predict(X_tst)
