In [None]:
# Utils
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt

import numpy as np
import time
import pickle
import re, sys, unidecode

from sklearn.metrics import accuracy_score
from IPython.display import display, Markdown, Latex

import shutil
import os

import scikitplot as skplt

from tensorflow.contrib.learn import DNNClassifier



# Parameters

In [None]:
# Locations of the trained models and of the pre-computed feature pickle,
# both relative to the notebook's working directory.
root_path = "../"
path_model = root_path + 'models/seccion'
features_path = root_path + 'data/features/data_tfid_hash28_n1000_SVD2.p'

# When True, the model directory is wiped and recreated empty before training.
delete_old_model = True
if delete_old_model:
    try:
        # Pure-Python equivalents of `rm -rf` / `mkdir`: no shell string is
        # built from the path, and failures surface as OSError instead of an
        # ignored exit status (os.system never raises).
        shutil.rmtree(path_model, ignore_errors=True)
        os.makedirs(path_model, exist_ok=True)
    except OSError as exc:
        # Best effort, as before: report the problem and keep going.
        print("error", exc)

# Load Data

In [None]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load feature files produced by a trusted pipeline.
# The context manager guarantees the file handle is closed after loading.
with open(features_path, "rb") as features_file:
    X, y1, y2, y3 = pickle.load(features_file)


In [None]:
# Display the raw values of the first label set.
# Assumes y1 still exposes `.values` (e.g. a pandas Series) at this point — TODO confirm.
y1.values

# Preprocessing

Clean the data and select the classes.

In [None]:
# Transliterate the second label set to plain ASCII with unidecode so that
# accented and unaccented spellings collapse into one class; missing labels
# (None) are kept as None and filtered out later.
y2 = np.array(y2)
y_2 = np.array([unidecode.unidecode(label) if label is not None else None
                for label in y2])

# np.asarray instead of `.values`: the result is an array either way, but this
# cell can now be re-run safely (an ndarray has no `.values` attribute).
y1 = np.asarray(y1)

# Feature Extraction

### Data representation

TF-IDF calculation

### Filter classes

In [None]:
# Distinct first-level labels. Sorted because set iteration order for strings
# can change between interpreter runs, and this list drives the order of every
# per-label loop in the notebook (including the order in which the seeded RNG
# is consumed), so a stable order is needed for reproducible results.
labels = sorted(set(y1))
labels

In [None]:
def filterClases(X,y,umbral):
    """Keep only the samples whose class has more than `umbral` examples.

    Parameters
    ----------
    X : row-indexable 2-D container (e.g. numpy array or scipy sparse matrix)
        Feature rows, aligned position by position with `y`.
    y : numpy.ndarray
        1-D array of hashable class labels.
    umbral : int
        Threshold; a class is kept only when its number of examples is
        strictly greater than this value.

    Returns
    -------
    tuple
        `(X, y)` restricted to the surviving classes, in the original order.
    """
    # Count every class in a single pass instead of re-scanning the whole
    # array once per distinct label.
    counts = {}
    for label in y:
        counts[label] = counts.get(label, 0) + 1
    kept_classes = {label for label, count in counts.items() if count > umbral}

    # Boolean mask over the samples; an explicit bool dtype keeps the mask
    # valid even when `y` is empty.
    keep = np.array([label in kept_classes for label in y], dtype=bool)
    return X[np.nonzero(keep)], y[keep]



In [None]:
# Per first-level label: the feature rows, second-level labels and class names
# that survive the minimum-examples filter.
y_temas = {}
X_temas = {}
labels_temas = {}
labels_temas_before = {}  # class names per key before the size filter
umbral_ejemplos = 100

# Explicit per-element None test: `array != None` only works element-wise on
# object arrays, so it is not relied upon here.
has_label = np.array([label is not None for label in y_2], dtype=bool)

for key in labels:
    # Samples of this first-level label that also carry a second-level label.
    index = (y1 == key) & has_label
    y_22 = y_2[index]
    X2 = X[np.nonzero(index)]

    X_temas[key], y_temas[key] = filterClases(X2, y_22, umbral_ejemplos)

    # Sorted so the class order (later passed to the classifier as label keys
    # and pickled with the model info) is stable between runs.
    labels_temas_before[key] = sorted(set(y_22))
    labels_temas[key] = sorted(set(y_temas[key]))

    print("\n"+key + " ,total : " + str(len(labels_temas[key])) \
          +" ,deleted : "+ str( len(labels_temas_before[key]) - len(labels_temas[key] )) )
    print(labels_temas[key])


### Output

In [None]:
# Random 80/20 train/test split, done independently for each first-level label.
np.random.seed(42)
train_fraction = 0.8

X_train = {}
y_train = {}
X_test = {}
y_test = {}
train_indices = {}
test_indices = {}

for key in labels:
    n_samples = X_temas[key].shape[0]
    train_indices[key] = np.random.choice(n_samples, round(train_fraction*n_samples), replace=False)
    # Test rows are the complement of THIS key's training rows. The previous
    # code subtracted set(train_indices), i.e. the dict's keys, so nothing was
    # removed and the test split contained every sample, training rows included.
    test_indices[key] = np.setdiff1d(np.arange(n_samples), train_indices[key])

    X_train[key] = X_temas[key][train_indices[key]]
    y_train[key] = y_temas[key][train_indices[key]]
    X_test[key] = X_temas[key][test_indices[key]]
    y_test[key] = y_temas[key][test_indices[key]]

In [None]:
# Per-sample class weights, one column vector per first-level label, split the
# same way as the features.
weights_train = {}
weights_test = {}

np.random.seed(42)

for key in labels:
    y_tr = y_train[key]
    train_classes = list(set(y_tr))
    n_classes = len(train_classes)

    # Number of training examples of each class.
    class_counts = {name: y_tr[y_tr == name].shape[0] for name in train_classes}

    # Balanced weight n_train / (class_count * n_classes), evaluated for every
    # sample of the filtered set (train and test rows alike).
    per_sample = [len(y_tr)/(class_counts[name]*n_classes) for name in y_temas[key]]
    column = np.asarray(per_sample)[:,np.newaxis]

    weights_train[key] = column[train_indices[key]]
    weights_test[key] = column[test_indices[key]]

    

# Model

### DNN graph generation

In [None]:
# Used both as `num_epochs` of the training input function and as the number
# of training steps passed to `fit`.
epochs = 14000

def get_train_inputs(key):
    """Return the training input function for the first-level label `key`.

    Feeds the dense training features under 'x' and the per-sample weights
    under 'class_weights', with the labels as a column vector, in shuffled
    batches of 50 repeated for `epochs` epochs.
    """
    features = {'x': X_train[key].todense(),'class_weights': weights_train[key]}
    targets = y_train[key][:,np.newaxis]
    return tf.estimator.inputs.numpy_input_fn(features,
                                              targets,
                                              batch_size=50,
                                              num_epochs=epochs,
                                              shuffle=True)

def get_test_inputs(key):
    """Return the evaluation input function for the first-level label `key`.

    Feeds the dense test features under 'x' and the per-sample weights under
    'class_weights', with the labels as a column vector, without shuffling.
    """
    features = {'x': X_test[key].todense(),'class_weights': weights_test[key]}
    targets = y_test[key][:,np.newaxis]
    return tf.estimator.inputs.numpy_input_fn(features, targets, shuffle=False)

### TensorFlow graph functions

In [None]:


tf.set_random_seed(42)
feature_columns = [tf.contrib.layers.real_valued_column('x', dimension=1000)]

# One DNN classifier per first-level label, each with its own model directory.
classifier = {}
path_all_models  = {}
for key in labels:
    model_dir = path_model+"/"+str(key)
    path_all_models[key] = model_dir

    # Checkpoint every 500 steps; time-based checkpointing is disabled.
    run_config = tf.contrib.learn.RunConfig(save_checkpoints_steps = 500,
                                            save_checkpoints_secs = None)

    classifier[key] = DNNClassifier(
        hidden_units=[2000],
        feature_columns=feature_columns,
        n_classes=len(labels_temas[key]),
        label_keys=labels_temas[key],
        weight_column_name='class_weights',
        dropout=0.5,
        model_dir=model_dir,
        config=run_config,
    )

# Training



In [None]:
print("start")
training_started = time.time()

# Evaluate every 500 steps and stop early once the loss has not improved for
# 2000 steps.
# NOTE(review): the monitor is fed by get_test_inputs, i.e. the same split that
# is later used to report accuracy, so that accuracy is likely optimistic —
# consider a separate validation split.
monitor_options = dict(
    every_n_steps=500,
    early_stopping_metric="loss",
    early_stopping_metric_minimize=True,
    early_stopping_rounds=2000,
)

for key in labels:
    validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
        input_fn=get_test_inputs(key), **monitor_options)

    classifier[key].fit(input_fn=get_train_inputs(key),
                        monitors=[validation_monitor],
                        steps=epochs,
                        max_steps=None)
    print("###################### "+key+" ######################")

training_finished = time.time()
print("Training time :" + str(training_finished - training_started) )


# Evaluation performance


In [None]:
# Predicted classes and test accuracy for each first-level label.
pred_test = {}
y_test_hat = {}
acc = {}

for key in labels:

    def input_fn_evaluate():
        """Feed the dense test features of the current `key` to the classifier."""
        return {'x': tf.constant(X_test[key].todense())}

    # Materialise the predictions right away: predict_classes appears to return
    # a lazy iterable by default (TODO confirm for the installed TF version), so
    # storing it directly would leave an already-consumed iterator in pred_test.
    # It also ensures input_fn_evaluate runs while `key` is still the current one.
    pred_test[key] = list(classifier[key].predict_classes(input_fn=input_fn_evaluate))

    # Predictions come back as bytes; decode them to compare with y_test.
    y_test_hat[key] = np.asarray([x.decode('UTF-8') for x in pred_test[key]]).astype(str)
    acc[key] = accuracy_score(y_true=y_test[key], y_pred=y_test_hat[key])


In [None]:
# For every first-level label: heading, test accuracy (in percent) and the
# normalised confusion matrix.
for key in labels:
    heading = '## ' + key
    display(Markdown(heading))

    accuracy_line = '## Accuracy in test: {} '.format(acc[key]*100)
    display(Markdown(accuracy_line))

    skplt.metrics.plot_confusion_matrix(
        y_test[key],
        y_test_hat[key],
        normalize=True,
        figsize=(20,20),
    )
    plt.xticks(rotation=45)
    plt.show()


### Save model info

In [None]:
# Persist the model directories and the class names of every classifier next
# to the models. The context manager guarantees the file is flushed and closed.
with open(path_model + "/info_model.p", "wb") as info_file:
    pickle.dump((path_all_models, labels_temas), info_file)