In [1]:
import os, sys, re, time, string, unicodedata

import tensorflow as tf
import pickle
import numpy as np
from tensorflow.contrib.learn import DNNClassifier
import time

# Representation
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from lime import lime_text
from lime.lime_text import LimeTextExplainer

# Load the Spanish stop-word list from the NLTK corpus.
# One-time setup if the corpus is not installed: nltk.download('stopwords')
spanish_stopwords = stopwords.words('spanish')



  from ._conv import register_converters as _register_converters


### funciones y clases "template"

Funciones y clases para normalizar el texto, calcular las features TF-IDF+SVD y evaluar con el clasificador DNN.

In [2]:

# Resources shared by every normalize_text call; filled lazily on first use so
# that defining this cell does not require the NLTK objects to exist yet.
_NORMALIZE_RESOURCES = {}


def _strip_accents(s):
    """Return `s` without combining marks (Unicode category 'Mn') after NFD decomposition."""
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')


def _get_normalize_resources():
    """Build once, then reuse, the punctuation table, stop-word set and stemmer."""
    if not _NORMALIZE_RESOURCES:
        # All values are created before update() runs, so a failure while
        # building one of them never leaves a half-filled cache behind.
        _NORMALIZE_RESOURCES.update(
            table=str.maketrans('', '', string.punctuation),
            stop_words=set(spanish_stopwords),
            stemmer=SnowballStemmer("spanish"),
        )
    return _NORMALIZE_RESOURCES


def normalize_text(text):
    """Normalize a Spanish text into a space-separated string of stems.

    Steps, in order: tokenize with NLTK, lower-case, delete the characters in
    `string.punctuation`, keep only purely alphabetic tokens, drop tokens found
    in `spanish_stopwords`, strip accents, and stem with the Spanish Snowball
    stemmer.

    Args:
        text: the raw text to normalize.

    Returns:
        A string in which every stem is followed by a single space (so a
        non-empty result ends with a trailing space); "" when no token survives.
    """
    resources = _get_normalize_resources()
    table = resources['table']
    stop_words = resources['stop_words']
    stemmer = resources['stemmer']

    # split into words
    tokens = nltk.tokenize.word_tokenize(text, language='spanish', preserve_line=False)

    # lower-case and remove punctuation from each word
    stripped = (token.lower().translate(table) for token in tokens)

    # remove remaining tokens that are not alphabetic, then stop words;
    # the stop-word test runs before accent stripping, on the accented form
    words = [_strip_accents(w) for w in stripped if w.isalpha() and w not in stop_words]

    # join() avoids the quadratic cost of growing a string with +=
    return "".join(stemmer.stem(word) + " " for word in words)

In [3]:
class tfidf_svd:
    """Feature extractor: normalize text, vectorize it, select columns, project.

    The vectorizer and the projection are only used through their `transform`
    methods; the column-selected matrix of the last call is kept in `self.tfidf`.
    """

    def __init__(self, wordbach_trained, svd_trained, non_zero_index_feat, normalize_text):
        """Store the collaborators used by `calc` and `calcBatch`.

        Args:
            wordbach_trained: object whose `transform(list_of_str)` vectorizes texts.
            svd_trained: object whose `transform(matrix)` projects the vectors.
            non_zero_index_feat: column index applied to the vectorized matrix.
            normalize_text: callable mapping a raw text to its normalized form.
        """
        self.wordbach = wordbach_trained
        self.svdT = svd_trained
        self.non_zero_index_feat = non_zero_index_feat
        self.normalize_text = normalize_text

    def calc(self, text):
        """Return the projected features of one text (a batch of size one)."""
        return self.calcBatch([text])

    def calcBatch(self, texts):
        """Return the projected features of every text in `texts`."""
        normalized = [self.normalize_text(raw) for raw in texts]

        self.tfidf = self.wordbach.transform(normalized)
        self.tfidf = self.tfidf[:, self.non_zero_index_feat]
        return self.svdT.transform(self.tfidf)

In [4]:
class DNN_eval:
    """Inference wrapper around a `DNNClassifier` built on `path_model`.

    The classifier takes a single real-valued feature column named 'x'.
    Inputs reach the estimator through `input_fn_evaluate`, which reads
    `self.vec_input`; `calc` and `calcBatch` set that attribute before
    predicting.
    """

    def __init__(self, labels, path_model, dim_vec_input, hidden_units=None):
        """Create the classifier.

        Args:
            labels: class labels, passed as `label_keys` and used to name predictions.
            path_model: passed to the classifier as `model_dir`.
            dim_vec_input: dimension of the 'x' feature column.
            hidden_units: hidden layer sizes; defaults to `[2000]`, the value
                this class always used. Presumably it must match the network
                stored in `path_model` - confirm before overriding.
        """
        self.path_model = path_model
        self.labels = labels

        self.nClasses = len(self.labels)
        self.hidden_units = [2000] if hidden_units is None else list(hidden_units)
        self.feature_columns = [tf.contrib.layers.real_valued_column('x', dimension=dim_vec_input)]
        self.classifier = DNNClassifier(
            n_classes=self.nClasses,
            label_keys=self.labels,
            feature_columns=self.feature_columns,
            hidden_units=self.hidden_units,
            model_dir=self.path_model,
        )

    def input_fn_evaluate(self):
        """Input function for the estimator: wraps `self.vec_input` as feature 'x'."""
        features = {'x': tf.constant(self.vec_input)}
        return features

    def _predict_proba(self, vec_input):
        """Store `vec_input` for the input function and return the predictions as a list."""
        self.vec_input = vec_input
        return list(self.classifier.predict_proba(input_fn=self.input_fn_evaluate))

    def calc(self, vec_input):
        """Predict the label of the first row of `vec_input`.

        Args:
            vec_input: feature matrix; only the first prediction is reported.

        Returns:
            Tuple `(label, probabilities)` for the first row, where `label` is
            the entry of `self.labels` with the highest probability.

        Raises:
            ValueError: if the classifier yields no prediction.
        """
        pred_prob = self._predict_proba(vec_input)
        if not pred_prob:
            raise ValueError("the classifier returned no predictions")
        first_row = pred_prob[0]
        # argmax over the first row only: over the whole list the flattened
        # index would run past len(labels) as soon as there are several rows.
        y_test_hat = self.labels[np.argmax(first_row)]
        return (y_test_hat, first_row)

    def calcBatch(self, vec_input):
        """Return the predicted probabilities for `vec_input` as one numpy array."""
        return np.array(self._predict_proba(vec_input))
    

In [5]:
# Locations of the pickled artifacts, all relative to the project root.
root_path = "../"

# Pickle with the feature-extraction objects.
file_name = "tfid_hash28_n10000_svd1000.p"
features_path = "".join([root_path, 'data/features/'])
features_path_ = "".join([features_path, "calcFeat_", file_name])

# Pickle with the model information.
path_model = "".join([root_path, 'models/test/info_model.p'])



### carga clase extractor features TFIDF+SVD

In [6]:
# NOTE: pickle.load can execute arbitrary code - only load feature files from a trusted source.
# The context manager closes the file handle even when unpickling fails.
with open(features_path_, "rb") as features_file:
    wb, svdT, non_zero_index_feat = pickle.load(features_file)

# Wrap the unpickled objects, together with the normalizer, in the extractor class.
tfidf_svd_model = tfidf_svd(wb, svdT, non_zero_index_feat, normalize_text)

FileNotFoundError: [Errno 2] No such file or directory: '../data/features/calcFeat_tfid_hash28_n10000_svd1000.p'

### carga modelo DNN para inferencia

In [None]:
# NOTE: pickle.load can execute arbitrary code - only load model metadata from a trusted source.
# The unpickled model directory gets its own name so that `path_model` keeps
# pointing at the metadata file and this cell can be re-run safely.
with open(path_model, "rb") as info_file:
    model_dir, labels = pickle.load(info_file)

# 1000 is the input dimension; presumably the SVD output size ("svd1000" in the
# features file name) - confirm if the feature file changes.
sec1 = DNN_eval(labels, model_dir, 1000)

### evalúa una entrada de test

In [None]:
# Classify one sample text and time feature extraction plus prediction.
test_text = 'El Banco Falabella se convertirá en el mayor emisor de tarjetas de crédito del país, después de que la Superintendencia de Bancos e Instituciones Financieras (SBIF) aprobara la integración de CMR Falabella a la compañía. La figura bajo la cual CMR se integra a Banco Falabella es la de Sociedad de Apoyo al Giro (SAG). Con esto, Banco Falabella será el mayor emisor de tarjetas de crédito del país, con una cantidad superior a los 3 millones de ellas activas, según Diario Financiero.'

start = time.time()
feat = tfidf_svd_model.calc(test_text)
y_test_hat, pred_prob = sec1.calc(vec_input=feat)
elapsed = time.time() - start

print("end time :" + str(elapsed))
# Bare expression: the notebook displays the predicted label.
y_test_hat

### plot

In [None]:
import matplotlib.pyplot as plt

# One bar per class. Positions come from the label list instead of a hard-coded
# 8, so the chart stays correct when the model has a different number of classes.
class_positions = np.arange(len(labels))

plt.bar(class_positions, pred_prob)
plt.xticks(class_positions, labels, rotation=70)
plt.show()

# Predicción explicada con LIME

In [None]:
def predictTema2(texts):
    """Prediction function handed to LIME: texts in, class probabilities out.

    Args:
        texts: iterable of raw texts.

    Returns:
        Whatever `sec1.calcBatch` returns for the features that
        `tfidf_svd_model.calcBatch` extracts from `texts`.
    """
    feat_data = tfidf_svd_model.calcBatch(texts)
    return sec1.calcBatch(vec_input=feat_data)

In [None]:
# LIME text explainer; `labels` supplies the class names it reports.
explainer = LimeTextExplainer(class_names=labels)

In [None]:
# Explain the sample prediction (10 features, 2 top labels), render it in the
# notebook, and report how long explanation plus rendering took.
start = time.time()

exp = explainer.explain_instance(
    test_text,
    predictTema2,
    num_features=10,
    top_labels=2,  # alternative: labels=[0, 2]
)
exp.show_in_notebook(text=test_text)

elapsed = time.time() - start
print("end time :" + str(elapsed))


In [None]:
# Bare expression: the notebook displays the value returned by as_map().
exp.as_map()

In [None]:
# Bare expression: the notebook displays the value returned by as_html().
exp.as_html()

In [None]:
# Recompute the explanation with the same arguments as the timed cell; nothing is displayed here.
exp = explainer.explain_instance(test_text, predictTema2, num_features=10, top_labels=2 )#labels=[0, 2] top_labels=2


In [None]:
# Sanity check: run the batch feature extractor on two copies of the sample text.
tfidf_svd_model.calcBatch([test_text,test_text])

In [None]:
# Recompute the features of the sample text through the single-text path.
feat = tfidf_svd_model.calc(test_text)


In [None]:
# Run `feat` through the batch prediction path; the bare expression displays the result.
sec1.calcBatch( vec_input = feat)