# In [None]:
#from IPython.display import display
import pandas as pd, time

# Modelos
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

# Representação de textos
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline

import nltk

# Portuguese (RSLP) stemmer shared by stem_sentences.
# NOTE(review): presumably the NLTK 'rslp' and 'stopwords' resources must be
# downloaded beforehand (nltk.download) — confirm for the target environment.
stemmer = nltk.stem.RSLPStemmer()
# Portuguese stop-word list; handed to TfidfVectorizer when the
# 'rm_stopwords' pre-processing variant is selected.
stopwords = nltk.corpus.stopwords.words('portuguese')

def stem_sentences(sentence):
    """Return *sentence* with every whitespace-separated token stemmed.

    Tokens are produced by ``str.split()``, passed one by one through the
    module-level ``stemmer`` and joined back with single spaces.
    """
    return ' '.join(map(stemmer.stem, sentence.split()))

def pre_processamento(version, df):
    """Return the ``produto`` column of *df* after the chosen pre-processing.

    Args:
        version: Name of the technique to apply:
            ``'stemming'`` stems every token via ``stem_sentences``;
            ``'rm_numeros'`` deletes every run of digits;
            ``'rm_pontuacoes'`` replaces each character that is not an
            ASCII letter or digit with a space.
            Any other value (e.g. ``'none'`` or ``'rm_stopwords'``) returns
            the column unchanged.
        df: DataFrame containing a text column named ``produto``.

    Returns:
        A pandas Series with the (possibly transformed) product text.
    """
    produtos = df['produto']

    if version == 'stemming':
        return produtos.apply(stem_sentences)

    if version == 'rm_numeros':
        # regex=True must be explicit: since pandas 2.0 the default is a
        # literal match, which would leave the digits in place.
        return produtos.str.replace(r'\d+', '', regex=True)

    if version == 'rm_pontuacoes':
        # NOTE(review): the ASCII-only class also blanks accented letters
        # (ç, ã, é, ...). Kept as in the original experiment — confirm that
        # this is intended for Portuguese text.
        return produtos.str.replace(r'[^a-zA-Z0-9]', ' ', regex=True)

    return produtos

def mostrar_metricas(model_name, prep, y_pred, y_test, show_grid_parameters=False, show_report=False):
    """Print accuracy, macro precision and macro recall for one experiment.

    Args:
        model_name: Classifier name, printed as-is.
        prep: Pre-processing technique name, printed as-is.
        y_pred: Labels predicted by the model.
        y_test: True labels for the same samples.
        show_grid_parameters: When true, also print the steps of the best
            estimator held by the module-level ``grid_search_tune``.
        show_report: When true, also print scikit-learn's
            ``classification_report`` with 4 digits.
    """
    print('Nome do modelo:', model_name)
    print('Técnica de pré-processamento:', prep)

    # scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
    # passing the arguments the other way round exchanges precision and recall.
    print('Acurácia: %s' % accuracy_score(y_test, y_pred))
    print('Precisão: %s' % precision_score(y_test, y_pred, average='macro'))
    print('Revocação: %s' % recall_score(y_test, y_pred, average='macro'))

    if show_grid_parameters:
        # Reads the global set by the experiment loop; it is not a parameter.
        print('')
        print("Best parameters set:")
        print(grid_search_tune.best_estimator_.steps)

    if show_report:
        print('')
        print(classification_report(y_test, y_pred, digits=4))

    print('')
    
def mostrar_tempo(start):
    """Print the seconds elapsed since *start* (a ``time.time()`` value), then a blank line."""
    elapsed = time.time() - start
    print('Tempo em segundos: ', elapsed)
    print('')
    
def mostrar_ex_classificacoes(y_test, y_pred, corretas=False, min_diff=5):
    """Show the test products behind each well-populated confusion-matrix cell.

    For every (actual, predicted) category pair whose count is at least
    *min_diff*, print a summary line and display the matching rows of ``df``.

    Args:
        y_test: True category ids of the test samples.
        y_pred: Predicted category ids, aligned with *y_test*.
        corretas: When false (default) only misclassifications
            (actual != predicted) are shown; when true only correct
            classifications are shown.
        min_diff: Minimum number of examples a cell must contain to be shown.

    Relies on the module-level ``categoria_id_df``, ``id_para_categoria``,
    ``df`` and ``indices_test``, and on ``display`` being available
    (NOTE(review): its import is commented out at the top of the file, so it
    is presumably provided by the IPython/Jupyter session — confirm).
    """
    # Fix the matrix layout to the full category list. Without `labels`,
    # scikit-learn only keeps categories present in y_test/y_pred, so a
    # missing category would shift every later row/column.
    labels = list(categoria_id_df.categoria_id)
    conf_mat = confusion_matrix(y_test, y_pred, labels=labels)

    for col, predicted in enumerate(labels):
        for row, actual in enumerate(labels):
            # Keep only hits when `corretas` is set, only misses otherwise.
            if (predicted == actual) != bool(corretas):
                continue

            count = conf_mat[row, col]
            if count < min_diff:
                continue

            print("'{}' predicted as '{}' : {} examples.".format(id_para_categoria[actual], id_para_categoria[predicted], count))
            selecionados = (y_test == actual) & (y_pred == predicted)
            display(df.loc[indices_test[selecionados]][['categoria', 'produto']])
            print("")

# Load the product/category table: ';'-separated, single-quoted fields,
# backslash as the escape character.
df = pd.read_csv(
    'data/produtos_categorias.csv',
    delimiter=";",
    quotechar="'",
    escapechar="\\",
)
# Lowercase every cell (str.lower is applied element-wise to the whole frame).
df = df.applymap(str.lower)

# Rename the two columns to the names used throughout the script.
df.columns = ['produto', 'categoria']
# Encode each category as an integer id in the new 'categoria_id' column.
df['categoria_id'] = pd.factorize(df['categoria'])[0]

# One row per distinct category, ordered by id ...
categoria_id_df = (
    df[['categoria', 'categoria_id']]
    .drop_duplicates()
    .sort_values('categoria_id')
)
# ... and the reverse lookup: category id -> category name.
id_para_categoria = dict(categoria_id_df[['categoria_id', 'categoria']].values)

# Classifiers compared by the experiment; each one becomes the final 'clf'
# step of the pipeline built in the loop below.
models = [
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0, solver='lbfgs', multi_class='auto')
]

# Pre-processing variants to evaluate. 'rm_pontuacoes', 'rm_numeros' and
# 'stemming' are handled by pre_processamento; 'rm_stopwords' leaves the text
# as-is and instead enables stop-word removal in TfidfVectorizer; 'none'
# applies no pre-processing.
preps = [ 'rm_pontuacoes', 'rm_numeros', 'rm_stopwords', 'stemming', 'none' ]

# Run one grid search per (classifier, pre-processing) combination and print
# its metrics, elapsed time and the most frequent misclassifications.

# Invariant across all combinations, so built once.
y = df.categoria_id
parameters = {
    'tfidf__min_df': (1, 2, 3, 4, 5),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)]
}

for model in models:
    model_name = model.__class__.__name__

    for prep in preps:

        # The timer covers pre-processing, the grid search and prediction.
        start = time.time()

        X = pre_processamento(prep, df)

        pipeline = Pipeline([
            # Stop-word removal is done by the vectorizer, only for 'rm_stopwords'.
            ('tfidf', TfidfVectorizer(sublinear_tf=True, norm='l2', stop_words=(stopwords if (prep == 'rm_stopwords') else None))),
            ('clf', model),
        ])

        # df.index is split alongside X and y so that test rows can be looked
        # up in df later. indices_test and grid_search_tune must remain
        # module-level names: the reporting helpers read them as globals.
        X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, df.index, test_size=0.33, random_state=42)

        # `iid` is no longer passed: it was deprecated in scikit-learn 0.22
        # and removed in 0.24, where supplying it raises TypeError.
        grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=2, verbose=0)
        grid_search_tune.fit(X_train, y_train)

        y_pred = grid_search_tune.predict(X_test)

        mostrar_metricas(model_name, prep, y_pred, y_test)
        mostrar_tempo(start)
        mostrar_ex_classificacoes(y_test, y_pred)
        