# Classificação binária por termos 

In [1]:
from comet_ml import Experiment
import collections
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer

from sklearn.linear_model import LogisticRegression

from sklearn import metrics

import joblib



In [2]:
# Restrict the load to the two columns used further down:
# the label (ROTULO_MANUAL) and the text (EMENTA_NORM).
colunas = ['ROTULO_MANUAL', 'EMENTA_NORM']

df = pd.read_csv(
    '../data/ementas_pre-processadas.csv',
    header=0,
    sep=',',
    quotechar='"',
    usecols=colunas,
)

In [3]:
# Labels for which one binary (label vs. 'NONE') model is trained below.
rotulos = [
    'EXP',
    'BAN',
    'OIG',
    'DAN',
    'SEG',
    'CON',
    'OIE',
]

In [4]:
def separarAmostras(rotulo, dataframe, random_state=None):
    """Build a binary (one-vs-rest) train/test split for a single label.

    Every row whose ``ROTULO_MANUAL`` differs from ``rotulo`` is relabelled
    as ``'NONE'``; the input ``dataframe`` itself is not modified.

    Parameters
    ----------
    rotulo : str
        Label treated as the positive class.
    dataframe : pandas.DataFrame
        Must contain the columns ``ROTULO_MANUAL`` and ``EMENTA_NORM``.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original behaviour (a different split on every call); pass an
        integer to make the split, and the metrics derived from it,
        reproducible.

    Returns
    -------
    list
        ``[x_train, x_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    rotulos_originais = dataframe['ROTULO_MANUAL']
    eh_positivo = rotulos_originais == rotulo

    # Counting on the boolean mask avoids the KeyError that
    # value_counts()[rotulo] raises when the label has no samples.
    quantidade = int(eh_positivo.sum())
    print("Quantidade de amostras do rótulos: {}".format(quantidade))

    # Keep the label where it matches, replace everything else (missing
    # values included) by 'NONE' without copying the whole frame.
    rotulos_binarios = rotulos_originais.where(eh_positivo, 'NONE')

    x = dataframe['EMENTA_NORM'].values.astype('U')
    y = rotulos_binarios.values

    return train_test_split(x, y, random_state=random_state)

In [5]:
def _para_denso(x):
    """Convert a sparse matrix into a dense ``numpy.ndarray``.

    ``toarray()`` is used instead of ``todense()`` because the latter
    returns ``numpy.matrix``, which recent scikit-learn estimators reject.
    Being a named function (not a lambda) it can also be referenced by
    pickle/joblib when the fitted pipeline is persisted.
    """
    return x.toarray()


def criarModeloBinario(y_train, y_test=None, x_train=None):
    """Fit a bag-of-words -> TF-IDF -> logistic-regression pipeline.

    Parameters
    ----------
    y_train : array-like
        Training labels.
    y_test : array-like, optional
        Unused; kept only so existing calls
        ``criarModeloBinario(y_train, y_test)`` continue to work.
    x_train : array-like of str, optional
        Training documents. When omitted, the notebook-level global
        ``x_train`` is used, which is what the original implementation
        silently relied on. Prefer passing it explicitly.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps ``vect``, ``tfidf``, ``dense``
        and ``clf``.

    Raises
    ------
    NameError
        If ``x_train`` is neither passed nor defined as a global.
    """
    if x_train is None:
        # Backward-compatible fallback for call sites that do not pass the
        # training documents; same failure mode as before when undefined.
        try:
            x_train = globals()['x_train']
        except KeyError:
            raise NameError("name 'x_train' is not defined") from None

    pipe = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('dense', FunctionTransformer(_para_denso, accept_sparse=True, validate=True)),
        ('clf', LogisticRegression(solver='lbfgs')),
    ])

    return pipe.fit(x_train, y_train)

In [6]:
def avaliarModelo(clf, x_test, y_test=None):
    """Print and return the classification report of a fitted model.

    Parameters
    ----------
    clf : estimator
        Fitted object exposing ``predict``.
    x_test : array-like
        Samples to predict.
    y_test : array-like, optional
        True labels for ``x_test``. When omitted, the notebook-level global
        ``y_test`` is used, which is what the original implementation
        silently relied on. Prefer passing it explicitly.

    Returns
    -------
    dict
        ``metrics.classification_report(..., output_dict=True)``.

    Raises
    ------
    NameError
        If ``y_test`` is neither passed nor defined as a global.
    """
    if y_test is None:
        # Backward-compatible fallback for call sites that pass only
        # (clf, x_test); same failure mode as before when undefined.
        try:
            y_test = globals()['y_test']
        except KeyError:
            raise NameError("name 'y_test' is not defined") from None

    y_pred = clf.predict(x_test)

    # Text version for the notebook output, dict version for the caller.
    print(metrics.classification_report(y_test, y_pred))

    return metrics.classification_report(y_test, y_pred, output_dict=True)

In [7]:
def converterMetricas(rotulo, result):
    """Flatten the metrics of one label into ``'<label>-<metric>'`` keys.

    Only the entry ``result[rotulo]`` is read; every value is converted
    to ``str``.

    Parameters
    ----------
    rotulo : str
        Label whose metrics are extracted; also used as the key prefix.
    result : dict
        Mapping that contains ``rotulo`` as a key pointing to a dict of
        metric name -> value.

    Returns
    -------
    dict
        ``{rotulo + '-' + metric: str(value)}`` for each metric.
    """
    metricas_do_rotulo = result[rotulo]
    return {
        rotulo + '-' + nome: str(valor)
        for nome, valor in metricas_do_rotulo.items()
    }

## Iniciando experimento

In [8]:
# Comet experiment object used below to log the per-label metrics.
experiment = Experiment(
    project_name="igti-projeto-aplicado",
    workspace="piantino",
)

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/piantino/igti-projeto-aplicado/d25158e385af4a1384c025d9a02aea34



In [9]:
# Running sum of the per-label accuracies; averaged once the loop ends.
accuracy = 0

for rotulo in rotulos:
    print('"{}" Gerando modelo'.format(rotulo))

    # NOTE: these four names are load-bearing. criarModeloBinario and
    # avaliarModelo are called without x_train / y_test and pick them up
    # from the notebook globals assigned here, so do not rename them.
    x_train, x_test, y_train, y_test = separarAmostras(rotulo, df)

    clf = criarModeloBinario(y_train, y_test)
    result = avaliarModelo(clf, x_test)

    accuracy = accuracy + result['accuracy']

    metricas_rotulo = converterMetricas(rotulo, result)
    experiment.log_metrics(metricas_rotulo)

# Mean accuracy over all binary models.
media_accuracy = accuracy / len(rotulos)
experiment.log_metric('accuracy', media_accuracy)

"EXP" Gerando modelo
Quantidade de amostras do rótulos: 1784
              precision    recall  f1-score   support

         EXP       0.97      0.88      0.93       477
        NONE       0.97      0.99      0.98      2002

    accuracy                           0.97      2479
   macro avg       0.97      0.94      0.95      2479
weighted avg       0.97      0.97      0.97      2479

"BAN" Gerando modelo
Quantidade de amostras do rótulos: 971
              precision    recall  f1-score   support

         BAN       0.92      0.38      0.54       255
        NONE       0.93      1.00      0.96      2224

    accuracy                           0.93      2479
   macro avg       0.92      0.69      0.75      2479
weighted avg       0.93      0.93      0.92      2479

"OIG" Gerando modelo
Quantidade de amostras do rótulos: 752
              precision    recall  f1-score   support

        NONE       0.98      1.00      0.99      2304
         OIG       0.94      0.78      0.85       175

 

In [10]:
# Finish the Comet experiment; nothing is logged after this point in the
# visible part of the notebook.
experiment.end()

COMET INFO: ----------------------------
COMET INFO: Comet.ml Experiment Summary:
COMET INFO:   Data:
COMET INFO:     url: https://www.comet.ml/piantino/igti-projeto-aplicado/d25158e385af4a1384c025d9a02aea34
COMET INFO:   Metrics:
COMET INFO:      BAN-f1-score: 0.5373961218836566
COMET INFO:     BAN-precision: 0.9150943396226415
COMET INFO:        BAN-recall: 0.3803921568627451
COMET INFO:       BAN-support: 255
COMET INFO:      CON-f1-score: 0.24354243542435428
COMET INFO:     CON-precision: 0.868421052631579
COMET INFO:        CON-recall: 0.14163090128755365
COMET INFO:       CON-support: 233
COMET INFO:      DAN-f1-score: 0.6011560693641619
COMET INFO:     DAN-precision: 0.8387096774193549
COMET INFO:        DAN-recall: 0.46846846846846846
COMET INFO:       DAN-support: 222
COMET INFO:      EXP-f1-score: 0.925438596491228
COMET INFO:     EXP-precision: 0.9701149425287356
COMET INFO:        EXP-recall: 0.8846960167714885
COMET INFO:       EXP-support: 477
COMET INFO:      OIE-f1-scor