In [2]:
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split
import numpy as np
import unicodedata
import re
import string

mainPath = "D:\\Users\\matteus-paula\\Documents\\Projetos\\python\\desafio_mprj\\desafio-ia\\energia"

targetnames = list()
filenames = list()
target = list()
data = list()

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    change_str = only_ascii.decode("utf-8")
    change_str = re.sub(r"[0-9]+","", change_str)
    change_str = re.sub(r"\r|\n|\t"," ", change_str)
    change_str = re.sub(r"\s\s+"," ", change_str)
    change_str = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@\[\\\]^_`{|}~]+', "", change_str)
   
    return change_str.lower()


def load_data(targetnames, filenames, target, data, mainPath):

    for folder in listdir(mainPath):
        dir_folder = join(mainPath, folder)
        if not isfile(dir_folder):
            targetnames.append(folder)
            for file in listdir(dir_folder):
                file_path = join(dir_folder, file)
                if isfile(file_path):
                    target.append(targetnames.index(folder))
                    filenames.append(file_path)
                    with open(file_path, 'rb') as f:
                        compressed_content = f.read()
                    data.append(remove_accents(compressed_content.decode("utf-8")))

    return train_test_split(data, target, test_size=0.33, stratify=target, random_state=42)


X_train, X_test, y_train, y_test = load_data(targetnames, filenames, target, data, mainPath)

['homologo para que surta seus efeitos legais o projeto de sentenca elaborado pelo juiz leigo na forma do artigo da lei no  apos o transito em julgado tratandose de sentenca de improcedencia ou de extincao do feito sem resolucao do merito dese baixa e arquivemse imediatamente tratandose de sentenca de procedencia aguardese por dias a manifestacao das partes e em seguida caso permanecam em silencio procedase a baixa e ao arquivamento adverte se as partes que antes da pratica de qualquer ato executivo uma vez escoado o prazo de dias previsto no art do cpc sem que tenha havido o cumprimento da obrigacao reconhecida na sentenca incidira automaticamente a multa de  dez por cento a que se refere o artigo e se procedera a intimacao da parte credora para que se manifeste no prazo de cinco dias sobre seu interesse em efetivar o protesto do titulo judicial na conformidade do art do ncpc e do ato executivo conjunto tjcgj n  publicado no dje em  pri',
 'defiro jg a parte autora anotese tratase de 

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words

stop_words_pt = get_stop_words('portuguese')

count_vect = CountVectorizer(stop_words=stop_words_pt)

X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(53384, 22017)

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)

X_train_tf = tf_transformer.transform(X_train_counts)

X_train_tf.shape

(53384, 22017)

In [5]:
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

X_train_tfidf.shape

(53384, 22017)

In [6]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [7]:
docs_new = ['perigo de dano eletronico ficou evidenciado', 'sob pena de multa']

X_new_counts = count_vect.transform(docs_new)

X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, targetnames[category]))

'perigo de dano eletronico ficou evidenciado' => DanosEletrodomesticos
'sob pena de multa' => InterrupcaoInstabilidadeFornecimento


In [8]:
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer(stop_words=stop_words_pt)),
                     ('tfidf', TfidfTransformer()), 
                     ('clf', MultinomialNB()),])

text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['a', 'ao',...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [9]:
docs_test = X_test
predicted = text_clf.predict(docs_test)
np.mean(predicted == y_test)

0.8716437210009889

In [14]:
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

from sklearn.model_selection import cross_val_score

models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    KNeighborsClassifier(n_neighbors=8),
    SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None),
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, X_train_tfidf, y_train, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df.groupby('model_name').accuracy.mean()



model_name
KNeighborsClassifier      0.940113
LinearSVC                 0.980237
LogisticRegression        0.963435
MultinomialNB             0.873575
RandomForestClassifier    0.750710
SGDClassifier             0.930822
Name: accuracy, dtype: float64

In [15]:
text_clf = Pipeline([
     ('vect', CountVectorizer(stop_words=stop_words_pt)),
     ('tfidf', TfidfTransformer()),
     ('clf', OneVsRestClassifier(LinearSVC(class_weight='balanced'))),
    
])

text_clf.fit(X_train, y_train)
predicted = text_clf.predict(docs_test)
np.mean(predicted == y_test) 



0.9814786643340686

In [16]:
from sklearn import metrics

s = set(y_test)
target_names_test = [targetnames[i] for i in s]


print(metrics.classification_report(y_test, predicted,
target_names=target_names_test))


                                           precision    recall  f1-score   support

              CobrancaServicoNaoFornecido       0.93      0.76      0.84        54
                        CobrancaSobAmeaca       0.98      0.99      0.98      3799
                           CobrancaTarifa       0.42      0.86      0.56        44
                    DanosEletrodomesticos       0.96      0.98      0.97      3886
DificuldadeContratacaoRecusaInjustificada       0.88      0.92      0.90       304
                  DificuldadeRenegociacao       0.62      0.42      0.50       101
     InterrupcaoInstabilidadeFornecimento       0.99      0.99      0.99     18085
                      NegativacaoIndevida       1.00      0.57      0.73        21

                                micro avg       0.98      0.98      0.98     26294
                                macro avg       0.85      0.81      0.81     26294
                             weighted avg       0.98      0.98      0.98     26294



In [537]:
metrics.confusion_matrix(y_test, predicted)

array([[ 37,   1,   0,   0,   4,   5,   9,   0],
       [  0, 331,   0,   3,   1,   1,   2,   0],
       [  0,   2,  49,   1,   1,   1,   2,   0],
       [  0,   4,   0, 308,   7,   0,  11,   0],
       [  0,   6,   0,  10, 280,   0,   1,   0],
       [  1,  39,   1,   4,   6,  34,   4,   1],
       [  0,  25,   0,  15,   6,   3, 282,   0],
       [  1,   4,   2,   0,   0,   2,   0,   8]], dtype=int64)