In [665]:
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split
import numpy as np
import unicodedata
import re
import string

# Root folder of the corpus; load_data treats every sub-folder as one class.
mainPath = "D:\\Users\\matteus-paula\\Documents\\Projetos\\python\\desafio_mprj\\desafio-ia\\energia"

# Accumulators that load_data fills in place; all four start out empty.
targetnames, filenames, target, data = [], [], [], []

def remove_accents(input_str):
    """Normalize raw document text into lowercase ASCII words.

    Steps, in order: decompose accented characters (NFKD) and drop every
    non-ASCII character, delete digit runs, delete ASCII punctuation, turn
    CR/LF/TAB into spaces, collapse whitespace runs into a single space and
    lowercase the result.

    Args:
        input_str: Text to normalize (``str``).

    Returns:
        The normalized ``str``. Leading/trailing whitespace is not stripped.
    """
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    # 'ignore' silently drops the combining marks and any other non-ASCII char.
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    change_str = only_ascii.decode("utf-8")
    change_str = re.sub(r"[0-9]+", "", change_str)
    # NOTE(review): digits and punctuation are deleted, not replaced by a
    # space, so "foo,bar" becomes the single token "foobar" - confirm intended.
    change_str = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@\[\\\]^_`{|}~]+', "", change_str)
    # Whitespace is normalized last: stripping punctuation can leave adjacent
    # spaces behind ("a - b" -> "a  b"), which must still be collapsed.
    change_str = re.sub(r"\r|\n|\t", " ", change_str)
    change_str = re.sub(r"\s\s+", " ", change_str)

    return change_str.lower()


# Number of files in each class folder: 64, 135, 163, 305, 922, 11775, 11511, 54803.
# Each class is capped at 613 files, the median of those counts (613.5, truncated).

def load_data(targetnames, filenames, target, data, mainPath, max_files_per_class=613):
    """Read the corpus from disk and split it into train and test sets.

    Every sub-folder of ``mainPath`` is treated as one class. Up to
    ``max_files_per_class`` regular files are read from each folder, decoded
    as UTF-8 and normalized with ``remove_accents``.

    Folder and file names are visited in sorted order because ``os.listdir``
    returns entries in arbitrary order; without sorting, the class indices
    and the subset of files kept by the cap could change between runs.

    Args:
        targetnames: List extended in place with the class (folder) names.
        filenames: List extended in place with the path of each loaded file.
        target: List extended in place with the class index of each file,
            i.e. the position of its folder in ``targetnames``.
        data: List extended in place with the normalized text of each file.
        mainPath: Directory whose sub-folders hold the documents.
        max_files_per_class: Maximum number of files loaded per folder;
            ``None`` loads every file. Defaults to 613.

    Returns:
        The result of ``train_test_split(data, target, test_size=0.33,
        random_state=42)``, unpacked by the caller as
        ``X_train, X_test, y_train, y_test``.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    for folder in sorted(listdir(mainPath)):
        dir_folder = join(mainPath, folder)
        if isfile(dir_folder):
            continue

        targetnames.append(folder)
        # Looked up once per folder instead of once per file.
        label = targetnames.index(folder)

        loaded = 0
        for file in sorted(listdir(dir_folder)):
            if max_files_per_class is not None and loaded >= max_files_per_class:
                break
            file_path = join(dir_folder, file)
            # Only regular files count towards the cap; nested folders are skipped.
            if not isfile(file_path):
                continue
            with open(file_path, 'rb') as f:
                raw_content = f.read()
            text = remove_accents(raw_content.decode("utf-8"))
            # Append only after a successful decode so the three lists stay aligned.
            target.append(label)
            filenames.append(file_path)
            data.append(text)
            loaded += 1

    return train_test_split(data, target, test_size=0.33, random_state=42)


from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words

# Read the corpus and split it into train / test partitions.
X_train, X_test, y_train, y_test = load_data(
    targetnames=targetnames,
    filenames=filenames,
    target=target,
    data=data,
    mainPath=mainPath,
)

# Token counts over the training documents, with Portuguese stop words removed.
stop_words_pt = get_stop_words('portuguese')
count_vect = CountVectorizer(stop_words=stop_words_pt)
X_train_counts = count_vect.fit_transform(X_train)

X_train_counts.shape

In [653]:
from sklearn.feature_extraction.text import TfidfTransformer

# Term-frequency scaling only (IDF weighting switched off).
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)

X_train_tf.shape

(2089, 9122)

In [654]:
# Full TF-IDF weighting, fitted on the training counts.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

X_train_tfidf.shape

(2089, 9122)

In [655]:
from sklearn.naive_bayes import MultinomialNB

# Baseline classifier trained on the TF-IDF training matrix.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train);  # trailing ';' keeps the cell output empty

In [656]:
docs_new = [
    'perigo de dano eletronico ficou evidenciado',
    'sob pena de multa',
]

# Reuse the already-fitted vectorizer and TF-IDF weights: transform only.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Show each document next to the name of its predicted class.
for idx, doc in enumerate(docs_new):
    print('%r => %s' % (doc, targetnames[predicted[idx]]))

'perigo de dano eletronico ficou evidenciado' => DanosEletrodomesticos
'sob pena de multa' => CobrancaSobAmeaca


In [657]:
from sklearn.pipeline import Pipeline

# Vectorizer -> TF-IDF -> Naive Bayes chained into a single estimator.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words=stop_words_pt)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['a', 'ao',...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [658]:

# Accuracy of the fitted pipeline on the held-out test documents.
docs_test = X_test
predicted = text_clf.predict(docs_test)
(predicted == y_test).mean()

0.6718446601941748

In [666]:
# RandomForestClassifier and LogisticRegression are used below, so they are
# imported here; without these imports the cell fails on a fresh kernel.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
import pandas as pd

from sklearn.model_selection import cross_val_score

# Candidate classifiers, compared by cross-validated accuracy on the TF-IDF
# training matrix.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None),
]
CV = 5  # number of cross-validation folds

# NOTE(review): X_train_tfidf was fitted on the whole training set, so the IDF
# weights have already seen each validation fold; cross-validating a Pipeline
# on the raw text would avoid that.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, X_train_tfidf, y_train, scoring='accuracy', cv=CV)
    # One record per (model, fold) pair.
    entries.extend((model_name, fold_idx, accuracy)
                   for fold_idx, accuracy in enumerate(accuracies))

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
# Mean accuracy per model across the folds.
cv_df.groupby('model_name').accuracy.mean()



model_name
LinearSVC                 0.866943
LogisticRegression        0.815216
MultinomialNB             0.650493
RandomForestClassifier    0.681644
SGDClassifier             0.833893
Name: accuracy, dtype: float64

In [667]:
# Same preprocessing as before, now feeding a one-vs-rest linear SVM with
# balanced class weights.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words=stop_words_pt)),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight='balanced'))),
])

# fit() returns the pipeline itself, so training and prediction can be chained.
predicted = text_clf.fit(X_train, y_train).predict(docs_test)
(predicted == y_test).mean()

0.8902912621359224

In [660]:
from sklearn import metrics

# Class indices present in the test set, in ascending order. Sorting makes the
# order explicit instead of relying on set iteration order, and passing the
# same list as `labels` keeps each row of the report matched to its name even
# if a class is predicted that never occurs in y_test.
labels_test = sorted(set(y_test))
target_names_test = [targetnames[i] for i in labels_test]

print(metrics.classification_report(y_test, predicted,
                                    labels=labels_test,
                                    target_names=target_names_test))


                                           precision    recall  f1-score   support

              CobrancaServicoNaoFornecido       0.88      0.78      0.83        49
                        CobrancaSobAmeaca       0.91      0.93      0.92       222
                           CobrancaTarifa       0.89      0.87      0.88        46
                    DanosEletrodomesticos       0.92      0.94      0.93       206
DificuldadeContratacaoRecusaInjustificada       0.98      0.92      0.95       182
                  DificuldadeRenegociacao       0.71      0.80      0.75        94
     InterrupcaoInstabilidadeFornecimento       0.86      0.88      0.87       209
                      NegativacaoIndevida       0.93      0.59      0.72        22

                                micro avg       0.89      0.89      0.89      1030
                                macro avg       0.88      0.84      0.86      1030
                             weighted avg       0.89      0.89      0.89      1030



In [537]:
# Confusion matrix of the test labels against the current `predicted` labels.
# NOTE(review): the output saved with this cell (execution count 537) sums to
# 1515 samples, while the classification report lists a total support of 1030,
# so the saved matrix appears to come from an earlier run - re-execute the cell.
metrics.confusion_matrix(y_test, predicted)

array([[ 37,   1,   0,   0,   4,   5,   9,   0],
       [  0, 331,   0,   3,   1,   1,   2,   0],
       [  0,   2,  49,   1,   1,   1,   2,   0],
       [  0,   4,   0, 308,   7,   0,  11,   0],
       [  0,   6,   0,  10, 280,   0,   1,   0],
       [  1,  39,   1,   4,   6,  34,   4,   1],
       [  0,  25,   0,  15,   6,   3, 282,   0],
       [  1,   4,   2,   0,   0,   2,   0,   8]], dtype=int64)