# Modelo de classificador pela lei

In [1]:
from comet_ml import Experiment

experiment = Experiment(project_name="igti-projeto-aplicado-leis", workspace="piantino")

COMET INFO: old comet version (1.0.55) detected. current: 1.0.56 please update your comet lib with command: `pip install --no-cache-dir --upgrade comet_ml`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/piantino/igti-projeto-aplicado-leis/2f6cffc20c734a9c9860444407679d33



In [2]:
import pandas as pd

colunas = ['PROCESSO', 'ROTULO_MANUAL', 'LEI']

df = pd.read_csv('../data/leis.csv', header=0, sep=',', quotechar='"', usecols=colunas)

In [3]:
 df.head()

Unnamed: 0,PROCESSO,ROTULO_MANUAL,LEI
0,39604720138240039,CDC,artigo 535
1,39604720138240039,CDC,9656/1998
2,39604720138240039,CDC,616/2012
3,39604720138240039,CDC,artigo 535
4,39604720138240039,CDC,9656/1998


Utitiza apenas label que contenham mais de 200 exemplos.

In [4]:
rotulos = ['EXP', 'BAN', 'OIG', 'DAN', 'SEG', 'CON', 'OIE']

df.loc[~df['ROTULO_MANUAL'].isin(rotulos), 'ROTULO_MANUAL'] = 'NONE'

In [5]:
groupby_rotulo = df.groupby('ROTULO_MANUAL')

groupby_rotulo[['LEI']].count().sort_values(['LEI'], ascending=False)

Unnamed: 0_level_0,LEI
ROTULO_MANUAL,Unnamed: 1_level_1
EXP,28742
NONE,18962
BAN,7063
OIG,6577
DAN,5067
CON,3977
OIE,3112
SEG,2969


In [6]:
df.shape

(76469, 3)

In [7]:
data = df.groupby(['PROCESSO', 'ROTULO_MANUAL', 'LEI']).size().unstack('LEI', fill_value=0)
data = data.reset_index()

data.head()

LEI,PROCESSO,ROTULO_MANUAL,1/1975,1/2012,1/2014,100/2015,101/2010,10144/2002,10150/2000,1025/2009,...,artigo 975,artigo 977,artigo 98,artigo 982,artigo 988,artigo 99,artigo 991,artigo 995,artigo 996,artigo 998
0,12120028240050,CON,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,32720118240033,CON,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,42220138240104,BAN,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,42920148240058,CON,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,57320168240242,NONE,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
y = data['ROTULO_MANUAL'].values
y.shape

(6390,)

In [9]:
df1 = data.iloc[:,2:].copy()
df1.head()

LEI,1/1975,1/2012,1/2014,100/2015,101/2010,10144/2002,10150/2000,1025/2009,10257/2001,10391/2004,...,artigo 975,artigo 977,artigo 98,artigo 982,artigo 988,artigo 99,artigo 991,artigo 995,artigo 996,artigo 998
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
x = df1.values
x.shape

(6390, 946)

In [11]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y)

import collections

print(collections.Counter(y_train))
print(collections.Counter(y_test))

Counter({'NONE': 1660, 'EXP': 578, 'BAN': 535, 'OIG': 532, 'OIE': 437, 'DAN': 376, 'CON': 353, 'SEG': 321})
Counter({'NONE': 525, 'BAN': 187, 'EXP': 182, 'OIG': 179, 'DAN': 141, 'OIE': 137, 'CON': 127, 'SEG': 120})


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from sklearn.tree import DecisionTreeClassifier            
clf = DecisionTreeClassifier()

text_clf = Pipeline([
    ('clf', clf)
])

text_clf.fit(x , y)

Pipeline(memory=None,
     steps=[('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

Testando o modelo.

In [13]:
y_pred = text_clf.predict(x_test)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

accuracy

0.8504380475594493

In [14]:
experiment.log_metric("accuracy", accuracy)

In [15]:
from sklearn import metrics

metrics_dict = {}

result = metrics.classification_report(y_test, y_pred, output_dict=True)

for label in result.keys():
    for metric in result[label].keys():
        metrics_dict[label + '-' + metric] = str(result[label][metric])

experiment.log_metrics(metrics_dict)

In [16]:
from sklearn import metrics

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         BAN       0.91      0.80      0.85       187
         CON       0.91      0.86      0.88       127
         DAN       0.92      0.79      0.85       141
         EXP       0.91      0.91      0.91       182
        NONE       0.78      0.93      0.85       525
         OIE       0.70      0.74      0.72       137
         OIG       0.98      0.79      0.88       179
         SEG       0.98      0.75      0.85       120

   micro avg       0.85      0.85      0.85      1598
   macro avg       0.89      0.82      0.85      1598
weighted avg       0.86      0.85      0.85      1598



In [17]:
from sklearn.metrics import confusion_matrix

cnf_matrix = confusion_matrix(y_test, y_pred)

In [18]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

labels = set(y_test)
      
df_cm = pd.DataFrame(cnf_matrix, range(len(labels)), range(len(labels)))
plt.figure(figsize = (10,7))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, xticklabels=labels, yticklabels=labels, annot=True, fmt='g', annot_kws={"size": 16})

<matplotlib.axes._subplots.AxesSubplot at 0x7f13df683ef0>

In [19]:
features = list(data.columns.values)
del features[0:2]

len(features)

946

In [20]:
df_sample = pd.DataFrame(columns=features)
df_sample = df_sample.append({'artigo 1018': 1}, ignore_index=True)
df_sample = df_sample.fillna(0)

df_sample

Unnamed: 0,1/1975,1/2012,1/2014,100/2015,101/2010,10144/2002,10150/2000,1025/2009,10257/2001,10391/2004,...,artigo 975,artigo 977,artigo 98,artigo 982,artigo 988,artigo 99,artigo 991,artigo 995,artigo 996,artigo 998
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
y_pred = text_clf.predict(df_sample)

y_pred

array(['EXP'], dtype=object)

In [22]:
from sklearn.externals import joblib

joblib.dump(features, '../modelos/Features-Leis.pkl', compress = 1)
joblib.dump(clf, '../modelos/Modelo-Leis.pkl', compress = 1) 

['../modelos/Modelo-Leis.pkl']

In [23]:
experiment.log_asset('../modelos/Features-Leis.pkl')
experiment.log_asset('../modelos/Modelo-Leis.pkl')

{'web': 'https://www.comet.ml/api/asset/download?assetId=37ea305598a14d0d9614daabdd279c27&experimentKey=2f6cffc20c734a9c9860444407679d33',
 'api': 'https://www.comet.ml/api/rest/v1/asset/get-asset?assetId=37ea305598a14d0d9614daabdd279c27&experimentKey=2f6cffc20c734a9c9860444407679d33'}

In [24]:
experiment.end()

COMET INFO: ----------------------------
COMET INFO: Comet.ml Experiment Summary:
COMET INFO:   Data:
COMET INFO:     url: https://www.comet.ml/piantino/igti-projeto-aplicado-leis/2f6cffc20c734a9c9860444407679d33
COMET INFO:   Metrics:
COMET INFO:               BAN-f1-score: 0.8514285714285714
COMET INFO:              BAN-precision: 0.9141104294478528
COMET INFO:                 BAN-recall: 0.7967914438502673
COMET INFO:                BAN-support: 187
COMET INFO:               CON-f1-score: 0.8825910931174088
COMET INFO:              CON-precision: 0.9083333333333333
COMET INFO:                 CON-recall: 0.8582677165354331
COMET INFO:                CON-support: 127
COMET INFO:               DAN-f1-score: 0.8517110266159698
COMET INFO:              DAN-precision: 0.9180327868852459
COMET INFO:                 DAN-recall: 0.7943262411347518
COMET INFO:                DAN-support: 141
COMET INFO:               EXP-f1-score: 0.9095890410958904
COMET INFO:              EXP-precision: 0.