In [1]:
import os
from os import walk
from os.path import join, split, splitext
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import argparse
import json
import numpy as np
import pandas as pd

# backend
import tensorflow as tf
from tensorflow.keras import backend as K

from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Conv1D, MaxPooling1D, concatenate, Concatenate, BatchNormalization, Dropout
from tensorflow.keras.models import Model, Sequential, model_from_json

from tensorflow.keras.optimizers import SGD, RMSprop
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
 
from sklearn.utils import class_weight

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle
from tqdm import trange

from pathlib import Path

In [2]:
seed = 42

# python RNG
import random
random.seed(seed)

# pytorch RNGs
import torch
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

# numpy RNG
import numpy as np
np.random.seed(seed)

# tensorflow RNG
tf.random.set_seed(seed)

In [3]:
nClasses = 6
batch_size = 64
epoch = 20

MAX_FEATURES = 70000  # Size of vocabulary
EMBEDDING_DIM = MAX_FEATURES  # Size of vocabulary
SEQUENCE_LEN = 500 # Size of input arrays
EMBEDDING_OUT = 100  # Output dim of embedding

In [4]:
output_path = Path("./models")

In [5]:
data_path = Path("/mnt/nas/backups/08-07-2020/desktopg01/lisa/Data/CSV")

In [6]:
train = pd.read_csv(data_path/"train_small.csv", usecols=['document_type', 'body'])
val = pd.read_csv(data_path/"validation_small.csv", usecols=['document_type', 'body'])
test_data = pd.read_csv(data_path/"test_small.csv", usecols=['document_type', 'body'])

In [7]:
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train['body'])
with open(join(output_path, 'tokenizer.pickle'), 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
sequences_train = tokenizer.texts_to_sequences(train['body'])
sequences_validation = tokenizer.texts_to_sequences(val['body'])
sequences_test = tokenizer.texts_to_sequences(test_data['body'])

In [8]:
X_train = sequence.pad_sequences(sequences_train, maxlen=SEQUENCE_LEN, padding='post')
X_val = sequence.pad_sequences(sequences_validation, maxlen=SEQUENCE_LEN, padding='post')
X_test = sequence.pad_sequences(sequences_test, maxlen=SEQUENCE_LEN, padding='post')

In [9]:
encoder = LabelEncoder()
 
label = train['document_type']
label = encoder.fit_transform(label)

class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(label),
                                                 label)
label = np.transpose(label)
label = to_categorical(label)

val_label = val['document_type'] 
val_label_toTest = encoder.transform(val_label)
val_label = np.transpose(val_label_toTest)
val_label = to_categorical(val_label)

test_label = test_data['document_type']
test_label_toTest = encoder.transform(test_label)
test_label = np.transpose(test_label_toTest)
test_label = to_categorical(test_label)



In [10]:
encoder.classes_, class_weights

(array(['acordao_de_2_instancia', 'agravo_em_recurso_extraordinario',
        'despacho_de_admissibilidade', 'outros', 'peticao_do_RE',
        'sentenca'], dtype=object),
 array([44.97197107,  9.76806756, 71.87716763,  0.18540788,  2.61536439,
        11.68130578]))

In [14]:
f1_base = Input(shape=(SEQUENCE_LEN, ), dtype='int32')
text_embedding = Embedding(input_dim=MAX_FEATURES, output_dim=EMBEDDING_OUT,
                           input_length=SEQUENCE_LEN)(f1_base)

filter_sizes = [3, 4, 5]
convs = []
for filter_size in filter_sizes:
    l_conv = Conv1D(filters=256, kernel_size=filter_size, padding='same', activation='relu')(text_embedding)
    l_batch = BatchNormalization()(l_conv)
    l_pool = MaxPooling1D(2)(l_conv)
    
    convs.append(l_pool)

l_merge = Concatenate(axis=1)(convs)
l_pool1 = MaxPooling1D(50)(l_merge)
l_flat = Flatten()(l_pool1)
l_dense = Dense(128, activation='relu')(l_flat)
x = Dropout(0.5)(l_dense)
x = Dense(nClasses, activation='softmax')(x)
model = Model(inputs=f1_base, outputs=x)

# determine Loss function and Optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

checkpointer = ModelCheckpoint(filepath=output_path/"stf_weights.keras", verbose=1, save_weights_only=True,
                               save_best_only=True, monitor="val_accuracy", mode="max")

In [12]:
X_train = np.array(X_train)
X_val = np.array(X_val)
X_test = np.array(X_test)

In [13]:
model.fit(
	x=(X_train), y=(label),
	batch_size=batch_size,
	epochs=epoch,
	validation_data=(X_val, val_label),
	callbacks=[checkpointer],
	class_weight={i:v for i, v in enumerate(class_weights)})

Epoch 1/20
Epoch 00001: val_accuracy improved from -inf to 0.75500, saving model to models/stf_weights.keras
Epoch 2/20
Epoch 00002: val_accuracy did not improve from 0.75500
Epoch 3/20
Epoch 00003: val_accuracy did not improve from 0.75500
Epoch 4/20
Epoch 00004: val_accuracy improved from 0.75500 to 0.84142, saving model to models/stf_weights.keras
Epoch 5/20
Epoch 00005: val_accuracy improved from 0.84142 to 0.87286, saving model to models/stf_weights.keras
Epoch 6/20
Epoch 00006: val_accuracy did not improve from 0.87286
Epoch 7/20
Epoch 00007: val_accuracy improved from 0.87286 to 0.88205, saving model to models/stf_weights.keras
Epoch 8/20
Epoch 00008: val_accuracy did not improve from 0.88205
Epoch 9/20
Epoch 00009: val_accuracy did not improve from 0.88205
Epoch 10/20
Epoch 00010: val_accuracy did not improve from 0.88205
Epoch 11/20
Epoch 00011: val_accuracy did not improve from 0.88205
Epoch 12/20
Epoch 00012: val_accuracy improved from 0.88205 to 0.88281, saving model to mod

<tensorflow.python.keras.callbacks.History at 0x7f61bc713130>

In [14]:
# Convert Model into JSON Format
model_json = model.to_json()

with open(output_path/"cnn_text.json", "w") as json_file:
   json_file.write(model_json)

In [15]:
model.load_weights(output_path/"stf_weights.keras")

In [16]:
test_predict_1 = model.predict(X_val, verbose=1)
pred_1 = test_predict_1.argmax(axis=1)

target_names = ['acordao_de_2_instancia','agravo_em_recurso_extraordinario', 'despacho_de_admissibilidade', 'outros', 'peticao_do_RE', 'sentenca']
print(classification_report(val_label_toTest, pred_1, target_names=target_names, digits=4))

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.3902    0.8261    0.5300       299
agravo_em_recurso_extraordinario     0.3731    0.7473    0.4977      2149
     despacho_de_admissibilidade     0.1739    0.8361    0.2879       183
                          outros     0.9784    0.9027    0.9390     84104
                   peticao_do_RE     0.5906    0.8006    0.6797      6364
                        sentenca     0.4926    0.8099    0.6126      1636

                        accuracy                         0.8904     94735
                       macro avg     0.4998    0.8205    0.5912     94735
                    weighted avg     0.9268    0.8904    0.9034     94735



In [17]:
test_predict_1 = model.predict(X_test, verbose=1)
pred_1 = test_predict_1.argmax(axis=1)
print(classification_report(test_label_toTest,
                            pred_1, target_names=target_names, digits=4))

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.4271    0.9341    0.5862       273
agravo_em_recurso_extraordinario     0.3369    0.7132    0.4577      1841
     despacho_de_admissibilidade     0.1745    0.8283    0.2882       198
                          outros     0.9793    0.9072    0.9419     85408
                   peticao_do_RE     0.5971    0.7852    0.6784      6331
                        sentenca     0.4783    0.8576    0.6141      1475

                        accuracy                         0.8945     95526
                       macro avg     0.4989    0.8376    0.5944     95526
                    weighted avg     0.9306    0.8945    0.9077     95526



In [13]:
model.fit(
	x=(X_train), y=(label),
	batch_size=batch_size,
	epochs=epoch,
	validation_data=(X_val, val_label),
	callbacks=[checkpointer])

Epoch 1/20
Epoch 00001: val_accuracy improved from -inf to 0.93942, saving model to models/stf_weights.keras
Epoch 2/20
Epoch 00002: val_accuracy improved from 0.93942 to 0.94322, saving model to models/stf_weights.keras
Epoch 3/20
Epoch 00003: val_accuracy improved from 0.94322 to 0.94448, saving model to models/stf_weights.keras
Epoch 4/20
Epoch 00004: val_accuracy improved from 0.94448 to 0.94614, saving model to models/stf_weights.keras
Epoch 5/20
Epoch 00005: val_accuracy did not improve from 0.94614
Epoch 6/20
Epoch 00006: val_accuracy improved from 0.94614 to 0.94680, saving model to models/stf_weights.keras
Epoch 7/20
Epoch 00007: val_accuracy improved from 0.94680 to 0.94743, saving model to models/stf_weights.keras
Epoch 8/20
Epoch 00008: val_accuracy improved from 0.94743 to 0.94787, saving model to models/stf_weights.keras
Epoch 9/20
Epoch 00009: val_accuracy did not improve from 0.94787
Epoch 10/20
Epoch 00010: val_accuracy did not improve from 0.94787
Epoch 11/20
Epoch 00

<tensorflow.python.keras.callbacks.History at 0x7f497445edf0>

In [16]:
model.load_weights(output_path/"stf_weights.keras")

In [17]:
test_predict_1 = model.predict(X_val, verbose=1)
pred_1 = test_predict_1.argmax(axis=1)

target_names = ['acordao_de_2_instancia','agravo_em_recurso_extraordinario', 'despacho_de_admissibilidade', 'outros', 'peticao_do_RE', 'sentenca']
print(classification_report(val_label_toTest, pred_1, target_names=target_names, digits=4))

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.7723    0.7826    0.7774       299
agravo_em_recurso_extraordinario     0.6479    0.5556    0.5982      2149
     despacho_de_admissibilidade     0.7229    0.6557    0.6877       183
                          outros     0.9639    0.9814    0.9726     84104
                   peticao_do_RE     0.8330    0.7172    0.7708      6364
                        sentenca     0.8691    0.6980    0.7742      1636

                        accuracy                         0.9479     94735
                       macro avg     0.8015    0.7318    0.7635     94735
                    weighted avg     0.9453    0.9479    0.9460     94735



In [18]:
test_predict_1 = model.predict(X_test, verbose=1)
pred_1 = test_predict_1.argmax(axis=1)
print(classification_report(test_label_toTest,
                            pred_1, target_names=target_names, digits=4))

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.7716    0.9158    0.8375       273
agravo_em_recurso_extraordinario     0.6331    0.5595    0.5940      1841
     despacho_de_admissibilidade     0.7566    0.5808    0.6571       198
                          outros     0.9658    0.9827    0.9742     85408
                   peticao_do_RE     0.8479    0.7060    0.7705      6331
                        sentenca     0.8591    0.7275    0.7878      1475

                        accuracy                         0.9513     95526
                       macro avg     0.8057    0.7454    0.7702     95526
                    weighted avg     0.9490    0.9513    0.9495     95526

