In [1]:
import os
from os import walk
from os.path import join, split, splitext
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import argparse
import json
import numpy as np
import pandas as pd

# backend
import tensorflow as tf
from tensorflow.keras import backend as K

from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Conv1D, MaxPooling1D, concatenate, Concatenate, BatchNormalization, Dropout
from tensorflow.keras.models import Model, Sequential, model_from_json

from tensorflow.keras.optimizers import SGD, RMSprop
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, LambdaCallback
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
 
from sklearn.utils import class_weight

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle
from tqdm import trange

from pathlib import Path

In [2]:
import random

import numpy as np
import torch

# One seed shared by every random number generator configured below.
seed = 42

# Python's built-in RNG.
random.seed(seed)

# NumPy's global RNG.
np.random.seed(seed)

# TensorFlow's global RNG.
tf.random.set_seed(seed)

# PyTorch RNGs; the CUDA generators are only seeded when a GPU is visible.
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True

In [3]:
# Width of the model's final softmax layer (one unit per document type).
nClasses = 6
# Mini-batch size and number of epochs passed to `model.fit`.
batch_size = 64
epochs = 20

# Vocabulary cap: used as the tokenizer's `num_words` and the embedding's `input_dim`.
MAX_FEATURES = 70000
# NOTE(review): not referenced in the visible cells; despite the name this holds
# the vocabulary size, not an embedding width — confirm whether it is still needed.
EMBEDDING_DIM = MAX_FEATURES
# Fixed token length every document is padded/truncated to; also the model's input length.
SEQUENCE_LEN = 500
# Dimensionality of each learned word vector (embedding `output_dim`).
EMBEDDING_OUT = 100

In [4]:
# Directory that receives every artifact this notebook writes
# (tokenizer pickle, model JSON, weight checkpoints).
output_path = Path("./models")
# Create it up front: the tokenizer pickle is written with a plain `open()`,
# which raises FileNotFoundError when the directory does not exist yet.
output_path.mkdir(parents=True, exist_ok=True)

In [5]:
# Folder holding the train/validation/test CSV files.
# The default is the original machine-specific location; set the STF_DATA_DIR
# environment variable to run the notebook against a different copy of the data.
data_path = Path(os.environ.get(
    "STF_DATA_DIR",
    "/mnt/nas/backups/08-07-2020/desktopg01/lisa/Data/CSV",
))

In [6]:
# Only the label and the raw text are needed from each split.
_wanted_columns = ['document_type', 'body']

# Files are named "<split>_small.csv"; unpack in train / validation / test order.
train, val, test_data = (
    pd.read_csv(data_path/f"{split_name}_small.csv", usecols=_wanted_columns)
    for split_name in ("train", "validation", "test")
)

In [7]:
# Fit the vocabulary on the training text only, so the validation and test
# splits cannot influence which words are kept.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train['body'])

# Persist the fitted tokenizer next to the model artifacts.
tokenizer_file = output_path / 'tokenizer.pickle'
with tokenizer_file.open('wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Encode each split's text as lists of integer word ids (train, validation, test).
sequences_train, sequences_validation, sequences_test = (
    tokenizer.texts_to_sequences(frame['body'])
    for frame in (train, val, test_data)
)

In [8]:
def _pad_to_fixed_length(token_sequences):
    """Pad (at the end) or cut each id sequence to exactly SEQUENCE_LEN entries."""
    # NOTE(review): `truncating` is left at the library default — confirm which
    # end of over-long documents is meant to be dropped.
    return sequence.pad_sequences(token_sequences, maxlen=SEQUENCE_LEN, padding='post')


X_train = _pad_to_fixed_length(sequences_train)
X_val = _pad_to_fixed_length(sequences_validation)
X_test = _pad_to_fixed_length(sequences_test)

In [9]:
# Map the string document types to integer ids; the mapping is learned from the
# training split and then re-used unchanged for validation and test.
encoder = LabelEncoder()
label = encoder.fit_transform(train['document_type'])

# Width of every one-hot matrix. Passing it explicitly keeps the validation and
# test targets the same shape as the training targets even if one of those
# splits happens not to contain the highest class id.
n_label_classes = len(encoder.classes_)

# "balanced" weights for the training labels, later handed to `model.fit`.
# Arguments are passed by keyword: recent scikit-learn releases no longer
# accept them positionally.
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                 classes=np.unique(label),
                                                 y=label)
label = to_categorical(label, num_classes=n_label_classes)

# `*_toTest` keeps the integer ids (used by sklearn metrics); the plain name
# holds the one-hot version (used as the Keras target).
val_label_toTest = encoder.transform(val['document_type'])
val_label = to_categorical(val_label_toTest, num_classes=n_label_classes)

test_label_toTest = encoder.transform(test_data['document_type'])
test_label = to_categorical(test_label_toTest, num_classes=n_label_classes)



In [10]:
encoder.classes_, class_weights

(array(['acordao_de_2_instancia', 'agravo_em_recurso_extraordinario',
        'despacho_de_admissibilidade', 'outros', 'peticao_do_RE',
        'sentenca'], dtype=object),
 array([44.97197107,  9.76806756, 71.87716763,  0.18540788,  2.61536439,
        11.68130578]))

In [11]:
class F1History(tf.keras.callbacks.Callback):
    """Keras callback that records the macro-averaged F1 score after each epoch.

    The score is written to the epoch's ``logs`` dict under the key
    ``'F1_score_val'``, which is how it becomes available afterwards in
    ``History.history['F1_score_val']``.

    Args:
        validation: pair ``(X, y)``. ``X`` is fed to ``model.predict``; ``y``
            holds the integer class ids that are compared with the argmax of
            the predictions.
    """

    def __init__(self, validation):
        super().__init__()
        self.validation = validation

    def on_epoch_end(self, epoch, logs=None):
        """Compute macro F1 on the stored validation pair and store it in ``logs``.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: metrics dict supplied by Keras; updated in place.
        """
        # A `{}` default would be one dict shared by every call; use None and
        # create a fresh dict when the caller supplies nothing.
        if logs is None:
            logs = {}
        X_valid, y_valid = self.validation[0], self.validation[1]
        # verbose=0: avoid an extra progress bar inside every training epoch.
        y_val_pred = self.model.predict(X_valid, verbose=0).argmax(axis=1)
        val_score = f1_score(y_valid, y_val_pred, average="macro")
        # Store a plain Python float (rounded to 5 decimals) rather than a numpy scalar.
        logs['F1_score_val'] = float(np.round(val_score, 5))

In [20]:
def get_model(use_batch_norm=False):
    """Build the multi-branch 1-D CNN document classifier (uncompiled).

    Token ids are embedded, passed through three parallel convolutions with
    kernel sizes 3, 4 and 5, max-pooled, concatenated along the sequence axis,
    pooled again, flattened and classified by a dense softmax head.

    Args:
        use_batch_norm: when True, a ``BatchNormalization`` layer is inserted
            between each convolution and its pooling layer. The previous code
            created such a layer but pooled the un-normalised convolution
            output, so it never became part of the model. The default (False)
            therefore reproduces the architecture that was actually trained,
            keeping previously saved weights loadable.

    Returns:
        A Keras ``Model`` mapping ``(SEQUENCE_LEN,)`` int32 inputs to
        ``nClasses`` softmax probabilities.
    """
    f1_base = Input(shape=(SEQUENCE_LEN, ), dtype='int32')
    text_embedding = Embedding(input_dim=MAX_FEATURES, output_dim=EMBEDDING_OUT,
                               input_length=SEQUENCE_LEN)(f1_base)

    # One convolutional branch per n-gram width.
    filter_sizes = [3, 4, 5]
    convs = []
    for filter_size in filter_sizes:
        branch = Conv1D(filters=256, kernel_size=filter_size, padding='same', activation='relu')(text_embedding)
        if use_batch_norm:
            # Feed the normalised activations forward (not the raw conv output).
            branch = BatchNormalization()(branch)
        convs.append(MaxPooling1D(2)(branch))

    # Branches are stacked along the time axis, then pooled once more.
    l_merge = Concatenate(axis=1)(convs)
    l_pool1 = MaxPooling1D(50)(l_merge)
    l_flat = Flatten()(l_pool1)
    l_dense = Dense(128, activation='relu')(l_flat)
    x = Dropout(0.5)(l_dense)
    x = Dense(nClasses, activation='softmax')(x)
    return Model(inputs=f1_base, outputs=x)

In [13]:
# Make sure every padded split is a NumPy array before it is handed to Keras.
X_train, X_val, X_test = (
    np.array(padded_split) for padded_split in (X_train, X_val, X_test)
)

In [None]:
# --- Run 1: training WITH class weights ---
model = get_model()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Weights-only checkpoint written after every epoch; the epoch number in the
# file name is zero-padded to two digits.
checkpointer = ModelCheckpoint(
    filepath=output_path/"keras/stf_weights_{epoch:02d}.keras",
    verbose=1,
    save_weights_only=True,
)

# class id -> weight, in the order produced by `compute_class_weight`.
per_class_weight = dict(enumerate(class_weights))

# F1History receives the integer validation labels; Keras itself validates
# against the one-hot `val_label`.
training_callbacks = [F1History((X_val, val_label_toTest)), checkpointer]

hist = model.fit(
    x=X_train,
    y=label,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, val_label),
    callbacks=training_callbacks,
    class_weight=per_class_weight,
)

Epoch 1/20
Epoch 00001: saving model to models/keras/stf_weights_01.keras
Epoch 2/20
Epoch 00002: saving model to models/keras/stf_weights_02.keras
Epoch 3/20
Epoch 00003: saving model to models/keras/stf_weights_03.keras
Epoch 4/20
Epoch 00004: saving model to models/keras/stf_weights_04.keras
Epoch 5/20
Epoch 00005: saving model to models/keras/stf_weights_05.keras
Epoch 6/20
Epoch 00006: saving model to models/keras/stf_weights_06.keras
Epoch 7/20
Epoch 00007: saving model to models/keras/stf_weights_07.keras
Epoch 8/20
Epoch 00008: saving model to models/keras/stf_weights_08.keras
Epoch 9/20
Epoch 00009: saving model to models/keras/stf_weights_09.keras
Epoch 10/20
Epoch 00010: saving model to models/keras/stf_weights_10.keras
Epoch 11/20
Epoch 00011: saving model to models/keras/stf_weights_11.keras
Epoch 12/20
Epoch 00012: saving model to models/keras/stf_weights_12.keras
Epoch 13/20
Epoch 00013: saving model to models/keras/stf_weights_13.keras
Epoch 14/20
Epoch 00014: saving mo

In [114]:
# Convert Model into JSON Format
model_json = model.to_json()

with open(output_path/"cnn_text.json", "w") as json_file:
   json_file.write(model_json)

In [119]:
# argmax is 0-based while checkpoint files are numbered from 1, hence the +1.
f1_per_epoch = hist.history["F1_score_val"]
best_epoch = np.argmax(f1_per_epoch) + 1
best_epoch

16

In [120]:
# Restore the checkpoint of the best epoch. Checkpoints are written with a
# two-digit, zero-padded epoch number ("..._05.keras"), so the same padding must
# be used here; an unpadded number only matches for epochs >= 10.
model.load_weights(output_path/f"keras/stf_weights_{int(best_epoch):02d}.keras")

In [121]:
# Store the currently loaded weights under a fixed file name, independent of
# the epoch number.
model.save_weights(output_path/"stf_weights.keras")

In [14]:
# Load the weights from the fixed-name file; `model` must already exist and have
# the matching architecture.
model.load_weights(output_path/"stf_weights.keras")

In [15]:
# Predict class probabilities for the validation split and take the most
# likely class per document.
test_predict_1 = model.predict(X_val, verbose=1)
pred_1 = test_predict_1.argmax(axis=1)

# Row names come from the fitted encoder, so they always match the integer ids
# it assigned (a hand-written list can silently drift out of order).
target_names = list(encoder.classes_)
# Explicit `labels` keeps the report well-defined even if a class is absent
# from both the true and the predicted labels.
print(classification_report(val_label_toTest, pred_1,
                            labels=list(range(len(target_names))),
                            target_names=target_names, digits=4))

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.4346    0.8227    0.5688       299
agravo_em_recurso_extraordinario     0.4695    0.6705    0.5523      2149
     despacho_de_admissibilidade     0.3720    0.7541    0.4982       183
                          outros     0.9755    0.9173    0.9455     84104
                   peticao_do_RE     0.5561    0.8047    0.6577      6364
                        sentenca     0.5282    0.7855    0.6316      1636

                        accuracy                         0.9013     94735
                       macro avg     0.5560    0.7925    0.6424     94735
                    weighted avg     0.9253    0.9013    0.9098     94735



In [None]:
# --- Run 2: same model, trained WITHOUT class weights ---
# Drop the previous run's Keras state before building a fresh model.
tf.keras.backend.clear_session()
model = get_model()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Weights-only checkpoint written after every epoch; the epoch number in the
# file name is zero-padded to two digits.
checkpointer = ModelCheckpoint(
    filepath=output_path/"keras/stf_no_weights_{epoch:02d}.keras",
    verbose=1,
    save_weights_only=True,
)

# F1History receives the integer validation labels; Keras itself validates
# against the one-hot `val_label`.
training_callbacks = [F1History((X_val, val_label_toTest)), checkpointer]

hist = model.fit(
    x=X_train,
    y=label,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, val_label),
    callbacks=training_callbacks,
)

In [30]:
hist.history["F1_score_val"]

[0.64365,
 0.74248,
 0.75294,
 0.7607,
 0.75816,
 0.76247,
 0.76618,
 0.76655,
 0.75043,
 0.76407,
 0.7587,
 0.75964,
 0.76295,
 0.76531,
 0.76045,
 0.74962,
 0.76908,
 0.75315,
 0.77141,
 0.76225]

In [31]:
# argmax is 0-based while checkpoint files are numbered from 1, hence the +1.
f1_per_epoch = hist.history["F1_score_val"]
best_epoch = np.argmax(f1_per_epoch) + 1
best_epoch

19

In [32]:
# Restore the checkpoint of the best epoch. Checkpoints are written with a
# two-digit, zero-padded epoch number ("..._05.keras"), so the same padding must
# be used here; an unpadded number only matches for epochs >= 10.
model.load_weights(output_path/f"keras/stf_no_weights_{int(best_epoch):02d}.keras")

In [33]:
# Store the currently loaded weights under a fixed file name, independent of
# the epoch number.
model.save_weights(output_path/"stf_no_weights.keras")

In [34]:
# Predict class probabilities for the validation split and take the most
# likely class per document.
test_predict_1 = model.predict(X_val, verbose=1)
pred_1 = test_predict_1.argmax(axis=1)

# Row names come from the fitted encoder, so they always match the integer ids
# it assigned (a hand-written list can silently drift out of order).
target_names = list(encoder.classes_)
# Explicit `labels` keeps the report well-defined even if a class is absent
# from both the true and the predicted labels.
print(classification_report(val_label_toTest, pred_1,
                            labels=list(range(len(target_names))),
                            target_names=target_names, digits=4))

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.9116    0.7592    0.8285       299
agravo_em_recurso_extraordinario     0.7504    0.4742    0.5811      2149
     despacho_de_admissibilidade     0.7727    0.6503    0.7062       183
                          outros     0.9629    0.9797    0.9712     84104
                   peticao_do_RE     0.7645    0.7456    0.7549      6364
                        sentenca     0.9285    0.6822    0.7865      1636

                        accuracy                         0.9460     94735
                       macro avg     0.8484    0.7152    0.7714     94735
                    weighted avg     0.9437    0.9460    0.9437     94735



In [35]:
# Final check on the held-out test split: most likely class per document,
# compared against the integer test labels.
test_predict_1 = model.predict(X_test, verbose=1)
pred_1 = test_predict_1.argmax(axis=1)
test_report = classification_report(
    test_label_toTest,
    pred_1,
    target_names=target_names,
    digits=4,
)
print(test_report)

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.9132    0.8864    0.8996       273
agravo_em_recurso_extraordinario     0.7114    0.4579    0.5572      1841
     despacho_de_admissibilidade     0.7535    0.5404    0.6294       198
                          outros     0.9651    0.9813    0.9731     85408
                   peticao_do_RE     0.7804    0.7329    0.7559      6331
                        sentenca     0.9191    0.7166    0.8053      1475

                        accuracy                         0.9494     95526
                       macro avg     0.8405    0.7193    0.7701     95526
                    weighted avg     0.9467    0.9494    0.9472     95526

