In [4]:
import os
from os import walk
from os.path import join, split, splitext
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import argparse
import json
import numpy as np
import pandas as pd

# backend
import tensorflow as tf
from tensorflow.keras import backend as K

from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Conv1D, MaxPooling1D, concatenate, Concatenate, BatchNormalization, Dropout
from tensorflow.keras.models import Model, Sequential, model_from_json

from tensorflow.keras.optimizers import SGD, RMSprop
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
 
from sklearn.utils import class_weight

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle
from tqdm import trange

from pathlib import Path

In [5]:
# --- Training configuration -------------------------------------------------
nClasses = 6       # number of target classes
batch_size = 64    # samples per gradient update
epoch = 20         # number of training epochs

# --- Text / embedding configuration -----------------------------------------
MAX_FEATURES = 70_000          # size of the vocabulary
EMBEDDING_DIM = MAX_FEATURES   # kept equal to the vocabulary size
SEQUENCE_LEN = 500             # length of every input array
UNITS = 100                    # number of output cells for recurrent models
EMBEDDING_OUT = 100            # output dimension of the embedding

In [6]:
# Directory that receives the pickled tokenizer, checkpoints and exported model.
output_path = Path("./models")
# Create it up front: the later open()/save calls write into this directory
# and fail with FileNotFoundError on a fresh checkout where it does not exist.
output_path.mkdir(parents=True, exist_ok=True)

In [8]:
# Folder containing the train/validation/test CSV splits.
# The default is a machine-specific mount; export LISA_DATA_DIR to point the
# notebook at another location without editing this cell.
data_path = Path(os.environ.get(
    "LISA_DATA_DIR",
    "/mnt/nas/backups/08-07-2020/desktopg01/lisa/Data/CSV",
))

In [9]:
# Load the three splits, keeping only the label column and the raw text.
train, val, test_data = (
    pd.read_csv(data_path / csv_name, usecols=['document_type', 'body'])
    for csv_name in ("train_small.csv", "validation_small.csv", "test_small.csv")
)

In [11]:
# Build the vocabulary from the training texts only, capped at MAX_FEATURES words.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train['body'])

# Persist the fitted tokenizer alongside the other model artefacts.
with open(join(output_path, 'tokenizer.pickle'), 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Encode every split with the same (train-fitted) vocabulary.
sequences_train, sequences_validation, sequences_test = (
    tokenizer.texts_to_sequences(frame['body'])
    for frame in (train, val, test_data)
)

In [12]:
# Bring every document to exactly SEQUENCE_LEN token ids, padding at the end.
X_train, X_val, X_test = (
    sequence.pad_sequences(token_ids, maxlen=SEQUENCE_LEN, padding='post')
    for token_ids in (sequences_train, sequences_validation, sequences_test)
)

In [15]:
# Fit the label encoder ONCE, on the training labels, so that all three splits
# share a single class-name -> integer mapping.
encoder = LabelEncoder()
label = encoder.fit_transform(train['document_type'])

# 'balanced' class weights computed from the integer-encoded training labels.
# Keyword arguments are required by current scikit-learn releases (the
# positional form raises TypeError there) and are accepted by older ones.
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=np.unique(label),
                                                  y=label)

# One-hot encode with a fixed width so the target matrix always matches the
# nClasses-wide softmax output, even if a split lacks the highest class index.
label = to_categorical(label, num_classes=nClasses)

# Use transform() rather than fit_transform() for the held-out splits:
# refitting would rebuild the mapping from each split's own labels, and a
# split missing a class would silently shift the remaining indices.
# transform() raises ValueError if a split contains a label unseen in training.
val_label_toTest = encoder.transform(val['document_type'])
val_label = to_categorical(val_label_toTest, num_classes=nClasses)

test_label_toTest = encoder.transform(test_data['document_type'])
test_label = to_categorical(test_label_toTest, num_classes=nClasses)

In [18]:
# Multi-branch 1D-CNN text classifier over padded token-id sequences.
f1_base = Input(shape=(SEQUENCE_LEN, ), dtype='int32')
text_embedding = Embedding(input_dim=MAX_FEATURES, output_dim=EMBEDDING_OUT,
                           input_length=SEQUENCE_LEN)(f1_base)

# One convolutional branch per kernel width.
filter_sizes = [3, 4, 5]
convs = []
for filter_size in filter_sizes:
    l_conv = Conv1D(filters=256, kernel_size=filter_size, padding='same', activation='relu')(text_embedding)
    l_batch = BatchNormalization()(l_conv)
    # Pool the batch-normalised activations. The original pooled `l_conv`,
    # which left the BatchNormalization layer disconnected from the graph.
    l_pool = MaxPooling1D(2)(l_batch)

    convs.append(l_pool)

# Branches are stacked along the time axis (axis=1), then pooled again
# before the dense classification head.
l_merge = Concatenate(axis=1)(convs)
l_pool1 = MaxPooling1D(50)(l_merge)
l_flat = Flatten()(l_pool1)
l_dense = Dense(128, activation='relu')(l_flat)
x = Dropout(0.5)(l_dense)
x = Dense(nClasses, activation='softmax')(x)
model = Model(inputs=f1_base, outputs=x)

# determine Loss function and Optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Save the weights (not the full model) each time the checkpoint callback fires.
checkpointer = ModelCheckpoint(filepath=output_path/"stf_weights.keras", verbose=1, save_weights_only=True)



In [20]:
# Make sure all three padded splits are NumPy arrays before training.
X_train, X_val, X_test = (np.array(split) for split in (X_train, X_val, X_test))

In [None]:
# Keras takes class weights as a {class_index: weight} mapping.
class_weight_by_index = dict(enumerate(class_weights))

# Train on the training split, validating on the validation split each epoch
# and running the weight checkpoint callback.
model.fit(
    x=X_train,
    y=label,
    batch_size=batch_size,
    epochs=epoch,
    validation_data=(X_val, val_label),
    callbacks=[checkpointer],
    class_weight=class_weight_by_index,
)

Epoch 1/20
Epoch 2/20
Epoch 00002: saving model to models/stf_weights.keras
Epoch 3/20

In [None]:
# Serialise the model architecture to JSON and store it next to the weights.
model_json = model.to_json()

architecture_file = output_path / "cnn_text.json"
with architecture_file.open("w") as json_file:
    json_file.write(model_json)

In [None]:
# Store the final weights in the shared output directory. The original
# referenced `path`, a name never defined in this notebook (NameError);
# `output_path` is the directory every other artefact is written to.
model.save_weights(output_path/"cnn_text.h5")

In [None]:
# Predict class probabilities for the validation split and take, per row,
# the index of the highest-scoring class.
test_predict_1 = model.predict(X_val, verbose=1)
pred_1 = np.argmax(test_predict_1, axis=1)

# Display names for the classes, listed in encoded-label order.
target_names = [
    'acordao_de_2_instancia',
    'agravo_em_recurso_extraordinario',
    'despacho_de_admissibilidade',
    'outros',
    'peticao_do_RE',
    'sentenca',
]

validation_report = classification_report(
    val_label_toTest,
    pred_1,
    target_names=target_names,
    digits=4,
)
print(validation_report)

In [None]:
# Same evaluation as for validation, this time on the held-out test split.
test_predict_1 = model.predict(X_test, verbose=1)
pred_1 = np.argmax(test_predict_1, axis=1)

test_report = classification_report(
    test_label_toTest,
    pred_1,
    target_names=target_names,
    digits=4,
)
print(test_report)