In [2]:
import os
from os import walk
from os.path import join, split, splitext
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import argparse
import json
import numpy as np
import pandas as pd

# backend
import tensorflow as tf
from tensorflow.keras import backend as K

from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Conv1D, MaxPooling1D, concatenate, Concatenate, BatchNormalization, Dropout
from tensorflow.keras.models import Model, Sequential, model_from_json

from tensorflow.keras.optimizers import SGD, RMSprop
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, LambdaCallback
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
 
from sklearn.utils import class_weight

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle
from tqdm import trange

from pathlib import Path

In [4]:
seed = 42

# python RNG
import random
random.seed(seed)

# pytorch RNGs
# import torch
# torch.manual_seed(seed)
# torch.backends.cudnn.deterministic = True
# if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

# numpy RNG
import numpy as np
np.random.seed(seed)

# tensorflow RNG
tf.random.set_seed(seed)

In [79]:
nClasses = 2
batch_size = 64
epochs = 20

MAX_FEATURES = 70000  # Size of vocabulary
EMBEDDING_DIM = MAX_FEATURES  # Size of vocabulary
SEQUENCE_LEN = 500 # Size of input arrays
EMBEDDING_OUT = 100  # Output dim of embedding

In [80]:
output_path = Path("./models")

In [81]:
data_path = Path("/mnt/nas/databases/Tobacco800/unziped/")

In [82]:
!ls /mnt/nas/databases/Tobacco800/unziped/

README	page_imgs  test.csv  train.csv


In [83]:
train = pd.read_csv(data_path/"train.csv", delimiter=';',  usecols=['binder','docid','class', 'text'])
train.rename(columns={'text':'body'}, inplace=True)
print(train.shape)
train.dropna(inplace=True)
print(train.shape)

(1031, 4)
(1019, 4)


In [84]:
val = train.iloc[-200:,:]; print(val.shape); val.head()

(200, 4)


Unnamed: 0,binder,docid,class,body
830,Tobacco800,pkc56d00,FirstPage,".>->aa, Mailand 20014 / (301) 654-3400 ..."
831,Tobacco800,pkj90c00,FirstPage,AMERICAN '93 THE NEETROnilER FIELD SALES INFOR...
832,Tobacco800,ply60e00,FirstPage,LORILLARD INC. • ONE PARK AVENUE. NEW YORK. N....
833,Tobacco800,pmx82f00-page04_1,FirstPage,4 4 -/ NEWELL W. ELLISON H. THOMAS AUSTERN ...
834,Tobacco800,pmx82f00-page04_2,NextPage,COVINGTON & BURLING CONF...


In [85]:
train = train.iloc[:-200,:]; print(train.shape);train.head()

(819, 4)


Unnamed: 0,binder,docid,class,body
0,Tobacco800,aah97e00-page02_1,FirstPage,"Dr. M.A. Manzelli, PHILIP MORRIS INC., Researc..."
1,Tobacco800,aah97e00-page02_2,NextPage,- 2 - Please let me krow if you have any chang...
2,Tobacco800,aam09c00,FirstPage,I NOIJ-04-97 13 = 25 FROM = I D : PAGE 10/1...
3,Tobacco800,aao54e00_1,FirstPage,i PHILIP .MORRIS INCORPORATED 120 PARK...
4,Tobacco800,aao54e00_2,NextPage,"In the meantime, I hope you and your friends a..."


In [86]:
test_data = pd.read_csv(data_path/"test.csv", delimiter=';',  usecols=['binder','docid','class', 'text'])
test_data.rename(columns={'text':'body'}, inplace=True)
print(test_data.shape)
test_data.head()
test_data.dropna(inplace=True)

(259, 4)


In [87]:
for index, row in train.iterrows():
    if not isinstance(row.body, str):
        print(row.body)

In [88]:
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train['body'])
with open(join(output_path, 'tokenizer.pickle'), 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
sequences_train = tokenizer.texts_to_sequences(train['body'])
sequences_validation = tokenizer.texts_to_sequences(val['body'])
sequences_test = tokenizer.texts_to_sequences(test_data['body'])

In [89]:
X_train = sequence.pad_sequences(sequences_train, maxlen=SEQUENCE_LEN, padding='post')
X_val = sequence.pad_sequences(sequences_validation, maxlen=SEQUENCE_LEN, padding='post')
X_test = sequence.pad_sequences(sequences_test, maxlen=SEQUENCE_LEN, padding='post')

In [90]:
encoder = LabelEncoder()
 
label = train['class']
label = encoder.fit_transform(label)

#class_weights = class_weight.compute_class_weight('balanced',
#                                                 np.unique(label),
#                                                 label)
label = np.transpose(label)
label = to_categorical(label)

val_label = val['class'] 
val_label_toTest = encoder.transform(val_label)
val_label = np.transpose(val_label_toTest)
val_label = to_categorical(val_label)

test_label = test_data['class']
test_label_toTest = encoder.transform(test_label)
test_label = np.transpose(test_label_toTest)
test_label = to_categorical(test_label)

In [93]:
#encoder.classes_, class_weights

In [94]:
class F1History(tf.keras.callbacks.Callback):

    def __init__(self, validation):
        super(F1History, self).__init__()
        self.validation = validation
        
    def on_epoch_end(self, epoch, logs={}):
        logs['F1_score_val'] = float('-inf')
        X_valid, y_valid = self.validation[0], self.validation[1]
        y_val_pred = self.model.predict(X_valid).argmax(axis=1)
        val_score = f1_score(y_valid, y_val_pred, average="macro")
        logs['F1_score_val'] = np.round(val_score, 5)

In [95]:
def get_model():
    f1_base = Input(shape=(SEQUENCE_LEN, ), dtype='int32')
    text_embedding = Embedding(input_dim=MAX_FEATURES, output_dim=EMBEDDING_OUT,
                               input_length=SEQUENCE_LEN)(f1_base)

    filter_sizes = [3, 4, 5]
    convs = []
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=256, kernel_size=filter_size, padding='same', activation='relu')(text_embedding)
        l_batch = BatchNormalization()(l_conv)
        l_pool = MaxPooling1D(2)(l_conv)

        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)
    l_pool1 = MaxPooling1D(50)(l_merge)
    l_flat = Flatten()(l_pool1)
    l_dense = Dense(128, activation='relu')(l_flat)
    x = Dropout(0.5)(l_dense)
    x = Dense(nClasses, activation='softmax')(x)
    return Model(inputs=f1_base, outputs=x)

In [96]:
X_train = np.array(X_train)
X_val = np.array(X_val)
X_test = np.array(X_test)

In [97]:
model = get_model()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

checkpointer = ModelCheckpoint(filepath=output_path/"keras/stf_weights_{epoch:02d}.keras",
                               verbose=1, save_weights_only=True)

hist =  model.fit(
	x=(X_train), y=(label),
	batch_size=batch_size,
	epochs=epochs,
	validation_data=(X_val, val_label),
	callbacks=[F1History((X_val,val_label_toTest)),
               checkpointer],
	class_weight={i:v for i, v in enumerate(class_weights)})

NameError: name 'class_weights' is not defined

In [119]:
best_epoch = np.argmax(hist.history["F1_score_val"]) +1; best_epoch

16

In [120]:
model.load_weights(output_path/f"keras/stf_weights_{best_epoch}.keras")

In [121]:
model.save_weights(output_path/"stf_weights.keras")

In [14]:
model.load_weights(output_path/"stf_weights.keras")

In [15]:
test_predict_1 = model.predict(X_val, verbose=1)
pred_1 = test_predict_1.argmax(axis=1)

target_names = ['acordao_de_2_instancia','agravo_em_recurso_extraordinario', 'despacho_de_admissibilidade', 'outros', 'peticao_do_RE', 'sentenca']
print(classification_report(val_label_toTest, pred_1, target_names=target_names, digits=4))

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.4346    0.8227    0.5688       299
agravo_em_recurso_extraordinario     0.4695    0.6705    0.5523      2149
     despacho_de_admissibilidade     0.3720    0.7541    0.4982       183
                          outros     0.9755    0.9173    0.9455     84104
                   peticao_do_RE     0.5561    0.8047    0.6577      6364
                        sentenca     0.5282    0.7855    0.6316      1636

                        accuracy                         0.9013     94735
                       macro avg     0.5560    0.7925    0.6424     94735
                    weighted avg     0.9253    0.9013    0.9098     94735



In [99]:
tf.keras.backend.clear_session() 
model = get_model()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

checkpointer = ModelCheckpoint(filepath=output_path/"keras/stf_no_weights_{epoch:02d}.keras",
                               verbose=1, save_weights_only=True)

hist =  model.fit(
	x=(X_train), y=(label),
	batch_size=batch_size,
	epochs=epochs,
	validation_data=(X_val, val_label),
	callbacks=[F1History((X_val,val_label_toTest)),
               checkpointer])

Epoch 1/20

Epoch 00001: saving model to models/keras/stf_no_weights_01.keras
Epoch 2/20

Epoch 00002: saving model to models/keras/stf_no_weights_02.keras
Epoch 3/20

Epoch 00003: saving model to models/keras/stf_no_weights_03.keras
Epoch 4/20

Epoch 00004: saving model to models/keras/stf_no_weights_04.keras
Epoch 5/20

Epoch 00005: saving model to models/keras/stf_no_weights_05.keras
Epoch 6/20

Epoch 00006: saving model to models/keras/stf_no_weights_06.keras
Epoch 7/20

Epoch 00007: saving model to models/keras/stf_no_weights_07.keras
Epoch 8/20

Epoch 00008: saving model to models/keras/stf_no_weights_08.keras
Epoch 9/20

Epoch 00009: saving model to models/keras/stf_no_weights_09.keras
Epoch 10/20

Epoch 00010: saving model to models/keras/stf_no_weights_10.keras
Epoch 11/20

Epoch 00011: saving model to models/keras/stf_no_weights_11.keras
Epoch 12/20

Epoch 00012: saving model to models/keras/stf_no_weights_12.keras
Epoch 13/20

Epoch 00013: saving model to models/keras/stf_no

In [100]:
hist.history["F1_score_val"]

[0.31507,
 0.52038,
 0.68149,
 0.75978,
 0.74356,
 0.78499,
 0.77454,
 0.78344,
 0.77472,
 0.77945,
 0.76979,
 0.76991,
 0.76942,
 0.76963,
 0.76429,
 0.76942,
 0.76998,
 0.76963,
 0.76991,
 0.76991]

In [101]:
best_epoch = np.argmax(hist.history["F1_score_val"]) +1; best_epoch

6

In [105]:
model.load_weights(output_path/f"keras/stf_no_weights_{best_epoch:02d}.keras")

In [106]:
model.save_weights(output_path/"stf_no_weights.keras")

In [107]:
test_predict_1 = model.predict(X_val, verbose=1)
pred_1 = test_predict_1.argmax(axis=1)

target_names = ['SD','ND']
print(classification_report(val_label_toTest, pred_1, target_names=target_names, digits=4))

              precision    recall  f1-score   support

          SD     0.7290    0.8478    0.7839        92
          ND     0.8495    0.7315    0.7861       108

    accuracy                         0.7850       200
   macro avg     0.7892    0.7897    0.7850       200
weighted avg     0.7940    0.7850    0.7851       200



In [108]:
test_predict_1 = model.predict(X_test, verbose=1)
pred_1 = test_predict_1.argmax(axis=1)
print(classification_report(test_label_toTest,
                            pred_1, target_names=target_names, digits=4))

              precision    recall  f1-score   support

          SD     0.8354    0.9133    0.8726       150
          ND     0.8632    0.7523    0.8039       109

    accuracy                         0.8456       259
   macro avg     0.8493    0.8328    0.8383       259
weighted avg     0.8471    0.8456    0.8437       259



In [109]:
# Convert Model into JSON Format
model_json = model.to_json()

with open(output_path/"cnn_text.json", "w") as json_file:
   json_file.write(model_json)