In [2]:
%load_ext autoreload
%autoreload 2

from fastai.text.all import *
import tensorflow as tf
from tensorflow.keras.models import model_from_json
from tensorflow.keras.layers import Input
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from os.path import join, split, splitext
import tqdm

In [3]:
# Single seed shared by every random number generator seeded in this cell.
seed = 42

# python RNG
import random
random.seed(seed)

# numpy RNG
import numpy as np
np.random.seed(seed)

# pytorch RNGs (CPU generator, plus all CUDA devices when CUDA is available)
import torch
torch.manual_seed(seed)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

# cuDNN: request deterministic kernels AND turn off the benchmark auto-tuner.
# With benchmarking enabled cuDNN may pick a different algorithm from run to
# run, which defeats the deterministic flag on its own.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# tensorflow RNG
tf.random.set_seed(seed)

In [4]:
# Length every token sequence is padded/truncated to before it is fed to the model.
SEQUENCE_LEN = 500 # Size of input arrays

In [5]:
# Serialized model artifacts, all located under ./models/.
models_path = Path("./models/")
weights_path = models_path.joinpath("stf_no_weights.keras")  # weights file
json_path = models_path.joinpath("cnn_text.json")            # architecture as JSON
tokenizer_path = models_path.joinpath("tokenizer.pickle")    # pickled tokenizer

In [6]:
# Rebuild the network architecture from its JSON description.
# The file is read inside a `with` block so the handle is closed even if
# reading raises.
with open(json_path, 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)

In [7]:
# Load the stored weights from `weights_path` into the model.
model.load_weights(weights_path)

In [8]:
# Configure the model with categorical cross-entropy loss, the adam optimizer and an accuracy metric.
# NOTE(review): the cells in this notebook only call predict(); confirm whether compiling is required.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [9]:
# Directory containing the CSV dataset splits.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
data_path = Path("/mnt/nas/backups/08-07-2020/desktopg01/lisa/Data/CSV")

In [10]:
# Read the train / validation / test CSV splits from `data_path`, in that order.
train, val, test_data = (
    pd.read_csv(data_path / csv_name)
    for csv_name in ("train_small.csv", "validation_small.csv", "test_small.csv")
)

In [11]:
# Restore the tokenizer object from its pickle file.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
with tokenizer_path.open('rb') as pickled:
    tokenizer = pickle.load(pickled, encoding="utf-8")

In [12]:
# Turn the `body` column of each split into sequences with the loaded tokenizer.
sequences_train, sequences_validation, sequences_test = (
    tokenizer.texts_to_sequences(split['body'])
    for split in (train, val, test_data)
)

In [13]:
# Bring every sequence to SEQUENCE_LEN entries, padding at the end ('post').
# NOTE(review): `truncating` is left at its default; confirm which end of an
# over-long sequence gets dropped and that it matches how the model was trained.
X_train, X_val, X_test = (
    sequence.pad_sequences(seqs, maxlen=SEQUENCE_LEN, padding='post')
    for seqs in (sequences_train, sequences_validation, sequences_test)
)

In [14]:
# Encoder used to map the `document_type` labels to integer class ids.
encoder = LabelEncoder()

In [15]:
# Fit the label encoder ONCE, on the training labels, and reuse that mapping
# for the other splits. Calling fit_transform per split re-fits the encoder
# each time, so a split that lacks one class would silently get different
# integer ids (and a narrower one-hot matrix) than the training set.
# transform() raises on a label that was not seen during fitting.
train_label_toTest = encoder.fit_transform(train['document_type'])
num_classes = len(encoder.classes_)
# One-hot encode with a fixed width so all three matrices share the same columns.
train_label = to_categorical(train_label_toTest, num_classes=num_classes)

valid_label_toTest = encoder.transform(val['document_type'])
valid_label = to_categorical(valid_label_toTest, num_classes=num_classes)

test_label_toTest = encoder.transform(test_data['document_type'])
test_label = to_categorical(test_label_toTest, num_classes=num_classes)

# Make sure the model inputs are plain numpy arrays.
X_train = np.array(X_train)
X_val = np.array(X_val)
X_test = np.array(X_test)

In [16]:
# Model outputs for the test inputs; argmax picks the highest-scoring class per row.
test_predict_1 = model.predict(X_test, verbose=1)
pred_1 = np.argmax(test_predict_1, axis=1)

# Display names used in the classification report.
# NOTE(review): these must line up with the integer class ids; confirm against encoder.classes_.
target_names = [
    'acordao_de_2_instancia',
    'agravo_em_recurso_extraordinario',
    'despacho_de_admissibilidade',
    'outros',
    'peticao_do_RE',
    'sentenca',
]



In [17]:
# Per-class precision / recall / F1 for the test predictions, printed with 4 decimal digits.
print(classification_report(test_label_toTest, pred_1, target_names=target_names, digits=4))

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.9132    0.8864    0.8996       273
agravo_em_recurso_extraordinario     0.7114    0.4579    0.5572      1841
     despacho_de_admissibilidade     0.7535    0.5404    0.6294       198
                          outros     0.9651    0.9813    0.9731     85408
                   peticao_do_RE     0.7804    0.7329    0.7559      6331
                        sentenca     0.9191    0.7166    0.8053      1475

                        accuracy                         0.9494     95526
                       macro avg     0.8405    0.7193    0.7701     95526
                    weighted avg     0.9467    0.9494    0.9472     95526



In [18]:
# Print the layer-by-layer structure of the loaded model.
model.summary()

Model: "functional_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 500)]        0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 500, 100)     7000000     input_4[0][0]                    
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 500, 256)     77056       embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_10 (Conv1D)              (None, 500, 256)     102656      embedding_3[0][0]                
_______________________________________________________________________________________

In [19]:
# Output tensor of the fourth layer from the end of the model; the recorded
# cell output shows it as 'flatten_3/Reshape:0' with shape (None, 3840).
activation_output = model.layers[-4].output
activation_output

<tf.Tensor 'flatten_3/Reshape:0' shape=(None, 3840) dtype=float32>

In [20]:
# Input tensor of the loaded model, kept so a sub-model can be built from it.
inputs = model.input
inputs

<tf.Tensor 'input_4:0' shape=(None, 500) dtype=int32>

In [21]:
# Sub-model that goes from the original input to the selected intermediate output.
activation_model = tf.keras.Model(inputs=inputs, outputs=activation_output)

In [22]:
# Run the activation model over the training inputs.
train_vecs = activation_model.predict(X_train, verbose=1)




In [23]:
# Run the activation model over the validation inputs.
val_vecs = activation_model.predict(X_val, verbose=1)



In [24]:
# Run the activation model over the test inputs.
test_vecs = activation_model.predict(X_test, verbose=1)



In [25]:
# Display the shapes of the three activation arrays.
train_vecs.shape, val_vecs.shape, test_vecs.shape

((149217, 3840), (94735, 3840), (95526, 3840))

In [28]:
# Write one .npy file per training row holding that row's activation vector,
# and collect (in row order) the path that was passed to np.save.
train_files = []
created_dirs = set()
# Rows are walked by POSITION (enumerate) rather than by index label, so the
# lookup into train_vecs stays aligned even if `train` has a non-default index.
for idx, (doc_type, file_name, pages) in enumerate(
        zip(train["document_type"], train["file_name"], train["pages"])):
    # <document_type>/<file_name without its suffix>_<pages>
    filename = (Path("./activations/text/train")/Path(doc_type)/
               (Path(file_name).with_suffix("").as_posix()+"_"+str(pages)))
    # Create each target directory only the first time it is needed.
    if filename.parent not in created_dirs:
        filename.parent.mkdir(parents=True, exist_ok=True)
        created_dirs.add(filename.parent)
    # np.save appends ".npy" when the name lacks it, so the file on disk is
    # `filename` + ".npy" while train_files keeps the path without extension.
    np.save(filename, train_vecs[idx])
    train_files.append(filename)
    print(f"Saving example {idx+1}", end='\r', flush=True)

Saving example 149217

In [29]:
# Store the collected per-row file paths in a new `activation_path` column (mutates `train` in place).
train["activation_path"] = train_files

In [38]:
# Spot check: the array stored in this file must equal row 3 of train_vecs.
# NOTE(review): the file name is hard-coded; presumably it is the document at row 3 of `train` — confirm.
assert (np.load("./activations/text/train/outros/AI_856934_1926211_34_17072013_3.npy") == train_vecs[3]).all()

In [30]:
# Write one .npy file per validation row holding that row's activation vector,
# and collect (in row order) the path that was passed to np.save.
val_files = []
created_dirs = set()
# Rows are walked by POSITION (enumerate) rather than by index label, so the
# lookup into val_vecs stays aligned even if `val` has a non-default index.
for idx, (doc_type, file_name, pages) in enumerate(
        zip(val["document_type"], val["file_name"], val["pages"])):
    # <document_type>/<file_name without its suffix>_<pages>
    filename = (Path("./activations/text/val")/Path(doc_type)/
               (Path(file_name).with_suffix("").as_posix()+"_"+str(pages)))
    # Create each target directory only the first time it is needed.
    if filename.parent not in created_dirs:
        filename.parent.mkdir(parents=True, exist_ok=True)
        created_dirs.add(filename.parent)
    # np.save appends ".npy" when the name lacks it, so the file on disk is
    # `filename` + ".npy" while val_files keeps the path without extension.
    np.save(filename, val_vecs[idx])
    val_files.append(filename)
    print(f"Saving example {idx+1}", end='\r', flush=True)

Saving example 94735

In [31]:
# Store the collected per-row file paths in a new `activation_path` column (mutates `val` in place).
val["activation_path"] = val_files

In [32]:
# Collect (in row order) the activation file path of every test row, and make
# sure the file actually exists. The save step used to be commented out, so on
# a fresh run the paths pointed at files this notebook never wrote. Files that
# are already on disk are left untouched; only missing ones are written.
test_files = []
created_dirs = set()
# Rows are walked by POSITION (enumerate) rather than by index label, so the
# lookup into test_vecs stays aligned even if `test_data` has a non-default index.
for idx, (doc_type, file_name, pages) in enumerate(
        zip(test_data["document_type"], test_data["file_name"], test_data["pages"])):
    # <document_type>/<file_name without its suffix>_<pages>
    filename = (Path("./activations/text/test")/Path(doc_type)/
               (Path(file_name).with_suffix("").as_posix()+"_"+str(pages)))
    # np.save appends ".npy" when the name lacks it, so that is the file to look for.
    saved_file = Path(filename.as_posix() + ".npy")
    if not saved_file.exists():
        # Create each target directory only the first time it is needed.
        if filename.parent not in created_dirs:
            filename.parent.mkdir(parents=True, exist_ok=True)
            created_dirs.add(filename.parent)
        np.save(filename, test_vecs[idx])
    # As in the other splits, the recorded path carries no ".npy" extension.
    test_files.append(filename)
    print(f"Saving example {idx+1}", end='\r', flush=True)

Saving example 95526

In [33]:
# Store the collected per-row file paths in a new `activation_path` column (mutates `test_data` in place).
test_data["activation_path"] = test_files

In [40]:
# Write the training frame, now including `activation_path`, to CSV without the index column.
train.to_csv("./data/train_fusion.csv", index= False)

In [41]:
# Write the validation frame, now including `activation_path`, to CSV without the index column.
val.to_csv("./data/val_fusion.csv", index=False)

In [42]:
# Write the test frame, now including `activation_path`, to CSV without the index column.
test_data.to_csv("./data/test_fusion.csv", index=False)