In [2]:
%load_ext autoreload
%autoreload 2

from fastai.text.all import *
from fastai.vision.all import *
import pandas as pd
import torch
from tqdm.notebook import tqdm

from utils import get_dls

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import tensorflow as tf
from tensorflow.keras.models import model_from_json
from tensorflow.keras.layers import Input
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from os.path import join, split, splitext

In [4]:
# Seed every RNG the notebook touches so that a fresh-kernel run is repeatable.
seed = 42

# python RNG
import random
random.seed(seed)

# pytorch RNGs
import torch
torch.manual_seed(seed)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
# Ask cuDNN for deterministic kernels. Benchmark mode has to be switched off as
# well: with it on, cuDNN may auto-select a different algorithm from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# numpy RNG
import numpy as np
np.random.seed(seed)

# tensorflow RNG
tf.random.set_seed(seed)

In [5]:
# Fixed length of each input sequence for the text model; passed as `maxlen`
# to `pad_sequences` when the test inputs are built.
SEQUENCE_LEN = 500 # Size of input arrays

In [6]:
# Locations of the text model's artifacts (weights, architecture JSON and
# pickled tokenizer), all resolved relative to the ./models/ directory.
models_path = Path("./models/")
weights_path = models_path.joinpath("stf_no_weights.keras")
json_path = models_path.joinpath("cnn_text.json")
tokenizer_path = models_path.joinpath("tokenizer.pickle")

In [7]:
# Rebuild the Keras architecture from its JSON description.
# The context manager closes the file even if reading it raises.
with open(json_path, 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)

In [8]:
# Load the saved parameters from `weights_path` into the architecture rebuilt from JSON.
model.load_weights(weights_path)

In [9]:
# Attach loss, optimizer and metric to the loaded model.
# NOTE(review): the visible cells only call `model.predict`, so the optimizer
# choice should not influence the results below — confirm no training/evaluate
# step is expected.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [10]:
# Directory the test CSV is read from.
# NOTE(review): machine-specific absolute path — the notebook will not run
# elsewhere unless this is made configurable.
data_path = Path("/mnt/nas/backups/08-07-2020/desktopg01/lisa/Data/CSV")

In [13]:
# Test split for the text model. Later cells read its `body`, `document_type`,
# `file_name` and `pages` columns.
test_data = pd.read_csv(data_path/"test_small.csv")

In [11]:
# Restore the tokenizer object stored at `tokenizer_path`.
# NOTE(review): unpickling can execute arbitrary code — only load tokenizer
# files that come from a trusted source.
with tokenizer_path.open('rb') as fh:
    tokenizer = pickle.load(fh, encoding="utf-8")

In [14]:
# Convert every document body into a sequence with the unpickled tokenizer
# (presumably integer token ids from a Keras-style Tokenizer — confirm).
sequences_test = tokenizer.texts_to_sequences(test_data['body'])

In [15]:
# Bring every sequence to SEQUENCE_LEN entries, adding padding at the end ('post').
X_test = sequence.pad_sequences(sequences_test, maxlen=SEQUENCE_LEN, padding='post')

In [16]:
# Encoder used to map the `document_type` strings to integer class ids.
encoder = LabelEncoder()

In [17]:
# Encode the ground-truth document types as integer class ids.
#
# The encoder is fitted on the fixed, known class list rather than on whatever
# labels happen to occur in this CSV: fitting on the test labels would shift the
# ids if a class were absent, silently misaligning them with the class order
# used when the report is printed. LabelEncoder sorts its classes, and this list
# is already in sorted order, so id i corresponds to class_names[i]. An
# unexpected label now raises a ValueError in `transform` instead of being
# assigned an id.
class_names = [
    'acordao_de_2_instancia',
    'agravo_em_recurso_extraordinario',
    'despacho_de_admissibilidade',
    'outros',
    'peticao_do_RE',
    'sentenca',
]
encoder.fit(class_names)
test_label_toTest = encoder.transform(test_data['document_type'])

# One-hot version of the labels; the width is pinned to the number of classes
# so it does not depend on the largest id present in the data.
test_label = to_categorical(test_label_toTest, num_classes=len(encoder.classes_))

# `asarray` avoids copying if X_test is already an ndarray.
X_test = np.asarray(X_test)

In [20]:
# Per-class scores from the text model, then the index of the highest score per row.
preds = model.predict(X_test, verbose=1)
preds_text = np.argmax(preds, axis=1)



In [21]:
# Sanity check: number of text-model predictions.
preds_text.shape

(95526,)

In [23]:
# Root directory of the image dataset used by the image model.
# NOTE(review): machine-specific absolute path — consider making it configurable.
path = Path("/mnt/nas/backups/08-07-2020/desktopg01/lisa/Data/small_flow")

In [24]:
# Collect the image files under `path`, restricted to the "test" folder.
test_items = get_image_files(path, folders="test")

In [38]:
# First test image, kept for ad-hoc inspection.
# NOTE(review): `t` is not referenced by any other visible cell — looks like
# leftover exploration; confirm before removing.
t = test_items[0]

In [37]:
# Build one key per row of the text test set: the file name without its last
# four characters, an underscore, then the page value.
# NOTE(review): dropping four characters presumably strips an extension such as
# ".pdf" — confirm every file name ends in a 4-character suffix.
text_files = set(
    (
        test_data["file_name"].str[:-4]
        + "_"
        + test_data["pages"].astype(str)
    ).values
)

In [46]:
# Keep only the images whose file stem has no matching key in `text_files`,
# i.e. pages that are not covered by the text test set.
test_items_filtered = [img for img in test_items if img.stem not in text_files]

In [47]:
# Number of images left for the image model after filtering.
len(test_items_filtered)

8037

In [53]:
# Build the image DataLoaders via the project helper in `utils`.
# NOTE(review): 64 and 224 are presumably batch size and image size — confirm
# against `utils.get_dls`.
dls = get_dls(path, 64, 224)

In [54]:
# Test DataLoader over the filtered images; `with_labels=True` requests the
# targets as well, which the evaluation below compares against.
test_dl = dls.test_dl(test_items_filtered, with_labels=True)

In [55]:
# ResNet-50 image learner on `dls` with a flattened cross-entropy loss.
# NOTE(review): newer fastai releases name this factory `vision_learner` —
# confirm against the installed version.
learn = cnn_learner(dls, resnet50, loss_func=CrossEntropyLossFlat())

In [56]:
# Load the saved weights named "best_image_weights_224" into the learner.
learn.load("best_image_weights_224")

<fastai.learner.Learner at 0x7ff20c53bac0>

In [64]:
# Run the image model over the test DataLoader; yields predictions and targets.
preds_img, labels_img = learn.get_preds(dl=test_dl)

In [65]:
# Collapse the per-class scores to one class id per image.
# Guarded on the number of dimensions so the cell is safe to re-run: without the
# guard a second execution would argmax the already-collapsed 1-D ids.
if preds_img.ndim > 1:
    preds_img = preds_img.argmax(dim=-1)
preds_img.shape

torch.Size([8037])

In [67]:
# Side-by-side look at the ground-truth ids of the text rows and of the images.
test_label_toTest, labels_img

(array([3, 3, 3, ..., 3, 3, 3]), TensorCategory([0, 0, 0,  ..., 5, 5, 5]))

In [69]:
# Side-by-side look at the predicted ids from the text model and the image model.
preds_text, preds_img

(array([3, 3, 3, ..., 3, 3, 3]), TensorImage([0, 0, 0,  ..., 0, 0, 0]))

In [78]:
# Join both models' predicted class ids into a single vector, text rows first.
# The tensor operand is converted to a NumPy array explicitly.
preds = np.concatenate((preds_text, np.asarray(preds_img)))

In [79]:
# Join the ground-truth class ids in the same order as the predictions:
# text rows first, then images. The tensor operand is converted explicitly.
labels = np.concatenate((test_label_toTest, np.asarray(labels_img)))

In [80]:
# Display names for the classes, listed in class-id order (entry i is shown for id i).
target_names = ['acordao_de_2_instancia','agravo_em_recurso_extraordinario', 'despacho_de_admissibilidade', 'outros', 'peticao_do_RE', 'sentenca']
# Pin the label set to 0..n-1 so each report row is tied to its class id
# explicitly. Without `labels=`, sklearn infers the label set from the data, and
# a class missing from both `labels` and `preds` would make it disagree with
# `target_names`.
print(
    classification_report(
        labels,
        preds,
        labels=list(range(len(target_names))),
        target_names=target_names,
        digits=4,
    )
)

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.2692    0.8920    0.4136       287
agravo_em_recurso_extraordinario     0.4408    0.5522    0.4902      2655
     despacho_de_admissibilidade     0.3543    0.5377    0.4271       199
                          outros     0.9655    0.9507    0.9580     92533
                   peticao_do_RE     0.7144    0.7278    0.7211      6386
                        sentenca     0.7800    0.7053    0.7407      1503

                        accuracy                         0.9222    103563
                       macro avg     0.5874    0.7276    0.6251    103563
                    weighted avg     0.9307    0.9222    0.9258    103563

