In [1]:
%load_ext autoreload
%autoreload 2

from fastai.text.all import *
from fastai.vision.all import *
import pandas as pd
import torch
from tqdm.notebook import tqdm
import tensorflow as tf
from tensorflow.keras.models import model_from_json
from tensorflow.keras.layers import Input
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from os.path import join, split, splitext

from utils import get_dls

In [2]:
seed = 42

# python RNG
import random
random.seed(seed)

# pytorch RNGs
import torch
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

# numpy RNG
import numpy as np
np.random.seed(seed)

# tensorflow RNG
tf.random.set_seed(seed)

## Text Evaluation

In [3]:
SEQUENCE_LEN = 500 # Size of input arrays

In [4]:
models_path = Path("./models/")
weights_path = models_path/"stf_no_weights.keras"
json_path = models_path/"cnn_text.json"
tokenizer_path = models_path/"tokenizer.pickle"

In [5]:
json_file = open(json_path,'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)

In [6]:
model.load_weights(weights_path)

In [7]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [8]:
data_path = Path("/mnt/nas/backups/08-07-2020/desktopg01/lisa/Data/CSV")

In [9]:
test_data = pd.read_csv(data_path/"test_small.csv")

In [None]:
test_data_first = test_data[test_data["pages"] == 1]

In [None]:
test_data_not_first = test_data[test_data["pages"] != 1]

In [None]:
len(test_data_first), len(test_data_not_first)

In [15]:
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle, encoding="utf-8")

In [16]:
sequences_test_first = tokenizer.texts_to_sequences(test_data_first['body'])
sequences_test_not_first = tokenizer.texts_to_sequences(test_data_not_first['body'])

In [17]:
X_test_first = sequence.pad_sequences(sequences_test_first, maxlen=SEQUENCE_LEN, padding='post')
X_test_not_first = sequence.pad_sequences(sequences_test_not_first, maxlen=SEQUENCE_LEN, padding='post')

In [18]:
encoder = LabelEncoder()

In [19]:
test_label_first = test_data_first['document_type'] 
test_label_toTest_first = encoder.fit_transform(test_label_first)
test_label_first = np.transpose(test_label_toTest_first)
test_label_first = to_categorical(test_label_first)

test_label_not_first = test_data_not_first['document_type'] 
test_label_toTest_not_first = encoder.fit_transform(test_label_not_first)
test_label_not_first = np.transpose(test_label_toTest_not_first)
test_label_not_first = to_categorical(test_label_not_first)

X_test_first = np.array(X_test_first)
X_test_not_first = np.array(X_test_not_first)

In [20]:
test_predict_1 = model.predict(X_test_first, verbose=1)
pred_1 = test_predict_1.argmax(axis=1)
target_names = ['acordao_de_2_instancia','agravo_em_recurso_extraordinario', 'despacho_de_admissibilidade', 'outros', 'peticao_do_RE', 'sentenca']



In [21]:
print(classification_report(test_label_toTest_first, pred_1, target_names=target_names, digits=4))

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.9570    0.8945    0.9247       199
agravo_em_recurso_extraordinario     0.6378    0.3803    0.4765       213
     despacho_de_admissibilidade     0.8889    0.5986    0.7154       147
                          outros     0.9860    0.9944    0.9902     25744
                   peticao_do_RE     0.7885    0.7051    0.7445       312
                        sentenca     0.8850    0.7547    0.8147       265

                        accuracy                         0.9809     26880
                       macro avg     0.8572    0.7213    0.7777     26880
                    weighted avg     0.9792    0.9809    0.9796     26880



In [22]:
test_predict_1 = model.predict(X_test_not_first, verbose=1)
pred_1 = test_predict_1.argmax(axis=1)
target_names = ['acordao_de_2_instancia','agravo_em_recurso_extraordinario', 'despacho_de_admissibilidade', 'outros', 'peticao_do_RE', 'sentenca']



In [23]:
print(classification_report(test_label_toTest_not_first, pred_1, target_names=target_names, digits=4))

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.8101    0.8649    0.8366        74
agravo_em_recurso_extraordinario     0.7202    0.4681    0.5674      1628
     despacho_de_admissibilidade     0.4419    0.3725    0.4043        51
                          outros     0.9562    0.9756    0.9658     59664
                   peticao_do_RE     0.7800    0.7343    0.7565      6019
                        sentenca     0.9275    0.7083    0.8032      1210

                        accuracy                         0.9371     68646
                       macro avg     0.7726    0.6873    0.7223     68646
                    weighted avg     0.9341    0.9371    0.9346     68646



## Evaluate images

In [3]:
path = Path("/mnt/nas/backups/08-07-2020/desktopg01/lisa/Data/small_flow")

In [4]:
dls = get_dls(path, 64, 224)

In [5]:
test_items = get_image_files(path, folders="test")

In [6]:
def get_page(file): return "1" == file.name.split("_")[-1].split(".")[0]

In [7]:
test_items_first = [x for x in test_items if get_page(x)]

In [8]:
test_items_not_first = [x for x in test_items if not get_page(x)]

In [9]:
test_dl_first = dls.test_dl(test_items_first, with_labels=True)
test_dl_not_first = dls.test_dl(test_items_not_first, with_labels=True)

In [10]:
learn = cnn_learner(dls, resnet50, loss_func=CrossEntropyLossFlat())

In [59]:
learn.load("best_image_weights_224")

<fastai.learner.Learner at 0x7f387c422250>

In [60]:
preds, targets = learn.get_preds(dl=test_dl_first)
preds = np.argmax(preds, axis=1)
print(classification_report(targets, preds, target_names=dls.vocab, digits=4))

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.2089    0.9543    0.3428       197
agravo_em_recurso_extraordinario     0.0350    0.7882    0.0671       203
     despacho_de_admissibilidade     0.0689    0.7260    0.1259       146
                          outros     0.9947    0.6441    0.7819     24193
                   peticao_do_RE     0.1098    0.5449    0.1828       301
                        sentenca     0.1637    0.7099    0.2661       262

                        accuracy                         0.6477     25302
                       macro avg     0.2635    0.7279    0.2944     25302
                    weighted avg     0.9564    0.6477    0.7565     25302



In [61]:
preds, targets = learn.get_preds(dl=test_dl_not_first)
preds = np.argmax(preds, axis=1)
print(classification_report(targets, preds, target_names=dls.vocab, digits=4))

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.0434    0.7955    0.0824        88
agravo_em_recurso_extraordinario     0.0659    0.7399    0.1210      2334
     despacho_de_admissibilidade     0.0191    0.5192    0.0368        52
                          outros     0.9754    0.3761    0.5429     63709
                   peticao_do_RE     0.2273    0.6525    0.3372      5876
                        sentenca     0.3645    0.7812    0.4971      1216

                        accuracy                         0.4172     73275
                       macro avg     0.2826    0.6441    0.2696     73275
                    weighted avg     0.8745    0.4172    0.5113     73275



In [11]:
learn.load("/img_model_no_weights/best_image_no_weights_224")

FileNotFoundError: [Errno 2] No such file or directory: '/img_model_no_weights/best_image_no_weights_224.pth'

In [None]:
preds, targets = learn.get_preds(dl=test_dl_first)
preds = np.argmax(preds, axis=1)
print(classification_report(targets, preds, target_names=dls.vocab, digits=4))

In [None]:
preds, targets = learn.get_preds(dl=test_dl_not_first)
preds = np.argmax(preds, axis=1)
print(classification_report(targets, preds, target_names=dls.vocab, digits=4))

## Evaluate Fusion

In [3]:
OUT_DIM=6

In [4]:
class GetActs(Transform):
    def encodes(self, x):        
        img_file = text_file = None
        
        if x["has_text"]:
            text_file = Path(x["activation_path"] + ".npy")
            if x["has_image"]:
                img_file = Path(text_file.as_posix().replace("text", "img").replace("npy", "pt"))
        else:
            img_file = Path(x["activation_path"] + ".pt")
        
        if img_file is None:
            img_act = torch.zeros((4096))
        else:
            img_act = torch.load(img_file)
                            
        if text_file is None:
            text_act = torch.zeros((3840))
            text_none = True
        else:
            text_act = tensor(np.load(text_file))
        
        img_none = img_file == None
        text_none = text_file == None
                            
        return (img_act, text_act, img_none, text_none)

In [5]:
class ImgTextFusion(Module):
    def __init__(self, head, embs_for_none=True, img_emb_dim=4096, text_emb_dim=3840):
        self.head = head
        self.embs_for_none = embs_for_none
        if embs_for_none:
            self.img_none_emb = torch.nn.Embedding(num_embeddings=1, embedding_dim=img_emb_dim)
            self.text_none_emb = torch.nn.Embedding(num_embeddings=1, embedding_dim=text_emb_dim)
            self.index= tensor(0)
    
    def forward(self, x):
        img_act, text_act, img_none, text_none = x
        if self.embs_for_none:
            img_act[img_none] = self.img_none_emb(self.index)
            text_act[text_none] = self.text_none_emb(self.index)
        return self.head(torch.cat([img_act, text_act], axis=-1))

In [6]:
def create_head(nf, n_out, lin_ftrs=None, ps=0.5, bn_final=False, lin_first=False):
    "Model head that takes `nf` features, runs through `lin_ftrs`, and out `n_out` classes."
    lin_ftrs = [nf, 512, n_out] if lin_ftrs is None else [nf] + lin_ftrs + [n_out]
    ps = L(ps)
    if len(ps) == 1: ps = [ps[0]/2] * (len(lin_ftrs)-2) + ps
    actns = [nn.ReLU(inplace=True)] * (len(lin_ftrs)-2) + [None]
    layers = []
    if lin_first: layers.append(nn.Dropout(ps.pop(0)))
    for ni,no,p,actn in zip(lin_ftrs[:-1], lin_ftrs[1:], ps, actns):
        layers += LinBnDrop(ni, no, bn=True, p=p, act=actn, lin_first=lin_first)
    if lin_first: layers.append(nn.Linear(lin_ftrs[-2], n_out))
    if bn_final: layers.append(nn.BatchNorm1d(lin_ftrs[-1], momentum=0.01))
    return nn.Sequential(*layers)

In [7]:
dls = torch.load("./data/fusion_dl_v2.pth").to("cpu")

In [29]:
test_dl = torch.load("./data/test_dl_fusion.pth").to("cpu")

In [30]:
head = create_head(4096 + 3840, OUT_DIM, lin_ftrs=[128])

In [31]:
model = ImgTextFusion(head)

In [32]:
learn = Learner(dls, model)

In [33]:
learn.load("best_fusion_128_moreEpochs", device="cpu")

<fastai.learner.Learner at 0x7f44fc619160>

In [34]:
preds, targets = learn.get_preds(dl=test_dl)
preds = np.argmax(preds, axis=1)

In [35]:
first_idxs = (test_dl.items["pages"] == 1).tolist()

In [36]:
not_first_idxs = (test_dl.items["pages"] != 1).tolist()

In [37]:
print(classification_report(targets[first_idxs], preds[first_idxs], target_names=dls.vocab, digits=4))

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.9436    0.9246    0.9340       199
agravo_em_recurso_extraordinario     0.5981    0.6009    0.5995       213
     despacho_de_admissibilidade     0.8304    0.6327    0.7181       147
                          outros     0.9889    0.9920    0.9904     25744
                   peticao_do_RE     0.7923    0.7212    0.7550       312
                        sentenca     0.8560    0.8075    0.8311       265

                        accuracy                         0.9815     26880
                       macro avg     0.8349    0.7798    0.8047     26880
                    weighted avg     0.9810    0.9815    0.9811     26880



In [38]:
print(classification_report(targets[not_first_idxs], preds[not_first_idxs], target_names=dls.vocab, digits=4))

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.7952    0.7500    0.7719        88
agravo_em_recurso_extraordinario     0.6250    0.5119    0.5628      2442
     despacho_de_admissibilidade     0.4865    0.3462    0.4045        52
                          outros     0.9566    0.9688    0.9626     66789
                   peticao_do_RE     0.7614    0.7395    0.7503      6074
                        sentenca     0.8941    0.7367    0.8078      1238

                        accuracy                         0.9317     76683
                       macro avg     0.7531    0.6755    0.7100     76683
                    weighted avg     0.9290    0.9317    0.9300     76683

