In [2]:
%load_ext autoreload
%autoreload 2

#from fastai.text.all import *
import tensorflow as tf
from tensorflow.keras.models import model_from_json
from tensorflow.keras.layers import Input
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from os.path import join, split, splitext
from pathlib import Path

import pandas as pd
import pickle

import tqdm

In [3]:
seed = 42

# python RNG
import random
random.seed(seed)

# pytorch RNGs
# import torch
# torch.manual_seed(seed)
# torch.backends.cudnn.deterministic = True
# if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

# numpy RNG
import numpy as np
np.random.seed(seed)

# tensorflow RNG
tf.random.set_seed(seed)

In [4]:
SEQUENCE_LEN = 500 # Size of input arrays

In [5]:
models_path = Path("./models/")
weights_path = models_path/"stf_no_weights.keras"
json_path = models_path/"cnn_text.json"
tokenizer_path = models_path/"tokenizer.pickle"

In [6]:
json_file = open(json_path,'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)

2021-09-21 20:33:14.013707: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-21 20:33:14.559396: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2021-09-21 20:33:14.559435: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1835] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [7]:
model.load_weights(weights_path)

In [8]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [9]:
data_path = Path("/mnt/nas/databases/Tobacco800/unziped/")

In [12]:
train = pd.read_csv(data_path/"train.csv", delimiter=';',  usecols=['binder','docid','class', 'text'])
train.rename(columns={'text':'body'}, inplace=True)
print(train.shape)
train.dropna(inplace=True)
print(train.shape)

(1031, 4)
(1019, 4)


In [13]:
val = train.iloc[-200:,:]; print(val.shape); val.head()

(200, 4)


Unnamed: 0,binder,docid,class,body
830,Tobacco800,pkc56d00,FirstPage,".>->aa, Mailand 20014 / (301) 654-3400 ..."
831,Tobacco800,pkj90c00,FirstPage,AMERICAN '93 THE NEETROnilER FIELD SALES INFOR...
832,Tobacco800,ply60e00,FirstPage,LORILLARD INC. • ONE PARK AVENUE. NEW YORK. N....
833,Tobacco800,pmx82f00-page04_1,FirstPage,4 4 -/ NEWELL W. ELLISON H. THOMAS AUSTERN ...
834,Tobacco800,pmx82f00-page04_2,NextPage,COVINGTON & BURLING CONF...


In [14]:
train = train.iloc[:-200,:]; print(train.shape);train.head()

(819, 4)


Unnamed: 0,binder,docid,class,body
0,Tobacco800,aah97e00-page02_1,FirstPage,"Dr. M.A. Manzelli, PHILIP MORRIS INC., Researc..."
1,Tobacco800,aah97e00-page02_2,NextPage,- 2 - Please let me krow if you have any chang...
2,Tobacco800,aam09c00,FirstPage,I NOIJ-04-97 13 = 25 FROM = I D : PAGE 10/1...
3,Tobacco800,aao54e00_1,FirstPage,i PHILIP .MORRIS INCORPORATED 120 PARK...
4,Tobacco800,aao54e00_2,NextPage,"In the meantime, I hope you and your friends a..."


In [19]:
test_data = pd.read_csv(data_path/"test.csv", delimiter=';',  usecols=['binder','docid','class', 'text'])
test_data.rename(columns={'text':'body'}, inplace=True)
print(test_data.shape)
test_data.head()
test_data.dropna(inplace=True)

(259, 4)


In [17]:
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle, encoding="utf-8")

In [20]:
sequences_train = tokenizer.texts_to_sequences(train['body'])
sequences_validation = tokenizer.texts_to_sequences(val['body'])
sequences_test = tokenizer.texts_to_sequences(test_data['body'])

In [21]:
X_train = sequence.pad_sequences(sequences_train, maxlen=SEQUENCE_LEN, padding='post')
X_val = sequence.pad_sequences(sequences_validation, maxlen=SEQUENCE_LEN, padding='post')
X_test = sequence.pad_sequences(sequences_test, maxlen=SEQUENCE_LEN, padding='post')

In [22]:
encoder = LabelEncoder()

In [23]:
train_label = train['class'] 
train_label_toTest = encoder.fit_transform(train_label)
train_label = np.transpose(train_label_toTest)
train_label = to_categorical(train_label)


valid_label = val['class'] 
valid_label_toTest = encoder.fit_transform(valid_label)
valid_label = np.transpose(valid_label_toTest)
valid_label = to_categorical(valid_label)

test_label = test_data['class'] 
test_label_toTest = encoder.fit_transform(test_label)
test_label = np.transpose(test_label_toTest)
test_label = to_categorical(test_label)

X_train = np.array(X_train)
X_val = np.array(X_val)
X_test = np.array(X_test)

In [24]:
test_predict_1 = model.predict(X_test, verbose=1)
pred_1 = test_predict_1.argmax(axis=1)
target_names = ['SD','ND']

2021-09-21 20:55:16.535696: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)




In [25]:
print(classification_report(test_label_toTest, pred_1, target_names=target_names, digits=4))

              precision    recall  f1-score   support

          SD     0.8354    0.9133    0.8726       150
          ND     0.8632    0.7523    0.8039       109

    accuracy                         0.8456       259
   macro avg     0.8493    0.8328    0.8383       259
weighted avg     0.8471    0.8456    0.8437       259



In [26]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 500)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 500, 100)     7000000     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 500, 256)     77056       embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 500, 256)     102656      embedding[0][0]                  
______________________________________________________________________________________________

In [27]:
activation_output = model.layers[-4].output; activation_output

<KerasTensor: shape=(None, 3840) dtype=float32 (created by layer 'flatten')>

In [28]:
inputs = model.input; inputs

<KerasTensor: shape=(None, 500) dtype=int32 (created by layer 'input_1')>

In [29]:
activation_model = tf.keras.Model(inputs, activation_output)

In [30]:
train_vecs = activation_model.predict(X_train, verbose=1)




In [31]:
val_vecs = activation_model.predict(X_val, verbose=1)



In [32]:
test_vecs = activation_model.predict(X_test, verbose=1)



In [33]:
train_vecs.shape, val_vecs.shape, test_vecs.shape

((819, 3840), (200, 3840), (259, 3840))

In [28]:
train_files = []
for idx, item in train.iterrows():
    filename = (Path("./activations/text/train")/Path(item["class"])/
               (Path(item["file_name"]).with_suffix("").as_posix()+"_"+str(item["pages"])))
    filename.parent.mkdir(parents=True, exist_ok=True)
    np.save(filename, train_vecs[idx])
    train_files.append(filename)
    print(f"Saving example {idx+1}", end='\r', flush=True)

Saving example 149217

In [29]:
train["activation_path"] = train_files

In [38]:
assert (np.load("./activations/text/train/outros/AI_856934_1926211_34_17072013_3.npy") == train_vecs[3]).all()

In [30]:
val_files = []
for idx, item in val.iterrows():
    filename = (Path("./activations/text/val")/Path(item["document_type"])/
               (Path(item["file_name"]).with_suffix("").as_posix()+"_"+str(item["pages"])))
    filename.parent.mkdir(parents=True, exist_ok=True)
    np.save(filename, val_vecs[idx])
    val_files.append(filename)
    print(f"Saving example {idx+1}", end='\r', flush=True)

Saving example 94735

In [31]:
val["activation_path"] = val_files

In [36]:
test_files = []
for idx, item in test_data.iterrows():
    filename = (Path("./activations/text/test")/Path(item["class"])/
               (Path(item["docid"]).with_suffix("").as_posix()))
#     filename.parent.mkdir(parents=True, exist_ok=True)
#     np.save(filename, test_vecs[idx])
    test_files.append(filename)
    print(f"Saving example {idx+1}", end='\r', flush=True)

Saving example 259

In [37]:
test_data["activation_path"] = test_files

In [40]:
train.to_csv("./data/train_fusion.csv", index= False)

In [41]:
val.to_csv("./data/val_fusion.csv", index=False)

In [42]:
test_data.to_csv("./data/test_fusion.csv", index=False)