In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import gensim
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm

from collections import OrderedDict
from torch import nn, optim
from torchtext.data import utils
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [3]:
from dataset import AbstractDataset
from util import load_model_and_opt, save_model, batch_predict
from cnn_util import CNNBase3, train_model

In [4]:
DEVICE = (f'cuda:0' if torch.cuda.is_available() else 'cpu')
HOME = '/home/hice1/khom9/CSE 8803 BMI Final Project'
EMBED_KEYS_PATH = f'{HOME}/wordvectors/abstracts200_normalized.wordvectors'

In [None]:
# Type of CNN (1, 2, or 3); refer to cnn_util.py for more info
VERSION = 3

In [5]:
print(f'Using device {DEVICE}')
if torch.cuda.is_available():
    print(torch.cuda.get_device_name())

Using device cuda:0
Tesla V100-PCIE-32GB


In [6]:
# Only run once
'''
from nltk.stem import WordNetLemmatizer
import nltk
from gensim.models import Word2Vec

df = pd.read_csv('CleanedAVdata.csv')
nltk.download('wordnet')
tk = utils.get_tokenizer('spacy')
null_word = '\0'
lemma = WordNetLemmatizer()
abstracts = df['Abstract']
embed_dim = 200
tokens = pd.Series([[lemma.lemmatize(w) for w in tk(abst)] for abst in abstracts])
model = Word2Vec(sentences=tokens, vector_size=embed_dim, window=5, min_count=1, workers=12)

mu = np.mean(model.wv.vectors)
sigma = np.sqrt(np.var(model.wv.vectors))
model.wv.vectors = (1 + (np.clip(model.wv.vectors, mu-3*sigma, mu+3*sigma) - mu) / (3*sigma)) / 2
model.wv.vectors = np.clip(model.wv.vectors, 0, 1)

model.wv[null_word] = np.zeros(embed_dim)
model.wv.save(f'{HOME}/wordvectors/abstracts{embed_dim}_normalized.wordvectors')
'''

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/hice1/khom9/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
tk = utils.get_tokenizer('spacy')
wv = gensim.models.KeyedVectors.load(EMBED_KEYS_PATH, mmap='r')
null_word = '\0'
d = AbstractDataset(f'{HOME}/CleanedAVdata.csv', 'Abstract', 'IPCR Classifications', tk, wv.key_to_index,
                    null_word=null_word, min_len=30, verbose=True)

100%|██████████| 23250/23250 [00:20<00:00, 1139.42it/s]


In [8]:
# model = CNNBase2(EMBED_KEYS_PATH, null_word=null_word).to(DEVICE)
# loader = DataLoader(d, batch_size=31, shuffle=True)
# txt, label = next(iter(loader))
# txt = txt.to(DEVICE)
# model(txt.squeeze())

In [9]:
batch_size = 48
lr = 1e-4

save_path = f'{HOME}/models/cnn_model-{VERSION}.pth'
act_path = f'{HOME}/models/cnn_model-{VERSION}-max-activations.pkl'
wv_out_path = f'{HOME}/wordvectors/abstracts200_trained_normalized_{VERSION}.wordvectors'

model = CNNBase3(EMBED_KEYS_PATH, null_word=null_word).to(DEVICE)

num_pos = d.labels.sum(axis=0, keepdim=True).to_dense()
pos_weight = (d.labels.shape[0] - num_pos) / num_pos
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(DEVICE))
optimizer = optim.NAdam(model.parameters(), lr=lr)

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.5)

# model, optimizer = load_model_and_opt(model, optimizer, save_path)
# for param_group in optimizer.param_groups:
#         param_group['lr'] = lr

In [11]:
epochs = 350
train_model(model, optimizer, d, loss_fn, epochs=epochs, batch_size=batch_size, save_freq=25, 
            save_path=save_path, scheduler=scheduler, device=DEVICE)

Learning rate: 0.0001
Training for 25 epochs, with batch size=48
Using device: cuda:0
Saving model every 25 epochs to /home/hice1/khom9/CSE 8803 BMI Final Project/models/cnn_model-999.pth

-----Epoch 1/25-----
Batch 150/485, loss: 0.28314600254098576 (0.987s)
Batch 300/485, loss: 0.3028059339523315 (0.988s)
Batch 450/485, loss: 0.2838246899843216 (0.991s)
Batch 485/485, loss: 0.26886808446475435 (0.230s)
F1 score: 0.5004786081969066

-----Epoch 2/25-----
Batch 150/485, loss: 0.2522989002863566 (0.989s)
Batch 300/485, loss: 0.28299306213855746 (0.986s)
Batch 450/485, loss: 0.30251906712849935 (0.987s)
Batch 485/485, loss: 0.2942827011857714 (0.230s)
F1 score: 0.5041616204133762

-----Epoch 3/25-----
Batch 150/485, loss: 0.262884119451046 (0.992s)
Batch 300/485, loss: 0.2576737888654073 (0.991s)
Batch 450/485, loss: 0.24624839713176092 (0.995s)
Batch 485/485, loss: 0.24074789094073432 (0.232s)
F1 score: 0.5086417264595164

-----Epoch 4/25-----
Batch 150/485, loss: 0.2388402067621549 (0.9

In [12]:
# save_model(save_path, model, optimizer, epochs)
# print(f'Saved to {save_path}')

In [13]:
i = 8721 #18622
txt, label = d[i]
label = label.unsqueeze(0)
print(loss_fn(model(txt.to(DEVICE)).detach(), label.to(DEVICE)).item())
print(torch.cat([model(txt.to(DEVICE)).detach(), label.to(DEVICE)]).T)


0.39733967185020447
tensor([[ 2.2486e-01,  0.0000e+00],
        [-7.9525e+01,  0.0000e+00],
        [-1.6868e+01,  0.0000e+00],
        [-6.4548e+01,  0.0000e+00],
        [-7.1431e+01,  0.0000e+00],
        [-4.2384e+01,  0.0000e+00],
        [-6.1352e+01,  0.0000e+00],
        [ 1.2967e+00,  0.0000e+00],
        [-1.6110e-01,  0.0000e+00],
        [-1.6940e+01,  0.0000e+00],
        [-1.0913e+00,  0.0000e+00],
        [-2.4919e+01,  0.0000e+00],
        [-8.1480e+01,  0.0000e+00],
        [-1.3655e+01,  0.0000e+00],
        [-5.9665e+01,  0.0000e+00],
        [-8.3912e+01,  0.0000e+00],
        [-9.3107e+00,  0.0000e+00],
        [-5.7356e+01,  0.0000e+00],
        [-8.8922e+01,  0.0000e+00],
        [-2.9137e+01,  0.0000e+00],
        [-1.9960e-01,  0.0000e+00],
        [-9.2277e+01,  0.0000e+00],
        [-8.0748e+01,  0.0000e+00],
        [-7.3573e+01,  0.0000e+00],
        [-5.6682e+01,  0.0000e+00],
        [-9.8003e+01,  0.0000e+00],
        [-4.9442e+01,  0.0000e+00],
        

In [14]:
pred = (torch.sigmoid(batch_predict(model, d.abst_data, device=DEVICE).detach().cpu()) > 0.5).type(torch.float)
true = d.labels.to_dense()

In [15]:
total_loss = []
loss_fn_cpu = loss_fn.cpu()
for i in range(len(d)):
    total_loss.append(loss_fn_cpu(pred[i].unsqueeze(0), true[i].unsqueeze(0)).item())
    
print(f'Total avg loss: {np.mean(total_loss)}')

Total avg loss: 1.0110183169431584


In [16]:
x = pd.Series(total_loss)
x.sort_values(ascending=False).head(15)

11919    98.782166
130      89.474243
2267     88.225616
22573    85.160156
18414    85.030014
17995    84.579231
18534    82.953407
18548    81.760132
21263    81.263100
18622    81.229805
17761    81.229156
18642    81.198997
20998    44.929916
11239    44.679024
18483    41.625961
dtype: float64

In [17]:
print(precision_score(true, pred, average=None))
print(f'Total precision: {precision_score(true, pred, average="weighted")}')

[0.14389234 0.06666667 0.11538462 0.09090909 0.09375    0.04166667
 0.06666667 0.22026144 0.11806256 0.07428571 0.11642157 0.09813084
 0.06976744 0.10714286 0.10638298 0.05       0.08112324 0.11538462
 0.06666667 0.12539185 0.16653061 0.1        0.06451613 0.0625
 0.1182266  0.05263158 0.0990991  0.0942029  0.07142857 0.06666667
 0.75678195 0.08536585 0.22612011 0.07333333 0.11928105 0.11532385
 0.09731877 0.11827957 0.06451613 0.08474576 0.04651163 0.28571429
 0.05747126 0.0625     0.0625     0.09574468 0.08333333 0.07142857
 0.07142857 0.07142857 0.04545455 0.08333333 0.06989247 0.06929134
 0.07485605 0.05882353 0.06843575 0.09134045 0.08333333 0.06825939
 0.12142857 0.1147086  0.09302326 0.10240964 0.05645161 0.13632843
 0.09375    0.16717325 0.06962025 0.06859206 0.0952381  0.07692308
 0.06179775 0.0375     0.3327763  0.13735071 0.12098765 0.09259259
 0.52519073 0.59329473 0.17888199 0.28227571 0.12844037 0.13660377
 0.1375     0.09490085 0.14265233 0.16076611 0.191067   0.35893032

In [18]:
print(recall_score(true, pred, average=None))
print(f'Total recall: {recall_score(true, pred, average="weighted")}')

[0.97887324 1.         1.         1.         1.         1.
 1.         0.99410029 0.97771588 1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         0.99029126 1.         1.         1.
 1.         1.         1.         1.         1.         1.
 0.66969147 1.         0.82879106 1.         0.93890675 1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         0.99435028
 1.         1.         1.         1.         1.         1.
 1.         1.         0.8185725  0.99606299 1.         1.
 0.76232762 0.74304094 0.85714286 0.76677045 0.98054475 1.
 1.         1.         1.         0.99640288 1.         0.84008039
 1.        ]
Total recall: 0.7743511998563763


In [19]:
print(f1_score(true, pred, average='weighted'))

0.5577383648059518


In [20]:
print(f'Total accuracy: {accuracy_score(true, pred)}')

Total accuracy: 0.0636989247311828


In [21]:
wv_tuned = gensim.models.KeyedVectors.load(EMBED_KEYS_PATH, mmap='r')
wv_tuned.vectors = model.embedding.weight.data.detach().cpu().numpy()

wv_tuned.vectors = np.clip(wv_tuned.vectors, a_min=0, a_max=1.)

wv_tuned.save(wv_out_path)
print(f'Saved word embeddings to {wv_out_path}')

Saved word embeddings to /home/hice1/khom9/CSE 8803 BMI Final Project/wordvectors/abstracts200_trained_normalized_999.wordvectors


In [22]:
activations = []
model.eval()
with torch.no_grad():
    for txt, label in (d):
        outputs = list(model(txt.to(DEVICE), all_outputs_max=True))
        activations.append(outputs)
activations = torch.tensor(activations)

max_act = torch.max(activations, axis=0)[0]
max_act_dict = OrderedDict(zip(list(dict(model.named_modules()).keys())[1:], max_act))

output = open(act_path, 'wb')
pickle.dump(max_act_dict, output)
output.close()
print(f'Wrote max layer activations to {act_path}')

Wrote max layer activations to /home/hice1/khom9/CSE 8803 BMI Final Project/models/cnn_model-999-max-activations.pkl
