In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import json
import numpy as np
import pickle
from struct import unpack
from base64 import b64decode
from functools import partial
from keras.utils.np_utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc

import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
import torch.nn as nn

from imblearn.over_sampling import SMOTE

from common import models

Using TensorFlow backend.


In [3]:
IMG_LEN = 1024
TXT_LEN = 300
N_TOPICS = 50
N_WORTHINESSES = 2
BATCH_SIZE=2048

In [4]:
x_img, x_txt, y_topic, y_worthiness = pickle.load(open('unpacked_worth_topics.pickle', 'rb'))

In [5]:
x_img_train, x_img_test, x_txt_train, x_txt_test, y_topic_train, y_topic_test, y_worthiness_train, y_worthiness_test = train_test_split(
    x_img, 
    x_txt, 
    y_topic,
    y_worthiness,
    test_size=0.2, 
    random_state=42,
    stratify=y_topic
)

x_img_train, x_img_val, x_txt_train, x_txt_val, y_topic_train, y_topic_val, y_worthiness_train, y_worthiness_val = train_test_split(
    x_img_train,
    x_txt_train,
    y_topic_train,
    y_worthiness_train,
    test_size=0.2,
    random_state=42,
    stratify=y_topic_train
)

In [6]:
img_sscaler = StandardScaler()
img_sscaler.fit(x_img_train)

x_img_train = img_sscaler.transform(x_img_train)
x_img_val = img_sscaler.transform(x_img_val)
x_img_test = img_sscaler.transform(x_img_test)

txt_sscaler = StandardScaler()
txt_sscaler.fit(x_txt_train)

x_txt_train = txt_sscaler.transform(x_txt_train)
x_txt_val = txt_sscaler.transform(x_txt_val)
x_txt_test = txt_sscaler.transform(x_txt_test)

In [7]:
x_img_train_t = torch.tensor(x_img_train).float()
x_img_val_t = torch.tensor(x_img_val).float()
x_img_test_t = torch.tensor(x_img_test).float()

x_txt_train_t = torch.tensor(x_txt_train).float()
x_txt_val_t = torch.tensor(x_txt_val).float()
x_txt_test_t = torch.tensor(x_txt_test).float()

y_topic_train_t = torch.tensor(y_topic_train).float()
y_topic_val_t = torch.tensor(y_topic_val).float()
y_topic_test_t = torch.tensor(y_topic_test).float()

y_worthiness_train_t = torch.tensor(y_worthiness_train).float()
y_worthiness_val_t = torch.tensor(y_worthiness_val).float()
y_worthiness_test_t = torch.tensor(y_worthiness_test).float()

In [8]:
train_ds = TensorDataset(x_img_train_t, x_txt_train_t, y_topic_train_t, y_worthiness_train_t)
val_ds = TensorDataset(x_img_val_t, x_txt_val_t, y_topic_val_t, y_worthiness_val_t)
test_ds = TensorDataset(x_img_test_t, x_txt_test_t, y_topic_test_t, y_worthiness_test_t)

In [9]:
BATCH_SIZE = 2048

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)

## classwise SMOTE

In [10]:
x_train_combined = np.concatenate((x_img_train, x_txt_train), axis=1)

In [11]:
sub_datasets = []
for topic in range(N_TOPICS):
    topic_indices = np.where(np.argmax(y_topic_train, axis=1) == topic)[0]
    x_train_cur = x_train_combined[topic_indices]
    y_worthiness_cur = y_worthiness_train[topic_indices]
    sub_datasets.append((x_train_cur, y_worthiness_cur))

In [35]:
sub_datasets_smoted = []

for topic in range(N_TOPICS):
    if topic != 35 and topic != 42:
        print(topic)
        smote = SMOTE(random_state=42)
        sub_datasets_smoted.append(smote.fit_resample(
            sub_datasets[topic][0], 
            sub_datasets[topic][1]
        ))
    else:
        sub_datasets_smoted.append(sub_datasets[topic])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
36
37
38
39
40
41
43
44
45
46
47
48
49


In [36]:
x_train_combined_smoted = np.concatenate([e[0] for e in sub_datasets_smoted], axis=0)

x_img_train_smoted = x_train_combined_smoted[:, :IMG_LEN]
x_txt_train_smoted = x_train_combined_smoted[:, IMG_LEN:]

y_topic_train_smoted_non_cat = np.concatenate(
    [topic * np.ones(samples_of_topic) for topic, samples_of_topic in enumerate([len(e[0]) for e in sub_datasets_smoted])],
    axis=0
)
y_topic_train_smoted = to_categorical(y_topic_train_smoted_non_cat, N_TOPICS)



In [37]:
sub_datasets_smoted[35] = (sub_datasets_smoted[35][0], sub_datasets_smoted[35][1].argmax(axis=1).reshape(-1, 1))
sub_datasets_smoted[42] = (sub_datasets_smoted[42][0], sub_datasets_smoted[42][1].argmax(axis=1).reshape(-1, 1))
y_worthiness_train_smoted_non_cat = np.concatenate([e[1] for e in sub_datasets_smoted], axis=0)

In [39]:
y_worthiness_train_smoted = to_categorical(y_worthiness_train_smoted_non_cat, N_WORTHINESSES)

In [40]:
print(x_img_train_smoted.shape)
print(x_txt_train_smoted.shape)
print(y_topic_train_smoted.shape)
print(y_worthiness_train_smoted.shape)

(262749, 1024)
(262749, 300)
(262749, 50)
(262749, 2)


In [41]:
x_img_train_smoted_t = torch.tensor(x_img_train_smoted).float()
x_txt_train_smoted_t = torch.tensor(x_txt_train_smoted).float()
y_topic_train_smoted_t = torch.tensor(y_topic_train_smoted).float()
y_worthiness_train_smoted_t = torch.tensor(y_worthiness_train_smoted).float()  

train_ds_smoted = TensorDataset(
    x_img_train_smoted_t, 
    x_txt_train_smoted_t, 
    y_topic_train_smoted_t, 
    y_worthiness_train_smoted_t
)

train_loader_smoted = DataLoader(train_ds_smoted, batch_size=BATCH_SIZE)

In [42]:
model = models.MultitargetTridentModelBN(d=128, drop=0.5)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.0005)
writer = SummaryWriter('runs/mt_trident_bn_d128_drop05_wd0005_classwise_smoted_2')

models.fit_multitarget_trident_model(
    model=model,
    optimizer=optimizer,
    epochs=100,
    writer=writer,
    train_loader=train_loader_smoted,
    val_loader=val_loader
)

epoch: 0 train_loss: tensor(12.9478, grad_fn=<AddBackward0>) average train loss tensor(12.6716, grad_fn=<DivBackward0>)
val_topics_common_acc: 0.05005456744240923 
val_worthiness_common_acc: 0.16968999793528597 
val_avg_loss: tensor(12.3686) 
worthiness_roc_auc_score: 0.6052733430313998 
worthiness_pr_auc_score: 0.46914315468008505
epoch: 1 train_loss: tensor(12.9990, grad_fn=<AddBackward0>) average train loss tensor(12.4259, grad_fn=<DivBackward0>)
val_topics_common_acc: 0.10963631537032122 
val_worthiness_common_acc: 0.43385541102557296 
val_avg_loss: tensor(11.9166) 
worthiness_roc_auc_score: 0.583832763862064 
worthiness_pr_auc_score: 0.368611341857031
epoch: 2 train_loss: tensor(13.0736, grad_fn=<AddBackward0>) average train loss tensor(12.3457, grad_fn=<DivBackward0>)
val_topics_common_acc: 0.1559743975459399 
val_worthiness_common_acc: 0.5590360735038197 
val_avg_loss: tensor(11.6099) 
worthiness_roc_auc_score: 0.5635142460094698 
worthiness_pr_auc_score: 0.29015300006710715
epo

epoch: 25 train_loss: tensor(13.6050, grad_fn=<AddBackward0>) average train loss tensor(11.5958, grad_fn=<DivBackward0>)
val_topics_common_acc: 0.18638468572102765 
val_worthiness_common_acc: 0.8988290121818128 
val_avg_loss: tensor(10.2140) 
worthiness_roc_auc_score: 0.5911574163690074 
worthiness_pr_auc_score: 0.08680216547227398
epoch: 26 train_loss: tensor(13.5963, grad_fn=<AddBackward0>) average train loss tensor(11.5821, grad_fn=<DivBackward0>)
val_topics_common_acc: 0.19030764239152878 
val_worthiness_common_acc: 0.9039318054449459 
val_avg_loss: tensor(10.1992) 
worthiness_roc_auc_score: 0.5990097269737011 
worthiness_pr_auc_score: 0.09172064169814029
epoch: 27 train_loss: tensor(13.6046, grad_fn=<AddBackward0>) average train loss tensor(11.5708, grad_fn=<DivBackward0>)
val_topics_common_acc: 0.1822552576468159 
val_worthiness_common_acc: 0.9039023095301301 
val_avg_loss: tensor(10.1770) 
worthiness_roc_auc_score: 0.5985272571150246 
worthiness_pr_auc_score: 0.08947663505812603

epoch: 50 train_loss: tensor(13.6241, grad_fn=<AddBackward0>) average train loss tensor(11.3849, grad_fn=<DivBackward0>)
val_topics_common_acc: 0.20069020440668967 
val_worthiness_common_acc: 0.9033123912338141 
val_avg_loss: tensor(10.0342) 
worthiness_roc_auc_score: 0.6140723997686193 
worthiness_pr_auc_score: 0.1144947644765015
epoch: 51 train_loss: tensor(13.5803, grad_fn=<AddBackward0>) average train loss tensor(11.3730, grad_fn=<DivBackward0>)
val_topics_common_acc: 0.2015455859363478 
val_worthiness_common_acc: 0.9042562605079196 
val_avg_loss: tensor(10.0198) 
worthiness_roc_auc_score: 0.6130361510856017 
worthiness_pr_auc_score: 0.10740753963121286
epoch: 52 train_loss: tensor(13.6264, grad_fn=<AddBackward0>) average train loss tensor(11.3658, grad_fn=<DivBackward0>)
val_topics_common_acc: 0.20316786125121672 
val_worthiness_common_acc: 0.8911600743297053 
val_avg_loss: tensor(10.0286) 
worthiness_roc_auc_score: 0.6162919994624374 
worthiness_pr_auc_score: 0.12330572532157262


epoch: 75 train_loss: tensor(13.4822, grad_fn=<AddBackward0>) average train loss tensor(11.1959, grad_fn=<DivBackward0>)
val_topics_common_acc: 0.20658938736984928 
val_worthiness_common_acc: 0.9106568740229478 
val_avg_loss: tensor(9.9853) 
worthiness_roc_auc_score: 0.6186214933520411 
worthiness_pr_auc_score: 0.11705243007764538
epoch: 76 train_loss: tensor(13.4873, grad_fn=<AddBackward0>) average train loss tensor(11.1874, grad_fn=<DivBackward0>)
val_topics_common_acc: 0.21644102291832581 
val_worthiness_common_acc: 0.8981211102262336 
val_avg_loss: tensor(9.9865) 
worthiness_roc_auc_score: 0.6181048255566248 
worthiness_pr_auc_score: 0.13465748669206642
epoch: 77 train_loss: tensor(13.4575, grad_fn=<AddBackward0>) average train loss tensor(11.1762, grad_fn=<DivBackward0>)
val_topics_common_acc: 0.2150252190071675 
val_worthiness_common_acc: 0.8857918178332301 
val_avg_loss: tensor(9.9966) 
worthiness_roc_auc_score: 0.6196548819081862 
worthiness_pr_auc_score: 0.13886316576091037
ep

In [43]:
predictions_topics = torch.tensor([])
predictions_worthiness = torch.tensor([])

with torch.no_grad():
    for x_img_cur, x_txt_cur, _, _ in test_loader:
        out_topics,_, _, out_worthiness = model(x_img_cur.float(), x_txt_cur.float())
        predictions_topics = torch.cat((predictions_topics, out_topics), 0)
        predictions_worthiness = torch.cat((predictions_worthiness, out_worthiness), 0)

predictions_topics = predictions_topics.numpy()
predictions_worthiness = predictions_worthiness.numpy()

print(classification_report(np.argmax(y_topic_test, axis=1), np.argmax(predictions_topics, axis=1)))
print(classification_report(np.argmax(y_worthiness_test, axis=1), np.argmax(predictions_worthiness, axis=1)))

print('topics roc auc score:', roc_auc_score(y_topic_test, predictions_topics))
print('worthiness roc auc score:', roc_auc_score(y_worthiness_test, predictions_worthiness))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1694
           1       0.29      0.00      0.00      1512
           2       0.00      0.00      0.00       213
           3       0.72      0.30      0.42      1736
           4       0.00      0.00      0.00       295
           5       0.00      0.00      0.00       682
           6       0.93      0.17      0.29      1082
           7       0.72      0.06      0.11       543
           8       1.00      0.04      0.08       501
           9       1.00      0.00      0.01       903
          10       0.83      0.65      0.73      1307
          11       0.76      0.28      0.41      1130
          12       0.00      0.00      0.00      1241
          13       0.67      0.00      0.01      1007
          14       0.00      0.00      0.00       816
          15       0.61      0.06      0.11       514
          16       0.41      0.08      0.14      1851
          17       0.00    

In [44]:
PATH = 'saved_models/model_trained_on_classwise_smoted_2.tar'

torch.save({
    'epoch': 100,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict()
}, PATH)