In [2]:
from __future__ import print_function

from misc.config import Config
from dataset import IUDataset, build_dataset
from trainer import JoImTeR as trainer
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances,cosine_distances

import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# import os
import sys
import time
import random
import pprint
import datetime
import dateutil.tz
import argparse
import numpy as np
import pandas as pd
import torch
import torchvision.transforms as transforms
import pickle
import warnings
from tqdm import tqdm
if not sys.warnoptions:
    warnings.simplefilter("ignore")
from matplotlib import pyplot as plt
import torch.nn as nn
from torch.autograd import Variable
import torch.backends.cudnn as cudnn

from PIL import Image
import datetime
import dateutil.tz
from misc.utils import mkdir_p
# from datasets import prepare_data
from model import TextEncoder, ImageEncoder
from transformers import BertConfig
from misc.config import Config
cfg = Config()

In [3]:
torch.cuda.set_device(0)
cudnn.benchmark = True

In [4]:

np.random.seed(cfg.seed)
torch.manual_seed(cfg.seed)
if cfg.CUDA:
    torch.cuda.manual_seed_all(cfg.seed)


In [5]:
data_set = build_dataset('test', cfg)
test_loader = torch.utils.data.DataLoader(
                data_set, batch_size=100, drop_last=False,
                shuffle=False, num_workers=cfg.num_workers)

In [6]:
print(data_set.vocab_size)

1196


In [7]:

bert_config = BertConfig(vocab_size=test_loader.dataset.vocab_size, hidden_size=512, num_hidden_layers=3,
                    num_attention_heads=8, intermediate_size=2048, hidden_act='gelu',
                    hidden_dropout_prob=cfg.hidden_dropout_prob, attention_probs_dropout_prob=cfg.attention_probs_dropout_prob,
                    max_position_embeddings=512, layer_norm_eps=1e-12,
                    initializer_range=0.02, type_vocab_size=2, pad_token_id=0)

def build_model(cfg):
    image_encoder = ImageEncoder(output_channels=cfg.hidden_dim)

    if cfg.text_encoder_path != '':

        img_encoder_path = cfg.text_encoder_path.replace('text_encoder', 'image_encoder')
#         print('Load image encoder from:', img_encoder_path)
        state_dict = torch.load(img_encoder_path, map_location='cpu')
        if 'model' in state_dict.keys():
            image_encoder.load_state_dict(state_dict['model'])
        else:
            image_encoder.load_state_dict(state_dict)
    for p in image_encoder.parameters(): # make image encoder grad on
        p.requires_grad = False


    #         image_encoder.eval()
    epoch = 0

    ###################################################################
    text_encoder = TextEncoder(bert_config = bert_config)
    if cfg.text_encoder_path != '':
        text_encoder_path = cfg.text_encoder_path
#         print('Load text encoder from:', text_encoder_path)
        state_dict = torch.load(text_encoder_path, map_location='cpu')
        if 'model' in state_dict.keys():
            text_encoder.load_state_dict(state_dict['model'])
        else:
            text_encoder.load_state_dict(state_dict)
    for p in text_encoder.parameters(): # make text encoder grad on
        p.requires_grad = False

    # ########################################################### #
    if cfg.CUDA:
        text_encoder = text_encoder.cuda()
        image_encoder = image_encoder.cuda()
    return text_encoder.eval(),image_encoder.eval()
    

In [16]:
tp1_arr = []
tp5_arr = []
tp10_arr = []
tp1_arr_t = []
tp5_arr_t = []
tp10_arr_t = []
run_arr = []
for run in tqdm(os.listdir('../output')):
# for run in tqdm(['../output/OpenI_test_only.s.triplet_01.01_04_2020_12_05_14_32_12/']):
    cos = None
    
    cfg.text_encoder_path = os.path.join('../output',run,'Model',os.listdir(os.path.join('../output',run,'Model'))[0].replace('image','text'))

#     print(cfg.text_encoder_path)

    text_encoder, image_encoder = build_model(cfg)

    data_iter = iter(test_loader)
#     print('num_batches:',len(data_iter))


    for step in range(len(data_iter)):
        imgs, captions, masks, class_ids, cap_lens = data_iter.next()
        class_ids = class_ids.numpy()

        if cfg.CUDA:
            imgs, captions, masks, cap_lens = imgs.cuda(), captions.cuda(), masks.cuda(), cap_lens.cuda()
    #     b, n, l = captions.shape
        r_feats, v_feats = image_encoder(imgs)
        w_feats, s_feats = text_encoder(captions,masks)
        v_feats = v_feats.detach().cpu().numpy()
        s_feats = s_feats.detach().cpu().numpy()
        
        if cos is None:
            cos = cosine_similarity(v_feats,s_feats)
            cos_t = cosine_similarity(s_feats,v_feats)
        else:
            cos = np.concatenate([cos,cosine_similarity(v_feats,s_feats)],axis = 0)
            cos_t = np.concatenate([cos_t,cosine_similarity(s_feats,v_feats)],axis = 0)
    #     print(len(cos))
                # len(dataset.wordtoix)
    total_count=len(cos)
#     print(total_count)
    cos = np.array(cos) #i2t
    
    gt = np.concatenate(np.repeat([range(test_loader.batch_size)],cos.shape[0]/test_loader.batch_size,axis=0))
    run_arr.append(','.join(run.split('_')[2:5]))
#     print(cos.shape)

    # i2t r@100
    tp = cos.argsort()[:,-10:][:,::-1]
    tp_arr = []
    for i in range(10):
        tp_arr.append(np.array(tp[:,i]))
        
        
    t1,t5,t10=0,0,0
    for i in range(10):
        if i==0:
#             print(i,'top1')
            t1=((gt == tp_arr[i]).sum())
            t5 = t1
        elif i>0 and i<5:
#             print(i,'top5')
            t5+=((gt == tp_arr[i]).sum())
            t10 = t5
        elif i>4:
#             print(i,'top10')
            t10+=((gt == tp_arr[i]).sum())
        
    tp1_arr.append(t1*100/total_count)
    tp5_arr.append(t5*100/total_count)
    tp10_arr.append(t10*100/total_count)
    
    # t2i r@100
    cos_t = np.array(cos_t)
    tp = cos_t.argsort()[:,-10:][:,::-1]
    tp_arr = []
    for i in range(10):
        tp_arr.append(np.array(tp[:,i]))
        
        
    t1,t5,t10=0,0,0
    for i in range(10):
        if i==0:
#             print(i,'top1')
            t1=((gt == tp_arr[i]).sum())
            t5 = t1
        elif i>0 and i<5:
#             print(i,'top5')
            t5+=((gt == tp_arr[i]).sum())
            t10 = t5
        elif i>4:
#             print(i,'top10')
            t10+=((gt == tp_arr[i]).sum())
        
    tp1_arr_t.append(t1*100/total_count)
    tp5_arr_t.append(t5*100/total_count)
    tp10_arr_t.append(t10*100/total_count)
    
#     'top1:{0:.4f}, top2:{1:.4f}, top3:{2:.4f}'.format(((gt == tp1).sum())/total_count,((gt == tp2).sum()+(gt == tp1).sum())/total_count,((gt == tp3).sum()+(gt == tp2).sum()+(gt == tp1).sum())/total_count)
#     break

100%|██████████| 45/45 [06:21<00:00,  8.48s/it]


In [13]:
print(cfg.text_encoder_path)
# cos,cos_t
# tp_arr[i].shape
# gt.shape

../output/../output/OpenI_test_only.s.triplet_01.01_04_2020_12_05_14_32_12/Model/text_encoder150.pth


In [17]:
# os.listdir(os.path.join('../output',run,'Model'))[0]
os.listdir(os.path.join('../output',run,'Model'))
# os.listdir('../output')

['image_encoder0.pth',
 'text_encoder50.pth',
 'text_encoder0.pth',
 'image_encoder50.pth']

In [18]:
df_results = pd.DataFrame()
df_results['g1,g2,g3'] = run_arr

df_results['i2t_tp1'] = tp1_arr
df_results['i2t_tp5'] = tp5_arr
df_results['i2t_tp10'] = tp10_arr
df_results['t2i_tp1'] = tp1_arr_t
df_results['t2i_tp5'] = tp5_arr_t
df_results['t2i_tp10'] = tp10_arr_t
df_results.sort_values(by=['g1,g2,g3'])

Unnamed: 0,"g1,g2,g3",i2t_tp1,i2t_tp5,i2t_tp10,t2i_tp1,t2i_tp5,t2i_tp10
42,010101,4.0,11.333333,23.666667,3.0,12.333333,23.333333
10,010102,5.0,17.666667,30.0,6.333333,18.333333,30.333333
33,010103,3.666667,13.666667,25.0,4.333333,15.0,25.333333
13,010104,2.333333,18.333333,28.0,5.0,17.0,27.333333
6,010110,3.333333,15.333333,26.0,3.333333,13.0,25.666667
26,010210,2.333333,6.0,10.333333,1.333333,4.333333,10.0
18,010510,2.333333,12.333333,25.666667,3.0,14.333333,26.666667
3,011010,3.333333,14.333333,27.0,5.333333,19.0,27.666667
12,011510,2.0,15.0,24.666667,3.0,16.333333,27.333333
19,020110,2.0,14.333333,25.666667,4.666667,16.0,26.333333


In [14]:
df_results = pd.DataFrame()
df_results['g1,g2,g3'] = run_arr

df_results['i2t_tp1'] = tp1_arr
df_results['i2t_tp5'] = tp5_arr
df_results['i2t_tp10'] = tp10_arr
df_results['t2i_tp1'] = tp1_arr_t
df_results['t2i_tp5'] = tp5_arr_t
df_results['t2i_tp10'] = tp10_arr_t
df_results.sort_values(by=['g1,g2,g3'])

Unnamed: 0,"g1,g2,g3",i2t_tp1,i2t_tp5,i2t_tp10,t2i_tp1,t2i_tp5,t2i_tp10
0,"triplet.s0.5.w0.5,01,01",3.0,18.0,30.666667,2.666667,17.0,29.0


In [15]:
df_results = pd.DataFrame()
df_results['g1,g2,g3'] = run_arr

df_results['i2t_tp1'] = tp1_arr
df_results['i2t_tp5'] = tp5_arr
df_results['i2t_tp10'] = tp10_arr
df_results['t2i_tp1'] = tp1_arr_t
df_results['t2i_tp5'] = tp5_arr_t
df_results['t2i_tp10'] = tp10_arr_t
df_results.sort_values(by=['g1,g2,g3'])

Unnamed: 0,"g1,g2,g3",i2t_tp1,i2t_tp5,i2t_tp10,t2i_tp1,t2i_tp5,t2i_tp10
0,"only.s.triplet,01.01,04",3.666667,13.0,26.333333,3.0,9.333333,22.0


In [11]:
# df_results = pd.DataFrame()
# df_results['g1,g2,g3'] = run_arr
# df_results['tp1'] = tp1_arr
# df_results['tp2'] = tp2_arr
# df_results['tp3'] = tp3_arr
# df_results.sort_values(by=['g1,g2,g3'])

In [12]:
# df_results['g1,g2,g3'] = run_arr
# df_results['tp1'] = tp1_arr
# df_results['tp2'] = tp2_arr
# df_results['tp3'] = tp3_arr
# df_results.sort_values(by=['g1,g2,g3'])

In [13]:
# df_results.sort_values(by=['g1,g2,g3'])

In [19]:
df_results.sort_values(by=['g1,g2,g3']).to_csv('../data/r_i2t_t2i_100.csv')

In [15]:
# cos = np.array(cos)
# # cos.shape

In [16]:
# tp = cos.argsort()[:,-3:][:,::-1]
# tp1 = np.array(tp[:,0])
# tp2 = np.array(tp[:,1])
# tp3 = np.array(tp[:,2])
# gt = np.concatenate(np.repeat([range(100)],cos.shape[0]/100,axis=0))
# 'top1:{0:.4f}, top2:{1:.4f}, top3:{2:.4f}'.format(((gt == tp1).sum())/total_count,((gt == tp2).sum()+(gt == tp1).sum())/total_count,((gt == tp3).sum()+(gt == tp2).sum()+(gt == tp1).sum())/total_count)

In [17]:
# r_feats.shape, v_feats.shape, w_feats.shape, s_feats.shape

In [18]:

# for step in tqdm(range(len(data_iter))):
#     imgs, captions, masks, class_ids, cap_lens = data_iter.next()
#     class_ids = class_ids.numpy()

# #     if cfg.CUDA:
# #         imgs, captions, masks, cap_lens = imgs.cuda(), captions.cuda(), masks.cuda(), cap_lens.cuda()
# #     b, n, l = captions.shape
#     r_feats, v_feats = image_encoder(imgs)
#     w_feats, s_feats = text_encoder(captions,masks)
#     v_feats = v_feats.detach().cpu().numpy()
#     s_feats = s_feats.detach().cpu().numpy()
#     cos = cosine_similarity(v_feats,s_feats)
#     cos = np.array(cos)
#     print(cos.shape)
#     total_count = 100
#     tp = cos.argsort()[:,-3:][:,::-1]
#     tp1 = np.array(tp[:,0])
#     tp2 = np.array(tp[:,1])
#     tp3 = np.array(tp[:,2])
#     gt = np.zeros(cos.shape[0], dtype='int')
#     print('top1:{0}, top2:{1}, top3:{2}'.format(((gt == tp1).sum())/total_count,((gt == tp2).sum()+(gt == tp1).sum())/total_count,((gt == tp3).sum()+(gt == tp2).sum()+(gt == tp1).sum())/total_count))
#     break

In [19]:
# cos[0].argsort()

In [20]:
# plt.imshow((imgs[0].squeeze(0).detach().cpu().numpy()+1)/2)

In [21]:
# ' '.join([data_set.idx2word[x] for x in captions[0][:masks[0].sum()].detach().cpu().numpy()])

In [22]:
# ' '.join([data_set.idx2word[x] for x in captions[59][:masks[59].sum()].detach().cpu().numpy()])

In [23]:
# total_count = 100
# tp = cos.argsort()[:,-3:][:,::-1]
# tp1 = np.array(tp[:,0])
# tp2 = np.array(tp[:,1])
# tp3 = np.array(tp[:,2])
# gt = np.zeros(cos.shape[0], dtype='int')
# 'top1:{0}, top2:{1}, top3:{2}'.format(((gt == tp1).sum())/total_count,((gt == tp2).sum()+(gt == tp1).sum())/total_count,((gt == tp3).sum()+(gt == tp2).sum()+(gt == tp1).sum())/total_count)

In [24]:
# tp = cos.argsort()[:,-10:][:,::-1] // 5
# # tp1 = np.array(tp[:,0])
# # tp2 = np.array(tp[:,1])
# # tp3 = np.array(tp[:,2])
# # tp4 = np.array(tp[:,3])
# # tp5 = np.array(tp[:,4])
# sn = tp.shape[0]
# gt = np.repeat(np.arange(sn).reshape(sn,1), 10, axis=1)
# hits = np.equal(tp,gt)

In [25]:
# top1 = hits[:,:1].any(axis=1).sum() / hits.shape[0]
# top5 = hits[:,:5].any(axis=1).sum() / hits.shape[0]
# top10 = hits[:,:10].any(axis=1).sum() / hits.shape[0]
# print('top1: %.4f, top5: %.4f, top10: %.4f' % (top1, top5, top10))

In [26]:
# gt.shape,cos.shape

In [27]:
# a = imgs[-1][8].detach().cpu().numpy() 
# print(a.min(),a.max())
# a = (a - a.min()) / (a.max()-a.min())
# print(a.min(),a.max())
# plt.imshow(np.rollaxis(a=a,axis=0,start=3))

In [28]:
# [dataset.ixtoword[x] for x in captions[8].detach().cpu().numpy()]

In [29]:
# imgs[0].shape,imgs[1].shape,imgs[].shape

In [30]:
# sentences

In [31]:
import numpy as np
import random
import torch
import torch.nn as nn

In [32]:
# torch.dot()
# positive_pair - reduce the dot
# negative_pair - increase

In [33]:
ids = np.array(list(range(32)))

In [34]:
def softmax(x):
    return np.exp(prob)/sum(np.exp(prob))

In [35]:
cur_id = 12

In [36]:
neg_id = np.random.choice(np.array(ids),size=1)[0]
while neg_id==cur_id:
    neg_id = np.random.choice(np.array(ids),size=1)[0]

In [37]:
neg_id

6

In [38]:
np.random.choice(np.array(ids),size=1)[0]

19

In [39]:
neg_ids = torch.tensor([np.random.choice(ids[ids!=x]) for x in ids])

In [40]:
neg_ids

tensor([29, 15, 11,  8, 29, 21,  7, 26, 19, 23, 11, 10, 24, 21,  3,  7, 24,  2,
        22, 21,  1, 24, 11, 30,  5,  1, 28, 20,  0, 11, 25, 21])

In [41]:
a = torch.tensor(np.array([1,2,3,4]))
b = torch.tensor(np.array([2,4,6,8]))

In [42]:
def triplet_loss_with_distance_fn(distance_fn, anc, pos, neg, margin=0.5):
    score = distance_fn(anc,pos) - distance_fn(anc,neg) + margin
    z = torch.zeros_like(score)
    return torch.max(score,z)

In [43]:
top_5,top_10

NameError: name 'top_5' is not defined

In [None]:
import pandas as pd

In [None]:
d_100 = pd.read_csv('../data/r_100.csv')

In [None]:
d_100

In [None]:
'''
most common triplet loss
32 image text pairs
1 image as anchor - 1st text +ve sample - random sample 1 -ve sample from remaining 31 samples 

expecially in medical - no problem - is much large
                        problem - less
                        chance of -ve sample +ve sample being no problem high
                        solution: N-paired
                        


N paired triplet loss

-- table to select hyper param
1. w1.5s0.5 IU - 3 triplet - top 1
2. R@1, 5 , 10 (100) -  find 5
2.1. transpose matrix to get t2i as well
3. top 5 damsm add basic triplet top 1 - get idea after 2
4. top 1 damsm with N-paired triplet
----------------------------------------
-- find best - final ablation table
5. only with s_loss
6. only with s_triplet
7. MIMIC - s_loss, s_triplet alone
----------------------------------------
8. MIMIC learning rate 1e-4  works well

--------------------------------------
9. IU Classification data - make (use only 14 labels from the MIMIC)
10. Paper Diagram - make

11. IU/MIMIC classifcation comparable paper? 
12. Classification on IU - 3
13. Classifiaction on MIMIC - 3







4. Mimic - try the best settings on IU, (finetune) -> R@1K
    i. only attnGAN - s_loss
    ii. attnGAN - s_loss + 


Downstream:
    Classification :
        IU,MIMIC:
            Image - with random init image encoder
            Image - with our pretrained frozen image encoder  - [fc1 (256) - act(relu) - do - fc2 (14) - act(sm)] - non-trainable
            Image - [with our pretrained trainable image encoder - fc1 (256) - act(relu) - do - fc2 (14) - act(sm)] - trainable
        
            
        
    I2T - R-Prec and T2I:
        IU - top 7 results from r-prec - top 1, top5, top 10
        MIMIC - top 2 3 - top 1, top5, top 10
    
    
    
    
        
        
'''
