In [1]:
import os
import cv2
import numpy as np
import sys
import glob
import json
import h5py

from tqdm import tqdm
import torch
import torchvision.transforms as transforms
import torchvision.models as models
import pretrainedmodels

root = os.sep.join(os.getcwd().split(os.sep)[:-4])
device = torch.device('cuda:0')

In [2]:
print(pretrainedmodels.model_names)
model_name = 'inceptionresnetv2' # could be fbresnet152 or inceptionresnetv2
model = pretrainedmodels.__dict__[model_name](num_classes=1000, pretrained='imagenet')
model = model.to(device)
model.eval()

['fbresnet152', 'bninception', 'resnext101_32x4d', 'resnext101_64x4d', 'inceptionv4', 'inceptionresnetv2', 'alexnet', 'densenet121', 'densenet169', 'densenet201', 'densenet161', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'inceptionv3', 'squeezenet1_0', 'squeezenet1_1', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19_bn', 'vgg19', 'nasnetamobile', 'nasnetalarge', 'dpn68', 'dpn68b', 'dpn92', 'dpn98', 'dpn131', 'dpn107', 'xception', 'senet154', 'se_resnet50', 'se_resnet101', 'se_resnet152', 'se_resnext50_32x4d', 'se_resnext101_32x4d', 'cafferesnet101', 'pnasnet5large', 'polynet']


InceptionResNetV2(
  (conv2d_1a): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_2a): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_2b): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (maxpool_3a): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2d_3b): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_4a): 

In [7]:
sp_model = torch.nn.Sequential(*list(model.children())[:-8])
sp_model

Sequential(
  (0): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (1): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (2): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (4): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (5): BasicConv2d(
    (conv): Conv2d(80, 192, kernel_size=(3,

In [6]:
data_transform = transforms.Compose([transforms.ToPILImage(),transforms.Resize((299,299)),transforms.ToTensor()])
def video2tensor(video_path):
    vidObj = cv2.VideoCapture(video_path) 
    count = 0
    success = 1
    frames = []
    fail = 0
    while success: 
        # OpenCV Uses BGR Colormap
        success, image = vidObj.read() 
        if success:
            RGBimage = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            #transform images
            #print(RGBimage.shape)
            frames.append(data_transform(RGBimage))
            count += 1
        else:
            fail += 1
    vidObj.release()
    frames = torch.stack(frames)
    #frames = torch.from_numpy(frames)
    # take 28 frames per clip uniformly
    interval = count//28
    frames = frames[range(0,interval*28,interval)]
    return frames,count,fail

# MSVD

In [8]:
dset_path = os.path.join(os.path.join(root,'Datasets'),'MSVD')
msvd_path = os.path.join(dset_path,'YouTube')

msvd_name_list = glob.glob(msvd_path+os.sep+'*')

url2id = {}
for line in open('youtube_mapping.txt','r').readlines():
    url2id[line.strip().split(' ')[0]] = line.strip().split(' ')[-1]

In [9]:
with torch.no_grad():
    with h5py.File('MSVD_SPATIO_TEMPORAL_320x17x17_INCEPTIONRESNETV2.hdf5', 'w') as f:
        for name in tqdm(msvd_name_list):
            tensor,_,_ = video2tensor(name)
            output_features = sp_model(tensor.to(device))
            print(output_features.size())
            ide = url2id[name.split(os.sep)[-1].split('.')[0]]
            f.create_dataset(ide, data = output_features.cpu().numpy())

  0%|          | 1/1970 [00:01<52:44,  1.61s/it]

torch.Size([28, 320, 35, 35])


  0%|          | 2/1970 [00:02<50:33,  1.54s/it]

torch.Size([28, 320, 35, 35])


  0%|          | 2/1970 [00:03<1:02:33,  1.91s/it]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/nasibullah/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-e95ed99f17b6>", line 4, in <module>
    tensor,_,_ = video2tensor(name)
  File "<ipython-input-6-921ccf1f2b07>", line 15, in video2tensor
    frames.append(data_transform(RGBimage))
  File "/home/nasibullah/anaconda3/lib/python3.8/site-packages/torchvision/transforms/transforms.py", line 67, in __call__
    img = t(img)
  File "/home/nasibullah/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/nasibullah/anaconda3/lib/python3.8/site-packages/torchvision/transforms/transforms.py", line 267, in forward
    return F.resize(img, self.size, self.interpolation)
  File "/home/nasibullah/anaconda3/lib/python3.8/site-packages/torchvision/transforms/functional.py", li

TypeError: object of type 'NoneType' has no len()

# MSRVTT

In [None]:
dset_path = os.path.join(os.path.join(root,'Datasets'),'MSR-VTT')
msrvtt_trnval_path = os.path.join(dset_path,'TrainValVideo')
msrvtt_test_path = os.path.join(dset_path,'TestVideo')

msrvtt_trnval_name_list = glob.glob(msrvtt_trnval_path+os.sep+'*')
msrvtt_test_name_list = glob.glob(msrvtt_test_path+os.sep+'*')
msrvtt_name_list = msrvtt_trnval_name_list+msrvtt_test_name_list

In [None]:
with torch.no_grad():
    with h5py.File('MSRVTT_APPEARANCE_INCEPTIONRESNETV2.hdf5', 'w') as f:
        for name in tqdm(msrvtt_name_list):
            tensor,_,_ = video2tensor(name)
            output_features = model.features(tensor.to(device)).mean(dim=(2,3))
            #print(output_features.size())
            ide = name.split(os.sep)[-1].split('.')[0]
            f.create_dataset(ide, data = output_features.cpu().numpy())