# Reference

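The pretrained ImageNet models used in this notebook are loaded with the `pretrainedmodels` package: https://github.com/Cadene/pretrained-models.pytorch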

In [1]:
import os
import cv2
import numpy as np
import sys
import glob
import json
import h5py
from tqdm import tqdm
import torch
import torchvision.transforms as transforms
import torchvision.models as models
import pretrainedmodels

# Base directory that contains the 'Datasets' folder: the first five
# components of the current working directory.
root = os.sep.join(os.getcwd().split(os.sep)[:5]) # may change based on dataset path
# Directory (relative to the working directory) that receives the feature files.
save_path = 'Saved_features'
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of failing at model.to(device).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [2]:
# List every architecture the package can build.
print(pretrainedmodels.model_names)

# Backbone used for appearance features; could be fbresnet152 or inceptionresnetv2.
model_name = 'inceptionresnetv2'

# Look up the factory function by name and build the ImageNet-pretrained network.
build_model = getattr(pretrainedmodels, model_name)
model = build_model(num_classes=1000, pretrained='imagenet').to(device)

# Switch to inference mode; as the last expression this also displays the architecture.
model.eval()

['fbresnet152', 'bninception', 'resnext101_32x4d', 'resnext101_64x4d', 'inceptionv4', 'inceptionresnetv2', 'alexnet', 'densenet121', 'densenet169', 'densenet201', 'densenet161', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'inceptionv3', 'squeezenet1_0', 'squeezenet1_1', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19_bn', 'vgg19', 'nasnetamobile', 'nasnetalarge', 'dpn68', 'dpn68b', 'dpn92', 'dpn98', 'dpn131', 'dpn107', 'xception', 'senet154', 'se_resnet50', 'se_resnet101', 'se_resnet152', 'se_resnext50_32x4d', 'se_resnext101_32x4d', 'cafferesnet101', 'pnasnet5large', 'polynet']


InceptionResNetV2(
  (conv2d_1a): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_2a): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_2b): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (maxpool_3a): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2d_3b): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (conv2d_4a): 

In [3]:
# Show the pretrained-settings entry registered for the selected model
# (weights URL, input space/size/range, mean/std and class count per variant).
print(pretrainedmodels.pretrained_settings[model_name])

{'imagenet': {'url': 'http://data.lip6.fr/cadene/pretrainedmodels/inceptionresnetv2-520b38e4.pth', 'input_space': 'RGB', 'input_size': [3, 299, 299], 'input_range': [0, 1], 'mean': [0.5, 0.5, 0.5], 'std': [0.5, 0.5, 0.5], 'num_classes': 1000}, 'imagenet+background': {'url': 'http://data.lip6.fr/cadene/pretrainedmodels/inceptionresnetv2-520b38e4.pth', 'input_space': 'RGB', 'input_size': [3, 299, 299], 'input_range': [0, 1], 'mean': [0.5, 0.5, 0.5], 'std': [0.5, 0.5, 0.5], 'num_classes': 1001}}


In [4]:
# Display the per-channel mean and std attached to the loaded model.
(model.mean, model.std)

([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])

In [5]:
# Preprocessing applied to each RGB frame (H x W x 3 array) before it is fed
# to the network: array -> PIL image -> resize -> float tensor -> normalize.
# The target size comes from the model itself (model.input_size is
# [channels, height, width]) rather than a hard-coded 299, so the transform
# stays correct when model_name is switched to a network with another input size.
data_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(tuple(model.input_size[1:])),
    transforms.ToTensor(),
    # In-place is safe here: the tensor was just created by ToTensor().
    transforms.Normalize(model.mean, model.std, inplace=True),
])
def video2tensor(video_path,total_frame):
    """Decode a video and return ``total_frame`` uniformly sampled, preprocessed frames.

    Each decoded frame is converted from OpenCV's BGR channel order to RGB and
    passed through the module-level ``data_transform``.

    Args:
        video_path: Path of the video, handed to ``cv2.VideoCapture``.
        total_frame: Number of frames to return; must be a positive integer.

    Returns:
        A tuple ``(frames, count, fail)`` where
        ``frames`` is the selected frames stacked along a new first dimension
        (always ``total_frame`` entries),
        ``count`` is the number of frames that were decoded successfully, and
        ``fail`` is the number of unsuccessful ``read()`` calls (the read that
        terminates the loop is counted, so it is 1 after a normal run).

    Raises:
        ValueError: If ``total_frame`` is not positive, or if no frame could be
            decoded from ``video_path``.
    """
    if total_frame < 1:
        raise ValueError('total_frame must be a positive integer, got {!r}'.format(total_frame))

    vidObj = cv2.VideoCapture(video_path)
    frames = []
    fail = 0
    try:
        while True:
            success, image = vidObj.read()
            if not success:
                fail += 1
                break
            # OpenCV decodes to BGR; the transform pipeline expects RGB.
            frames.append(data_transform(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)))
    finally:
        # Release the decoder even if a transform raises part-way through.
        vidObj.release()

    count = len(frames)
    if count == 0:
        raise ValueError('no frames could be decoded from {!r}'.format(video_path))

    if count >= total_frame:
        # Uniform stride over the clip: 0, interval, 2*interval, ...
        interval = count // total_frame
        indices = range(0, interval * total_frame, interval)
    else:
        # Clip is shorter than requested: spread the available frames evenly,
        # repeating some, so the output length is still total_frame
        # (a zero stride would otherwise make range() raise).
        indices = [(i * count) // total_frame for i in range(total_frame)]

    # Stack only the selected frames instead of materialising the whole clip.
    frames = torch.stack([frames[i] for i in indices])
    return frames,count,fail

# MSVD

In [6]:
# MSVD clips live under <root>/Datasets/MSVD/YouTube.
dset_path = os.path.join(root, 'Datasets', 'MSVD')
msvd_path = os.path.join(dset_path, 'YouTube')

# Every entry of the clip directory is treated as one video file.
msvd_name_list = glob.glob(os.path.join(msvd_path, '*'))

# Map each clip name (first field of a line) to its video id (last field),
# as listed in youtube_mapping.txt.
url2id = {}
# The context manager closes the mapping file once it has been read.
with open('youtube_mapping.txt','r') as mapping_file:
    for line in mapping_file:
        fields = line.split()
        if not fields:
            # Skip blank lines instead of inserting an empty '' -> '' entry.
            continue
        url2id[fields[0]] = fields[-1]

In [7]:
# Extract per-frame appearance features for every MSVD clip and store them in
# one HDF5 file, one dataset per video id.
# Create the output directory first: h5py cannot create the file when the
# directory is missing (e.g. on a fresh checkout).
os.makedirs(save_path, exist_ok=True)
save_file = os.path.join(save_path,'MSVD_APPEARANCE_INCEPTIONRESNETV2_28.hdf5')
# no_grad: inference only, so no autograd graph is kept.
with torch.no_grad(), h5py.File(save_file, 'w') as f:
    for name in tqdm(msvd_name_list):
        # 28 uniformly sampled, preprocessed frames for this clip.
        tensor,_,_ = video2tensor(name,28)
        # Average the feature map over dims 2 and 3
        # (presumably the spatial H x W axes of an N x C x H x W map).
        output_features = model.features(tensor.to(device)).mean(dim=(2,3))
        # File name without extension -> video id via the mapping file.
        ide = url2id[os.path.basename(name).split('.')[0]]
        f.create_dataset(ide, data = output_features.cpu().numpy())

100%|██████████| 1970/1970 [1:05:47<00:00,  2.00s/it]


# MSRVTT

In [8]:
# MSR-VTT keeps its train/val and test clips in separate folders under
# <root>/Datasets/MSR-VTT.
dset_path = os.path.join(root, 'Datasets', 'MSR-VTT')
msrvtt_trnval_path = os.path.join(dset_path, 'TrainValVideo')
msrvtt_test_path = os.path.join(dset_path, 'TestVideo')

# Collect every entry of both folders; train/val files come first.
msrvtt_trnval_name_list = glob.glob(os.path.join(msrvtt_trnval_path, '*'))
msrvtt_test_name_list = glob.glob(os.path.join(msrvtt_test_path, '*'))
msrvtt_name_list = [*msrvtt_trnval_name_list, *msrvtt_test_name_list]

In [9]:
# Extract per-frame appearance features for every MSR-VTT clip and store them
# in one HDF5 file, one dataset per video (keyed by file name without extension).
# Create the output directory first: h5py cannot create the file when the
# directory is missing (e.g. on a fresh checkout).
os.makedirs(save_path, exist_ok=True)
save_file = os.path.join(save_path,'MSRVTT_APPEARANCE_INCEPTIONRESNETV2_28.hdf5')
# no_grad: inference only, so no autograd graph is kept.
with torch.no_grad(), h5py.File(save_file, 'w') as f:
    for name in tqdm(msrvtt_name_list):
        # 28 uniformly sampled, preprocessed frames for this clip.
        tensor,_,_ = video2tensor(name,28)
        # Average the feature map over dims 2 and 3
        # (presumably the spatial H x W axes of an N x C x H x W map).
        output_features = model.features(tensor.to(device)).mean(dim=(2,3))
        ide = os.path.basename(name).split('.')[0]
        f.create_dataset(ide, data = output_features.cpu().numpy())

100%|██████████| 10000/10000 [4:39:09<00:00,  1.67s/it] 
