In [None]:
# Mount Google Drive at /content/drive; the dataset and output paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Clone the I3D PyTorch port, install scikit-video, then copy the repository
# contents into the working directory. The later imports of `model.I3D_Pytorch`
# and `video_utils`, and the relative `data/...` paths, presumably resolve
# against these copied files -- TODO confirm.
!git clone https://github.com/DevinDeSilva/kinetics-i3d-Pytorch.git
!pip install -q scikit-video
!cp -r kinetics-i3d-Pytorch/* .

Cloning into 'kinetics-i3d-Pytorch'...
remote: Enumerating objects: 794, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 794 (delta 12), reused 0 (delta 0), pack-reused 772[K
Receiving objects: 100% (794/794), 446.72 MiB | 17.93 MiB/s, done.
Resolving deltas: 100% (83/83), done.
Updating files: 100% (58/58), done.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# ffmpeg-python supplies the `ffmpeg` module that read_vid_ffmpeg uses.
!pip install -q ffmpeg-python

In [None]:
import csv
import gc
import json
import numbers
import os
import os.path
import pickle
import random
from glob import glob

import cv2
import ffmpeg
import h5py
import numpy as np
import skvideo
import skvideo.io
import torch
import torch.nn.functional as F
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms
from tqdm.autonotebook import tqdm

from model.I3D_Pytorch import I3D
from video_utils import *

  from tqdm.autonotebook import tqdm


In [None]:
def read_vid_ffmpeg(file_loc,width=224,height=224,num_frames=90):
  """Decode the leading frames of a video into a uint8 NumPy array.

  An ffmpeg subprocess decodes the file, rescales every frame to
  `width` x `height` and writes raw RGB24 bytes to stdout; those bytes are
  then reshaped into frames.

  Args:
    file_loc: Path of the video file handed to ffmpeg.
    width: Output frame width in pixels.
    height: Output frame height in pixels.
    num_frames: Maximum number of leading frames to keep.

  Returns:
    Writable np.uint8 array of shape (frames, height, width, 3) with
    frames <= num_frames; a shorter video yields fewer frames.
  """
  # Frame index `n` is zero-based, so `lt` keeps exactly frames
  # 0 .. num_frames-1 (`lte` would keep num_frames + 1 frames).
  out, _ = (
    ffmpeg
    .input(file_loc)
    .filter('select', 'lt(n,{})'.format(num_frames))
    .output('pipe:', format='rawvideo', pix_fmt='rgb24',s='{}x{}'.format(width, height))
    .run(capture_stdout=True)
  )
  # np.frombuffer over a bytes object is read-only; copy so that callers
  # (e.g. torch.from_numpy) receive a writable array.
  video = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3]).copy()

  return video

In [None]:
# Smoke-test the decoder on a single clip, using the default 224x224 size
# and num_frames=90.
vid = read_vid_ffmpeg("/content/drive/MyDrive/22_FYP42 - Zero-shot Explainable HAR/Datasets/Consolidated/PAMPA2/Videos/Nordic walking/nw14.mp4")

In [None]:

# Shape of the decoded clip: (frames, height, width, 3).
vid.shape

(91, 224, 224, 3)

In [None]:
class CollectedDataset(Dataset):
    """Videos stored as `<data_dir>/<class name>/<video>.mp4|.avi`.

    Each item is a decoded clip, its integer class id and the video's base
    name. Class ids are assigned from the alphabetically sorted
    sub-directory names, so the id <-> name mapping is reproducible.

    Attributes:
        data_dir: Root folder holding one sub-directory per class.
        transforms: Object passed in by the caller. NOTE(review): it is
            stored but never applied in __getitem__ -- confirm whether
            that is intended.
        id2cls: dict mapping class id -> class (folder) name.
        cls2id: dict mapping class (folder) name -> class id.
        vid_paths, video_name, video_class: parallel NumPy arrays with one
            entry per indexed video.
    """

    # Extensions that are indexed, in this order (all mp4 files, then avi).
    _VIDEO_EXTENSIONS = ("mp4", "avi")

    def __init__(self, data_dir,transforms):
        self.data_dir = data_dir
        self.gen_cls_map()
        self.__get_data()
        self.transforms = transforms

    def gen_cls_map(self):
        """Build `id2cls` and `cls2id` from the sub-directories of `data_dir`."""
        # os.listdir returns entries in arbitrary order and also lists plain
        # files: keep directories only and sort them so ids are stable.
        class_names = sorted(
            entry for entry in os.listdir(self.data_dir)
            if os.path.isdir(os.path.join(self.data_dir, entry))
        )
        self.id2cls = dict(enumerate(class_names))
        self.cls2id = {name: idx for idx, name in self.id2cls.items()}

    def __get_data(self):
        """Index every video file located exactly one level below `data_dir`."""
        vid_path = []
        vid_name = []
        vid_class = []
        for ext in self._VIDEO_EXTENSIONS:
            pattern = os.path.join(self.data_dir, "*", f"*.{ext}")
            # Sort because glob order is filesystem-dependent.
            for path in sorted(glob(pattern)):
                folder, file_name = os.path.split(path)
                # splitext removes only the final extension, so a dotted
                # name such as "clip.v2.mp4" keeps its full stem "clip.v2".
                vid_name.append(os.path.splitext(file_name)[0])
                vid_class.append(self.cls2id[os.path.basename(folder)])
                vid_path.append(path)

        self.vid_paths = np.asarray(vid_path)
        self.video_name = np.asarray(vid_name)
        self.video_class = np.asarray(vid_class)

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (video, target, name) where video is the result of
            video_to_tensor on the decoded frames, target is the class id
            of the video and name is its file name without extension.
        """
        vid_path = self.vid_paths[index]
        vid_name = self.video_name[index]
        vid_class = self.video_class[index]

        # Decoded with read_vid_ffmpeg's defaults (224x224, num_frames=90).
        frames = read_vid_ffmpeg(vid_path)

        return video_to_tensor(frames), vid_class, vid_name

    def __len__(self):
        """Number of indexed videos."""
        return len(self.vid_paths)


In [None]:
# Action codes "a1" .. "a27" mapped to readable action names. Codes are
# numbered in the order the names are listed, starting at 1.
label_map = {
    "a{}".format(number): action
    for number, action in enumerate(
        (
            "Swipe left",
            "Swipe right",
            "Wave",
            "Clap",
            "Throw",
            "Arm cross",
            "Basketball shoot",
            "Draw X",
            "Draw circle (clockwise)",
            "Draw circle (counter clockwise)",
            "Draw triangle",
            "Bowling",
            "Boxing",
            "Baseball swing",
            "Tennis swing",
            "Arm curl",
            "Tennis serve",
            "Push",
            "Knock",
            "Catch",
            "Pickup and throw",
            "Jog",
            "Walk",
            "Sit to stand",
            "Stand to sit",
            "Lunge",
            "Squat",
        ),
        start=1,
    )
}

In [None]:
# Presumably the size of the model's class output (Kinetics-400) -- TODO
# confirm. Not referenced by any other cell visible in this notebook.
_NUM_CLASSES = 400

# Presumably the frame count of the bundled sample clip -- TODO confirm.
# Not referenced by any other cell visible in this notebook.
_SAMPLE_VIDEO_FRAMES = 79
# Sample inputs stored as .npy files, relative to the working directory.
_SAMPLE_PATHS = {
    'rgb': 'data/v_CricketShot_g04_c01_rgb.npy',
    'flow': 'data/v_CricketShot_g04_c01_flow.npy',
}

# Checkpoint files, relative to the working directory. Only 'rgb_imagenet'
# is loaded by the cells visible in this notebook.
_CHECKPOINT_PATHS = {
    'rgb': 'data/pytorch_checkpoints/rgb_scratch.pkl',
    'flow': 'data/pytorch_checkpoints/flow_scratch.pkl',
    'rgb_imagenet': 'data/pytorch_checkpoints/rgb_imagenet.pkl',
    'flow_imagenet': 'data/pytorch_checkpoints/flow_imagenet.pkl',
}

# Text file that is read line by line into `kinetics_classes`.
_LABEL_MAP_PATH = 'data/label_map.txt'
# Root folder passed to CollectedDataset: one sub-folder per class.
VID_LOC = "/content/drive/MyDrive/22_FYP42 - Zero-shot Explainable HAR/Datasets/Consolidated/UTD-MHAD/Videos"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the label list, one stripped label per line; the `with` block
# guarantees the file handle is closed once the list is built.
with open(_LABEL_MAP_PATH) as label_file:
    kinetics_classes = [line.strip() for line in label_file]

In [None]:
# Disabled one-off step (kept as a string literal so it never executes):
# moves every entry of VID_LOC into a sub-folder named after the text before
# the first "_" in the entry's name.
"""import shutil
from tqdm.autonotebook import tqdm

for k in tqdm(os.listdir(VID_LOC)):
  cls = k.split("_")[0]
  dest_fol = os.path.join(VID_LOC,cls)
  os.makedirs(dest_fol,exist_ok=True)
  shutil.move(os.path.join(VID_LOC,k),os.path.join(dest_fol,k))  """

'import shutil\nfrom tqdm.autonotebook import tqdm\n\nfor k in tqdm(os.listdir(VID_LOC)):\n  cls = k.split("_")[0]\n  dest_fol = os.path.join(VID_LOC,cls)\n  os.makedirs(dest_fol,exist_ok=True)\n  shutil.move(os.path.join(VID_LOC,k),os.path.join(dest_fol,k))  '

In [None]:
# Disabled one-off step (kept as a string literal so it never executes):
# renames every entry of VID_LOC to the readable name that label_map gives
# for it.
"""for k in tqdm(os.listdir(VID_LOC)):
  os.rename(
      os.path.join(VID_LOC,k), 
      os.path.join(VID_LOC,label_map[k]))"""

'for k in tqdm(os.listdir(VID_LOC)):\n  os.rename(\n      os.path.join(VID_LOC,k), \n      os.path.join(VID_LOC,label_map[k]))'

In [None]:
# Build the RGB I3D network and restore the 'rgb_imagenet' checkpoint.
rgb_i3d = I3D(input_channel=3)
rgb_i3d.eval()  # evaluation mode: this notebook only runs inference
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from GPU tensors still loads when `device` has fallen back to the CPU.
state_dict = torch.load(_CHECKPOINT_PATHS['rgb_imagenet'], map_location=device)
rgb_i3d.load_state_dict(state_dict)
rgb_i3d.to(device)
print('RGB checkpoint restored')

RGB checkpoint restored


In [None]:
# Inspect the module stored at index 16 of the network's `features` container.
rgb_i3d.features[16]

AvgPool3d(kernel_size=(2, 7, 7), stride=1, padding=0)

In [None]:
# Sanity-check the restored weights on the bundled sample clip.
rgb_sample = torch.from_numpy(np.load(_SAMPLE_PATHS['rgb'])).to(device)
# Move the last axis to position 1 (presumably channels-last to
# channels-first -- TODO confirm against the .npy layout).
rgb_sample = rgb_sample.permute(0, 4, 1, 2 ,3)
print('RGB data loaded, shape=', str(rgb_sample.size()), rgb_sample.dtype)

# Inference only: no_grad stops autograd from recording a graph, so the
# forward activations are not kept alive on the device.
with torch.no_grad():
    rbg_score, rgb_logits, rgb_features = rgb_i3d(rgb_sample)

RGB data loaded, shape= torch.Size([1, 3, 79, 224, 224]) torch.float32


In [None]:
# Shapes of the three values returned by the model for the sample clip.
rbg_score.shape,rgb_logits.shape,rgb_features.shape

(torch.Size([1, 400]), torch.Size([1, 400]), torch.Size([1, 1024]))

In [None]:
# Collect unreachable Python objects first, then release the GPU memory that
# PyTorch's caching allocator holds but no longer uses.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Transform pipeline handed to CollectedDataset. CenterCrop is not imported by
# name in this notebook, so it presumably comes from
# `from video_utils import *` -- TODO confirm.
test_transforms = transforms.Compose([CenterCrop(224)])

In [None]:
test_data = CollectedDataset(VID_LOC,test_transforms)
# Feature extraction gains nothing from shuffling; iterating in dataset order
# keeps the order of the saved per-class feature lists reproducible.
test_dl = DataLoader(
    test_data,
    batch_size=1,  # the extraction loop squeezes the batch axis and calls int(y)
    shuffle=False,
    num_workers=1,
    pin_memory=True,
)

In [None]:
# Disabled experiment (kept as a string literal so it never executes): clips
# with fewer than 90 entries along dim 1 are repeated twice along that dim,
# then the first 90 entries of every clip are concatenated into one tensor.
"""combined_arr = []
for d in tqdm(test_dl):
  vid = d[0][0]
  if vid.shape[1] <90:
    vid = vid.repeat(1,2,1,1)

  print(vid[:,0:90,:,:].shape)
  combined_arr.append(vid[:,0:90,:,:])

combined_arr = torch.concat(combined_arr)"""

'combined_arr = []\nfor d in tqdm(test_dl):\n  vid = d[0][0]\n  if vid.shape[1] <90:\n    vid = vid.repeat(1,2,1,1)\n\n  print(vid[:,0:90,:,:].shape)\n  combined_arr.append(vid[:,0:90,:,:])\n\ncombined_arr = torch.concat(combined_arr)'

In [None]:
# Number of videos indexed by the dataset.
len(test_data)

861

In [None]:
# Run every video through I3D and collect, per class name, a list of
# [features, logits] pairs stored as NumPy arrays.
outputs = {}
# Inference only: without no_grad every forward pass would also record an
# autograd graph and hold its activations in device memory.
with torch.no_grad():
  for data in tqdm(test_dl,desc="Read Videos",total= len(test_data)):
    X, y, vid_name = data
    X = X.float().to(device)
    _scores, rgb_logits, rgb_features = rgb_i3d(X)

    # The loader yields batches of one video, so drop the batch axis before
    # converting to NumPy.
    rgb_logits = rgb_logits.squeeze(0).detach().cpu().numpy()
    rgb_features = rgb_features.squeeze(0).detach().cpu().numpy()

    # setdefault creates the class bucket on first sight of a class.
    cls_name = test_data.id2cls[int(y)]
    outputs.setdefault(cls_name, []).append([rgb_features,rgb_logits])


Read Videos:   0%|          | 0/861 [00:00<?, ?it/s]

  return torch.from_numpy(pic.transpose([3,0,1,2]))


In [None]:
# Alias only: feat_dict refers to the same dict object as outputs (no copy).
feat_dict = outputs

In [None]:
#feat_dict["Jump front & back"] = feat_dict["Jump front _ back"]

In [None]:
# Class names for which features were collected.
feat_dict.keys()

dict_keys(['Squat', 'Lunge', 'Knock', 'Swipe left', 'Throw', 'Basketball shoot', 'Sit to stand', 'Bowling', 'Draw circle (counter clockwise)', 'Tennis serve', 'Tennis swing', 'Jog', 'Clap', 'Pickup and throw', 'Boxing', 'Catch', 'Walk', 'Swipe right', 'Draw X', 'Wave', 'Push', 'Arm curl', 'Stand to sit', 'Draw triangle', 'Draw circle (clockwise)', 'Baseball swing', 'Arm cross'])

In [None]:
# Make sure the output folder exists; exist_ok=True keeps a re-run of this
# cell from failing when the folder is already there.
os.makedirs("/content/drive/MyDrive/22_FYP42 - Zero-shot Explainable HAR/Datasets/Consolidated/UTD-MHAD/I3D",exist_ok=True)

In [None]:
# Persist the per-class [features, logits] lists to Drive.
with open('/content/drive/MyDrive/22_FYP42 - Zero-shot Explainable HAR/Datasets/Consolidated/UTD-MHAD/I3D/video_featV2.pkl', 'wb') as f:
    pickle.dump(feat_dict, f)