In [82]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torchvision.transforms as T
import torch.nn.functional as F
from einops import rearrange

### Annotations file exploration

In [4]:
CSV_NAME = 'EPIC_100_train.csv'

In [6]:
df = pd.read_csv(CSV_NAME)
df.head(1)

In [15]:
df['verb_class'].unique()

array([ 3,  6,  0,  5,  1,  4, 10,  7, 59, 77, 13, 23,  9, 38, 17, 28, 12,
        2, 72, 16, 35, 60,  8, 37, 18, 14, 11, 20, 39, 31, 15, 41, 22, 45,
       24, 42, 19, 34, 51, 27, 53, 52, 48, 47, 87, 64, 71, 49, 32, 21, 33,
       56, 44, 73, 25, 63, 30, 36, 82, 67, 26, 50, 89, 46, 61, 70, 76, 68,
       74, 40, 55, 86, 43, 91, 57, 66, 83, 94, 81, 29, 65, 58, 78, 62, 95,
       54, 85, 88, 69, 75, 79, 84, 80, 92, 96, 90, 93], dtype=int64)

In [32]:
columns = ['participant_id', 'video_id', 'start_frame', 'stop_frame', 'verb_class']
clip_df = df[columns]
clip_df

Unnamed: 0,participant_id,video_id,start_frame,stop_frame,verb_class
0,P01,P01_01,8,202,3
1,P01,P01_01,262,370,6
2,P01,P01_01,1498,1572,3
3,P01,P01_01,28785,28852,0
4,P01,P01_01,28888,28932,3
...,...,...,...,...,...
67212,P37,P37_103,18858,18883,8
67213,P37,P37_103,18893,19188,0
67214,P37,P37_103,19172,19633,9
67215,P37,P37_103,2028,2209,30


In [35]:
clip_info = clip_df.iloc[0]
clip_info

participant_id       P01
video_id          P01_01
start_frame            8
stop_frame           202
verb_class             3
Name: 0, dtype: object

### Dataset class testing

In [38]:
middel_frame = clip_info['start_frame'] + (clip_info['stop_frame'] - clip_info['start_frame']) // 2
clip_frames = (middel_frame - 30 // 2, middel_frame + 30 // 2)
clip_paths = ['home/' + 'frame_' + str(x).rjust(10, '0') + '.jpg' for x in range(clip_frames[0], clip_frames[1])]

In [39]:
for path in clip_paths:
    print(path)

home/frame_0000000090.jpg
home/frame_0000000091.jpg
home/frame_0000000092.jpg
home/frame_0000000093.jpg
home/frame_0000000094.jpg
home/frame_0000000095.jpg
home/frame_0000000096.jpg
home/frame_0000000097.jpg
home/frame_0000000098.jpg
home/frame_0000000099.jpg
home/frame_0000000100.jpg
home/frame_0000000101.jpg
home/frame_0000000102.jpg
home/frame_0000000103.jpg
home/frame_0000000104.jpg
home/frame_0000000105.jpg
home/frame_0000000106.jpg
home/frame_0000000107.jpg
home/frame_0000000108.jpg
home/frame_0000000109.jpg
home/frame_0000000110.jpg
home/frame_0000000111.jpg
home/frame_0000000112.jpg
home/frame_0000000113.jpg
home/frame_0000000114.jpg
home/frame_0000000115.jpg
home/frame_0000000116.jpg
home/frame_0000000117.jpg
home/frame_0000000118.jpg
home/frame_0000000119.jpg


In [103]:
x = torch.rand((1920, 1080, 3)).unsqueeze(-1)
y = torch.rand((1920, 1080, 3)).unsqueeze(-1)
z = torch.rand((1920, 1080, 3)).unsqueeze(-1)

print(x.shape)

torch.Size([1920, 1080, 3, 1])


In [104]:
lst = [x, y, z]
v = torch.cat(lst, -1)
print(v.shape)

torch.Size([1920, 1080, 3, 3])


In [107]:
x = torch.rand((640, 640, 3))
x
resize = T.Resize(size=(224,224))
x_resized = resize(x)
print(x_resized.shape)

torch.Size([640, 224, 224])


In [75]:
missing_frames = 7

# check if missing frames is odd in order to ensure that after padding, 
# num_frames equals NUM_FRAMES
if missing_frames % 2 == 0:
    print('1')
    pad = (missing_frames // 2, missing_frames // 2)
else:
    print('2')
    pad = (missing_frames // 2 + 1, missing_frames // 2)
print(f'pad: {pad}')
video = F.pad(v, pad, 'constant', 0)
print(video.shape)

2
pad: (4, 3)
torch.Size([3, 1920, 1080, 10])


In [89]:
# pool of size=3, stride=2
v1 = rearrange(video, 'c w h t -> c (w h) t')
print(f'v1 shape: {v1.shape}')
total = 10
num = 3
s = (total - 1)//(num - 1)
print(f'stride: {s}')
m = torch.nn.MaxPool1d(kernel_size = 2, stride=s)
output = m(v1)
output = rearrange(output, 'c (w h) t -> c w h t', h = 1080)
print(output.shape)

v1 shape: torch.Size([3, 2073600, 10])
stride: 4
torch.Size([3, 1920, 1080, 3])


In [86]:
a = 1920 * 1080
a

2073600

### image load test

In [91]:
img_path = 'sample_image.jpg'

In [100]:
from PIL import Image
import numpy as np

In [98]:
x = Image.open(img_path)

In [102]:
y = torch.tensor(np.asarray(x)).to('cpu')
print(y.shape)

torch.Size([200, 150, 3])
