In [2]:
import torch
import einops
import numpy as np
from glob import glob
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_video
from torchvision.transforms.functional import resize
from torchvision.transforms import Compose, Resize
from panaf.datasets import PanAfFullScene
from torchvision.transforms.v2 import ConvertDtype



In [3]:
# Root directory that is searched recursively for .mp4 files. Edit this single
# constant to point the notebook at a different copy of the dataset.
VIDEO_DIR = '/home/dl18206/Desktop/phd/data/panaf/acp/videos/all'

# Per-video preprocessing: resize every frame to 224x224, then convert to float32.
transform = Compose([Resize((224, 224)), ConvertDtype(dtype=torch.float32)])

# glob() gives no ordering guarantee, so sort the paths to make slices such as
# videos[:5] pick the same files on every run and on every machine.
videos = sorted(glob(f'{VIDEO_DIR}/**/*.mp4', recursive=True))

**High-memory method for mean and std calculation** (all frames are held in memory at once)

In [4]:
def calculate_mean_std_highmem(videos):
    """Compute per-channel mean and std with every frame held in memory at once.

    Each video is decoded, rearranged to channels-first layout and passed
    through the notebook-level ``transform``. The frames of all videos are then
    concatenated into one array on which the statistics are computed, so peak
    memory grows with the total number of frames.

    Args:
        videos: Iterable of video file paths accepted by ``read_video``.

    Returns:
        Tuple ``(mean_r, mean_g, mean_b, std_r, std_g, std_b)`` of NumPy
        scalars: the mean and the population standard deviation (NumPy's
        default ``ddof=0``) of channels 0, 1 and 2 respectively.

    Raises:
        ValueError: If ``videos`` yields no paths.
    """
    clips = []
    for path in tqdm(videos):
        # read_video returns a tuple; element 0 holds the video frames.
        clip = read_video(path, pts_unit='sec')[0]
        clip = einops.rearrange(clip, 't h w c -> t c h w')
        clips.append(transform(clip))

    if not clips:
        raise ValueError("No videos were provided, so no statistics can be computed.")

    # Concatenate along the frame axis instead of stacking: torch.stack needs
    # every video to contain the same number of frames, torch.cat does not.
    # The result is already laid out as (all_frames, c, h, w).
    imgs = torch.cat(clips, dim=0).numpy()

    mean_r, mean_g, mean_b = (imgs[:, channel].mean() for channel in range(3))
    std_r, std_g, std_b = (imgs[:, channel].std() for channel in range(3))

    return mean_r, mean_g, mean_b, std_r, std_g, std_b

In [5]:
# Run the high-memory implementation on the first five videos only, then print
# the three per-channel means followed by the three per-channel stds.
mean_r, mean_g, mean_b, std_r, std_g, std_b = calculate_mean_std_highmem(videos[:5])
print("Mean:", mean_r,mean_g,mean_b)
print("Std:", std_r,std_g,std_b)

100%|██████████| 5/5 [00:08<00:00,  1.64s/it]


Mean: 0.4452473 0.46967298 0.45827425
Std: 0.31590375 0.31013167 0.29014185


In [7]:
def _iter_transformed_videos(videos):
    """Yield each video in ``videos`` as a transformed, channels-first tensor.

    A tqdm progress bar is shown over ``videos``.
    """
    for path in tqdm(videos):
        # read_video returns a tuple; element 0 holds the video frames.
        clip = read_video(path, pts_unit='sec')[0]
        clip = einops.rearrange(clip, 't h w c -> t c h w')
        yield transform(clip)


def calculate_mean_std(videos):
    """Compute per-channel mean and std in two streaming passes over the videos.

    Only one video is processed at a time, so memory use does not grow with the
    number of videos. The first pass accumulates the per-channel pixel sum to
    obtain the mean; the second pass accumulates squared deviations from that
    mean to obtain the population standard deviation. Sums are accumulated in
    float64 so precision is not lost as the number of frames grows.

    Args:
        videos: Re-iterable collection of video file paths accepted by
            ``read_video`` (it is iterated twice).

    Returns:
        Tuple ``(mean, std)`` of float32 tensors with one entry per channel.

    Raises:
        ValueError: If no pixels were read (for example ``videos`` is empty).
    """
    # First pass: per-channel sum of all pixels and the pixel count per channel.
    pixel_sum = 0.
    nb_samples = 0
    for clip in _iter_transformed_videos(videos):
        pixel_sum = pixel_sum + clip.sum(dim=(0, 2, 3), dtype=torch.float64)
        nb_samples += clip.shape[0] * clip.shape[2] * clip.shape[3]

    if nb_samples == 0:
        raise ValueError("No frames were read, so no statistics can be computed.")
    mean = pixel_sum / nb_samples

    # Second pass: per-channel sum of squared deviations from the mean.
    # The mean is broadcast over (frames, channels, height, width).
    centre = mean.to(torch.float32).view(1, -1, 1, 1)
    sq_dev_sum = 0.
    for clip in _iter_transformed_videos(videos):
        sq_dev_sum = sq_dev_sum + ((clip - centre) ** 2).sum(dim=(0, 2, 3), dtype=torch.float64)

    std = torch.sqrt(sq_dev_sum / nb_samples)
    return mean.to(torch.float32), std.to(torch.float32)

In [8]:
# Run the two-pass implementation on the first five videos only.
mean, std = calculate_mean_std(videos[:5])

100%|██████████| 5/5 [00:08<00:00,  1.63s/it]
100%|██████████| 5/5 [00:08<00:00,  1.69s/it]


In [9]:
# Show the per-channel mean and std tensors returned by calculate_mean_std.
print("Mean:", mean)
print("Std:", std)

Mean: tensor([0.4452, 0.4697, 0.4583])
Std: tensor([0.3159, 0.3101, 0.2901])
