In [1]:
import sys
sys.path.insert(0, '..')

from causal_eval_model import *
from eval_model import *

import numpy as np
import torch

In [2]:
# --- Dataset locations (Charades) ---------------------------------------
_charades_root = '/data/Datasets/Charades'
# Annotation CSV, handed to the loaders as `list_file`.
map_file = _charades_root + '/Annotations/Charades_v1_test.csv'
# Video directory, handed to the loaders as `root_path`.
root_data_path = _charades_root + '/Charades_v1_480'

# --- Checkpoint ----------------------------------------------------------
# The run name appears twice in the checkpoint path (directory and file prefix).
_run_name = 'charades_resnet50nl32_full_config1'
pretrained_weights = ('/data/OnlineActionRecognition/models/'
                      + _run_name + '/' + _run_name + '_best_model.pth')

# --- Model ---------------------------------------------------------------
arch, backbone = 'nonlocal_net', 'resnet50'
# Forwarded to get_model(non_local=...) by both eval_* helpers.
baseline = True

# --- Data pipeline settings ----------------------------------------------
dataset = 'charades'
mode = 'val'
subset = False
# Forwarded both as `sample_frames` (loaders) and as `frame_num` (model).
sample_frames = 32
# Only eval_clip forwards this (as `num_workers`).
workers = 4

# `hvd` is not imported explicitly in this notebook; presumably it is exported
# by one of the star imports in the first cell -- TODO confirm.
hvd.init()

In [3]:
def eval_stream(map_file, root_data_path, pretrained_weights, arch, backbone, baseline, mode, subset,
                dataset, sample_frames, workers):
    """Return the first chunk produced by the streaming ('VideoStream') pipeline.

    Builds the distributed sampler in ``'stream'`` mode, loads the model, then
    opens a ``'VideoStream'`` loader for each sampled video in turn and returns
    as soon as one of them yields a chunk.

    NOTE(review): the model is never applied to the data here; loading and
    broadcasting it is kept only so logging and Horovod side effects match the
    original cell.

    Args:
        map_file: Forwarded to ``get_distributed_sampler`` as ``list_file``.
        root_data_path: Forwarded to ``get_distributed_sampler`` as ``root_path``.
        pretrained_weights: Forwarded to ``get_model``.
        arch: Forwarded to ``get_model``.
        backbone: Forwarded to ``get_model``.
        baseline: Forwarded to ``get_model`` as ``non_local``.
        mode: Forwarded to the ``'VideoStream'`` loader only; the model is
            always built with ``mode='val'``.
        subset: Forwarded to ``get_distributed_sampler``.
        dataset: Dataset name, first argument of ``get_distributed_sampler``.
        sample_frames: Forwarded as ``sample_frames`` (sampler) and
            ``frame_num`` (model).
        workers: Accepted for signature parity with ``eval_clip`` but unused;
            the stream loader is created with ``num_workers=0``.

    Returns:
        tuple: ``(chunk_data, chunk_target, first_frame, video_path)`` for the
        first chunk of the first sampled video whose stream is non-empty.
        ``first_frame`` is read from ``video_stream.dataset.first_frame``.

    Raises:
        RuntimeError: If the sampler is empty or no video stream yields a
            chunk (previously the function silently returned ``None``, which
            made the caller's tuple-unpacking fail with an unrelated error).
    """
    start_time = time.time()

    LOG = logging.getLogger(name='eval')

    # Loading data
    data_sampler = get_distributed_sampler(dataset, list_file=map_file, root_path=root_data_path,
                                           subset=subset, mode='stream',
                                           sample_frames=sample_frames)
    video_dataset = data_sampler.dataset
    total_per_gpu = data_sampler.num_samples
    num_classes = video_dataset.num_classes
    data_time = time.time()
    LOG.info('Loading dataset took {:.3f}s'.format(data_time - start_time))
    LOG.info('Sampler total_size: {} | Sampler num_samples: {}'.format(
        data_sampler.total_size, total_per_gpu))
    LOG.debug(video_dataset)

    # Loading model
    model = get_model(arch=arch, backbone=backbone, pretrained_weights=pretrained_weights,
                      mode='val', num_classes=num_classes, non_local=baseline,
                      frame_num=sample_frames, log_name='eval')
    model.eval()
    model_time = time.time()

    # Horovod: broadcast parameters.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    LOG.info('Loading model took {:.3f}s'.format(model_time - data_time))
    LOG.debug(model)

    with torch.no_grad():
        for vid in data_sampler:
            video_path, label = video_dataset[vid]

            video_stream = get_dataloader('VideoStream', video_path=video_path, label=label,
                                          batch_size=1, num_classes=num_classes, mode=mode,
                                          distributed=False, num_workers=0)

            # Only the very first chunk is needed; a video whose stream is
            # empty falls through to the next sampled video.
            for chunk_data, chunk_target in video_stream:
                return chunk_data, chunk_target, video_stream.dataset.first_frame, video_path

    raise RuntimeError(
        'eval_stream: no chunk was produced (empty sampler or empty video streams) '
        'for list_file={!r}'.format(map_file))
                

In [4]:
def eval_clip(map_file, root_data_path, pretrained_weights, arch, backbone, baseline, mode,
              dataset, sample_frames, workers):
    """Return the first ``(data, label)`` batch produced by the clip-based loader.

    Builds the distributed clip data loader (``batch_size=1``), loads the
    model, and returns the first batch the loader yields.

    NOTE(review): the model is never applied to the data here; loading and
    broadcasting it is kept only so logging and Horovod side effects match the
    original cell.

    Args:
        map_file: Forwarded to ``get_dataloader`` as ``list_file``.
        root_data_path: Forwarded to ``get_dataloader`` as ``root_path``.
        pretrained_weights: Forwarded to ``get_model``.
        arch: Forwarded to ``get_model``.
        backbone: Forwarded to ``get_model``.
        baseline: Forwarded to ``get_model`` as ``non_local``.
        mode: Forwarded to both ``get_dataloader`` and ``get_model``.
        dataset: Dataset name, first argument of ``get_dataloader``.
        sample_frames: Forwarded as ``sample_frames`` (loader) and
            ``frame_num`` (model).
        workers: Forwarded to ``get_dataloader`` as ``num_workers``.

    Returns:
        tuple: ``(data, label)`` exactly as yielded by the loader's first
        iteration.

    Raises:
        RuntimeError: If the loader yields no batch (previously the function
            silently returned ``None``, which made the caller's
            tuple-unpacking fail with an unrelated error).
    """
    start_time = time.time()

    LOG = logging.getLogger(name='eval')

    # Loading data
    data_loader = get_dataloader(dataset, list_file=map_file, root_path=root_data_path, mode=mode,
                                 sample_frames=sample_frames, batch_size=1, num_workers=workers,
                                 distributed=True)

    num_classes = data_loader.dataset.num_classes
    # The original wrapped the loader in enumerate() at this point, which calls
    # iter() on it immediately; the iterator is still created here, before the
    # model is loaded, so the order of operations is unchanged.
    batches = iter(data_loader)

    data_time = time.time()
    LOG.info('Loading dataset took {:.3f}s'.format(data_time - start_time))
    LOG.debug(data_loader.dataset)

    # Loading model
    model = get_model(arch=arch, backbone=backbone, pretrained_weights=pretrained_weights,
                      mode=mode, num_classes=num_classes, non_local=baseline,
                      frame_num=sample_frames, log_name='eval')
    model.eval()
    model_time = time.time()

    # Horovod: broadcast parameters.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    LOG.info('Loading model took {:.3f}s'.format(model_time - data_time))
    LOG.debug(model)

    with torch.no_grad():
        # Only the very first batch is needed.
        for data, label in batches:
            return data, label

    raise RuntimeError(
        'eval_clip: the data loader produced no batches for list_file={!r}'.format(map_file))

In [5]:
# First chunk from the streaming pipeline. Keyword arguments keep the long
# argument list from being silently mis-ordered.
stream_data, stream_label, first_frame, video_path = eval_stream(
    map_file=map_file,
    root_data_path=root_data_path,
    pretrained_weights=pretrained_weights,
    arch=arch,
    backbone=backbone,
    baseline=baseline,
    mode=mode,
    subset=subset,
    dataset=dataset,
    sample_frames=sample_frames,
    workers=workers,
)

[0, 1, 3, 4, 6, 7, 9, 11, 12, 14, 15, 17, 18, 20, 22, 23, 25, 26, 28, 30, 31, 33, 34, 36, 37, 39, 41, 42, 44, 45, 47, 49]
[1, 2, 4, 5, 7, 8, 10, 12, 13, 15, 16, 18, 19, 21, 23, 24, 26, 27, 29, 31, 32, 34, 35, 37, 38, 40, 42, 43, 45, 46, 48, 50]
[2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 17, 19, 20, 22, 24, 25, 27, 28, 30, 32, 33, 35, 36, 38, 39, 41, 43, 44, 46, 47, 49, 51]
[3, 4, 6, 7, 9, 10, 12, 14, 15, 17, 18, 20, 21, 23, 25, 26, 28, 29, 31, 33, 34, 36, 37, 39, 40, 42, 44, 45, 47, 48, 50, 52]
[4, 5, 7, 8, 10, 11, 13, 15, 16, 18, 19, 21, 22, 24, 26, 27, 29, 30, 32, 34, 35, 37, 38, 40, 41, 43, 45, 46, 48, 49, 51, 53]
[5, 6, 8, 9, 11, 12, 14, 16, 17, 19, 20, 22, 23, 25, 27, 28, 30, 31, 33, 35, 36, 38, 39, 41, 42, 44, 46, 47, 49, 50, 52, 54]
[6, 7, 9, 10, 12, 13, 15, 17, 18, 20, 21, 23, 24, 26, 28, 29, 31, 32, 34, 36, 37, 39, 40, 42, 43, 45, 47, 48, 50, 51, 53, 55]
[7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 22, 24, 25, 27, 29, 30, 32, 33, 35, 37, 38, 40, 41, 43, 44, 46, 48, 49, 51, 52, 54, 56]
[8, 9

[202, 203, 205, 206, 208, 209, 211, 213, 214, 216, 217, 219, 220, 222, 224, 225, 227, 228, 230, 232, 233, 235, 236, 238, 239, 241, 243, 244, 246, 247, 249, 251]
[203, 204, 206, 207, 209, 210, 212, 214, 215, 217, 218, 220, 221, 223, 225, 226, 228, 229, 231, 233, 234, 236, 237, 239, 240, 242, 244, 245, 247, 248, 250, 252]
[204, 205, 207, 208, 210, 211, 213, 215, 216, 218, 219, 221, 222, 224, 226, 227, 229, 230, 232, 234, 235, 237, 238, 240, 241, 243, 245, 246, 248, 249, 251, 253]
[205, 206, 208, 209, 211, 212, 214, 216, 217, 219, 220, 222, 223, 225, 227, 228, 230, 231, 233, 235, 236, 238, 239, 241, 242, 244, 246, 247, 249, 250, 252, 254]
[206, 207, 209, 210, 212, 213, 215, 217, 218, 220, 221, 223, 224, 226, 228, 229, 231, 232, 234, 236, 237, 239, 240, 242, 243, 245, 247, 248, 250, 251, 253, 255]
[207, 208, 210, 211, 213, 214, 216, 218, 219, 221, 222, 224, 225, 227, 229, 230, 232, 233, 235, 237, 238, 240, 241, 243, 244, 246, 248, 249, 251, 252, 254, 256]
[208, 209, 211, 212, 214, 215, 217

In [6]:
# First batch from the clip-based pipeline. Keyword arguments keep the long
# argument list from being silently mis-ordered.
clip_data, clip_label = eval_clip(
    map_file=map_file,
    root_data_path=root_data_path,
    pretrained_weights=pretrained_weights,
    arch=arch,
    backbone=backbone,
    baseline=baseline,
    mode=mode,
    dataset=dataset,
    sample_frames=sample_frames,
    workers=workers,
)

49
[ 0  1  3  4  6  7  9 11 12 14 15 17 18 20 22 23 25 26 28 30 31 33 34 36
 37 39 41 42 44 45 47 49]


In [7]:
print(stream_data.shape, stream_label.shape, first_frame, video_path)
# Select batch 0 / chunk 0 by explicit indexing. A bare .squeeze() removes
# *every* size-1 axis, so with a single-chunk stream the following [0] would
# index into the wrong axis instead of picking the first chunk.
stream_data_frame = stream_data[0, 0].numpy()
stream_label_frame = stream_label[0, 0].numpy()
print(stream_data_frame.shape, stream_label_frame.shape)

torch.Size([1, 90, 3, 32, 224, 224]) torch.Size([1, 90, 157]) 49 /data/Datasets/Charades/Charades_v1_480/YSKX3.mp4
(3, 32, 224, 224) (157,)


In [8]:
print(clip_data.shape, clip_label['target'].shape, clip_label['video_path'])
# Select batch 0 / clip 0 by explicit indexing. The original chained two bare
# .squeeze() calls: the second was a no-op, and both would also drop any other
# size-1 axis, making the extracted shape depend on the data.
clip_data_frame = clip_data[0, 0].numpy()
clip_label_frame = clip_label['target'][0].numpy()
print(clip_data_frame.shape, clip_label_frame.shape)

torch.Size([1, 1, 3, 32, 224, 224]) torch.Size([1, 157]) ['YSKX3']
(3, 32, 224, 224) (157,)


In [9]:
# array_equal also requires identical shapes; `np.all(a == b)` would broadcast
# and could report True for arrays of different shapes.
np.array_equal(stream_label_frame, clip_label_frame)

True

In [10]:
# array_equal also requires identical shapes; `np.all(a == b)` would broadcast
# and could report True for arrays of different shapes.
np.array_equal(stream_data_frame, clip_data_frame)

True