In [1]:
import sys
sys.path.insert(0, '..')

from causal_eval_model import *
from eval_model import *

import numpy as np
import torch

In [2]:
map_file = '/data/Datasets/Charades/Annotations/Charades_v1_test.csv'
root_data_path = '/data/Datasets/Charades/Charades_v1_480'
pretrained_weights = '/data/OnlineActionRecognition/models/charades_resnet50nl32_full_config1/charades_resnet50nl32_full_config1_best_model.pth'
arch = 'nonlocal_net'
backbone = 'resnet50'
baseline = True
mode = 'val'
subset = False
dataset = 'charades'
sample_frames = 32
workers = 4

hvd.init()

In [6]:
def eval_stream(map_file, root_data_path, pretrained_weights, arch, backbone, baseline, mode, subset,
                dataset, sample_frames, workers):
    start_time = time.time()

    LOG = logging.getLogger(name='eval')
    RESULTS = logging.getLogger(name='results')

    # Loading data
    data_sampler = get_distributed_sampler(dataset, list_file=map_file, root_path=root_data_path,
                                           subset=subset, mode='stream',
                                           sample_frames=sample_frames)
    video_dataset = data_sampler.dataset
    total_per_gpu = data_sampler.num_samples
    num_classes = video_dataset.num_classes
    data_time = time.time()
    LOG.info('Loading dataset took {:.3f}s'.format(data_time - start_time))
    LOG.info('Sampler total_size: {} | Sampler num_samples: {}'.format(
        data_sampler.total_size, total_per_gpu))
    LOG.debug(video_dataset)

    # Loading model
    model = get_model(arch=arch, backbone=backbone, pretrained_weights=pretrained_weights,
                      mode='val', num_classes=num_classes, non_local=baseline,
                      frame_num=sample_frames, log_name='eval')
    model.eval()
    model_time = time.time()

    def avg_output(outputs):
        avg_pool = AvgPool1d(3)

        data = outputs.view(1, -1, num_classes).contiguous()
        data = data.permute(0, 2, 1).contiguous()

        # During test, fullyconv transform takes 3 random crops of each clip
        data = avg_pool(data)
        video_data = data.view(-1, num_classes).contiguous()

        return video_data

    # Horovod: broadcast parameters.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    LOG.info('Loading model took {:.3f}s'.format(model_time - data_time))
    LOG.debug(model)

    video_metric = m.VideoPerFrameMAP(
        m.mAP()) if data_sampler.dataset.multi_label else m.VideoPerFrameAccuracy(m.TopK(k=(1, 5)))
    batch_time = m.AverageMeter('batch_time')
    data_time = m.AverageMeter('data_time')
    with torch.no_grad():

        end = time.time()
        for i, vid in enumerate(data_sampler, start=1):
            video_path, label = video_dataset[vid]
            # measure data loading time
            data_time.update(time.time() - end)
            all_data = []
            all_targets = []

            video_stream = get_dataloader(
                (dataset, 'stream'), video_path=video_path, label=label, batch_size=1,
                num_classes=num_classes, mode=mode, distributed=False, num_workers=0)

            for j, (chunk_data, chunk_target) in enumerate(video_stream):
                all_data.append(chunk_data)
                all_targets.append(chunk_target)
                
            return all_data, all_targets, video_stream.dataset.target['video_path']
                

In [25]:
def eval_clip(map_file, root_data_path, pretrained_weights, arch, backbone, baseline, mode,
              dataset, sample_frames, workers):
    start_time = time.time()

    LOG = logging.getLogger(name='eval')
    RESULTS = logging.getLogger(name='results')

    # Loading data
    data_loader = get_dataloader(dataset, list_file=map_file, root_path=root_data_path, mode=mode,
                                 sample_frames=sample_frames, batch_size=1, num_workers=workers,
                                 distributed=True)

    total_num = len(data_loader.dataset)
    num_classes = data_loader.dataset.num_classes
    data_gen = enumerate(data_loader, start=1)

    data_time = time.time()
    LOG.info('Loading dataset took {:.3f}s'.format(data_time - start_time))
    LOG.debug(data_loader.dataset)

    # Loading model
    model = get_model(arch=arch, backbone=backbone, pretrained_weights=pretrained_weights,
                      mode=mode, num_classes=num_classes, non_local=baseline,
                      frame_num=sample_frames, log_name='eval')
    model.eval()
    model_time = time.time()

    if data_loader.dataset.multi_label:
        def video_output(outputs):
            max_pool = MaxPool1d(data_loader.dataset.test_clips)
            avg_pool = AvgPool1d(3)

            data = outputs.view(1, -1, num_classes).contiguous()
            data = data.permute(0, 2, 1).contiguous()

            data = max_pool(data)
            if mode == 'test':
                # During test, fullyconv transform takes 3 random crops of each clip
                data = avg_pool(data)
            video_data = data.view(-1, num_classes).contiguous()

            return video_data

    else:
        def video_output(outputs):
            return outputs.mean(0)

    # Horovod: broadcast parameters.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    LOG.info('Loading model took {:.3f}s'.format(model_time - data_time))
    LOG.debug(model)

    video_metric = m.VideoMAP(m.mAP()) if data_loader.dataset.multi_label else m.VideoAccuracy(
        m.TopK(k=(1, 5)))
    batch_time = m.AverageMeter('batch_time')
    data_time = m.AverageMeter('data_time')
    with torch.no_grad():

        end = time.time()
        for i, (data, label) in data_gen:
            return data, label

In [5]:
stream_data, stream_label, clips_path = eval_stream(
    map_file, root_data_path, pretrained_weights, arch, backbone, baseline, mode, subset, dataset, sample_frames, workers)

[ 49  74  99 125 150 176 201 227 252 278]
[0, 1, 3, 4, 6, 7, 9, 11, 12, 14, 15, 17, 18, 20, 22, 23, 25, 26, 28, 30, 31, 33, 34, 36, 37, 39, 41, 42, 44, 45, 47, 49, 25, 26, 28, 29, 31, 32, 34, 36, 37, 39, 40, 42, 43, 45, 47, 48, 50, 51, 53, 55, 56, 58, 59, 61, 62, 64, 66, 67, 69, 70, 72, 74, 50, 51, 53, 54, 56, 57, 59, 61, 62, 64, 65, 67, 68, 70, 72, 73, 75, 76, 78, 80, 81, 83, 84, 86, 87, 89, 91, 92, 94, 95, 97, 99, 76, 77, 79, 80, 82, 83, 85, 87, 88, 90, 91, 93, 94, 96, 98, 99, 101, 102, 104, 106, 107, 109, 110, 112, 113, 115, 117, 118, 120, 121, 123, 125, 101, 102, 104, 105, 107, 108, 110, 112, 113, 115, 116, 118, 119, 121, 123, 124, 126, 127, 129, 131, 132, 134, 135, 137, 138, 140, 142, 143, 145, 146, 148, 150, 127, 128, 130, 131, 133, 134, 136, 138, 139, 141, 142, 144, 145, 147, 149, 150, 152, 153, 155, 157, 158, 160, 161, 163, 164, 166, 168, 169, 171, 172, 174, 176, 152, 153, 155, 156, 158, 159, 161, 163, 164, 166, 167, 169, 170, 172, 174, 175, 177, 178, 180, 182, 183, 185, 186, 1

In [27]:
clip_data, clip_label = eval_clip(
    map_file, root_data_path, pretrained_weights, arch, backbone, baseline, 'test', dataset, sample_frames, workers)

In [26]:
print(clips_path)
print(stream_data.shape, stream_label.shape)
stream_data_frame = stream_data.squeeze().numpy()
stream_label_frame = stream_label.squeeze().numpy()
stream_label_frame = (np.sum(stream_label_frame, axis=0) > 0).astype(int)
print(stream_data_frame.shape, stream_label_frame.shape)

['YSKX3_000049', 'YSKX3_000050', 'YSKX3_000051', 'YSKX3_000052', 'YSKX3_000053', 'YSKX3_000054', 'YSKX3_000055', 'YSKX3_000056', 'YSKX3_000057', 'YSKX3_000058', 'YSKX3_000059', 'YSKX3_000060', 'YSKX3_000061', 'YSKX3_000062', 'YSKX3_000063', 'YSKX3_000064', 'YSKX3_000065', 'YSKX3_000066', 'YSKX3_000067', 'YSKX3_000068', 'YSKX3_000069', 'YSKX3_000070', 'YSKX3_000071', 'YSKX3_000072', 'YSKX3_000073', 'YSKX3_000074', 'YSKX3_000075', 'YSKX3_000076', 'YSKX3_000077', 'YSKX3_000078', 'YSKX3_000079', 'YSKX3_000080', 'YSKX3_000081', 'YSKX3_000082', 'YSKX3_000083', 'YSKX3_000084', 'YSKX3_000085', 'YSKX3_000086', 'YSKX3_000087', 'YSKX3_000088', 'YSKX3_000089', 'YSKX3_000090', 'YSKX3_000091', 'YSKX3_000092', 'YSKX3_000093', 'YSKX3_000094', 'YSKX3_000095', 'YSKX3_000096', 'YSKX3_000097', 'YSKX3_000098', 'YSKX3_000099', 'YSKX3_000100', 'YSKX3_000101', 'YSKX3_000102', 'YSKX3_000103', 'YSKX3_000104', 'YSKX3_000105', 'YSKX3_000106', 'YSKX3_000107', 'YSKX3_000108', 'YSKX3_000109', 'YSKX3_000110', 'YSKX3_

AttributeError: 'list' object has no attribute 'shape'

In [28]:
print(clip_data.shape, clip_label['target'].shape, clip_label['video_path'])
clip_data_frame = clip_data.squeeze().squeeze().numpy()
clip_label_frame = clip_label['target'].squeeze().numpy()
print(clip_data_frame.shape, clip_label_frame.shape)

torch.Size([1, 10, 3, 32, 224, 224]) torch.Size([1, 157]) ['YSKX3']
(10, 3, 32, 224, 224) (157,)


In [36]:
np.all(stream_label_frame == clip_label_frame)

True

In [37]:
np.all(stream_data_frame == clip_data_frame)

True

In [7]:
# Now loading all frames from stream and manually selecting the 10 clips

stream_data, stream_label, clips_path = eval_stream(
    map_file, root_data_path, pretrained_weights, arch, backbone, baseline, mode, subset, dataset, sample_frames, workers)

In [15]:
# print(clips_path)
print(stream_data[0].shape, stream_label[0].shape)
stream_data_cat = torch.cat(stream_data, axis=1).squeeze().numpy()
stream_label_cat = torch.cat(stream_label, axis=1).squeeze().numpy()
print(stream_data_cat.shape, stream_label_cat.shape)

torch.Size([1, 90, 3, 32, 224, 224]) torch.Size([1, 90, 157])
(229, 3, 32, 224, 224) (229, 157)


In [18]:
def select_ten_clips(video_ids, video_classes):
    expanded_sample_length = int(video_ids[0].split('_')[1])
    num_frames = int(video_ids[-1].split('_')[1])
    
    sample_start_pos = np.linspace(expanded_sample_length, num_frames, 10, dtype=int)
    ids = sample_start_pos - expanded_sample_length
    print(ids)
    
    return video_ids[ids], video_classes[ids]

In [21]:
stream_clip_ids, stream_clip_data = select_ten_clips(np.array(clips_path), stream_label_cat)

[  0  25  50  76 101 126 152 177 202 228]


In [22]:
stream_clip_data.shape

(10, 157)

In [30]:
stream_target = (np.sum(stream_clip_data, axis=0) > 0).astype(int)
np.all(stream_target == clip_label_frame)

True