In [1]:
from rekall import Interval, IntervalSet, IntervalSetMapping, Bounds3D
from rekall.predicates import *
from rekall.stdlib import ingest
from vgrid import VGridSpec, VideoMetadata, VideoBlockFormat, FlatFormat
from vgrid import SpatialType_Bbox, SpatialType_Caption, Metadata_Generic
from vgrid_jupyter import VGridWidget
import urllib3, requests, os
import math

In [2]:
urllib3.disable_warnings()

# Video Metadata

In [3]:
VIDEO_COLLECTION_BASEURL = "http://olimar.stanford.edu/hdd/tvnews-sandbox"
VIDEO_ENDPOINT = "http://olimar.stanford.edu/hdd/tvnews-sandbox/videos"
VIDEO_METADATA_FILENAME = "data/video_meta_sandbox.json"

In [4]:
req = requests.get(os.path.join(VIDEO_COLLECTION_BASEURL, VIDEO_METADATA_FILENAME), verify=False)
video_collection = req.json()

In [5]:
video_metadata = [
    VideoMetadata(v["path"], v["id"], v["fps"], int(v["num_frames"]), v["width"], v["height"])
    for v in video_collection
]

# Dev, Test Splits

In [6]:
dev_set = [38275, 42756, 52945, 34642, 19959, 37170, 55711, 45698, 20380, 3952,
           20450, 52749, 13927, 16215, 57384, 8859, 41725, 10323, 33541, 38420,
           23184, 19882, 17458, 34359]
test_set = [54377, 26386, 5281, 763, 9499, 24847, 13247, 29001, 9480, 9215, 27188,
            13058, 32996, 6185, 36755, 13993, 4143, 3730, 15916, 529, 11579, 48140,
            41480, 16693]

In [7]:
print(len(dev_set + test_set))

48


# Load GT Annotations (Dev)

In [8]:
def filter_by_id(ism, valid_ids):
    return IntervalSetMapping({
        vid: ism.get_grouped_intervals()[vid]
        for vid in list(ism.get_grouped_intervals().keys()) if vid in valid_ids
    })

In [9]:
def load_json(video_baseurl, json_path):
    req = requests.get(os.path.join(video_baseurl, json_path), verify=False)
    json_objs = req.json()
    ism = ingest.ism_from_iterable_with_schema_bounds3D(
        json_objs,
        ingest.getter_accessor,
        {
            'key': 'video_id',
            't1': 'start',
            't2': 'end'
        },
        with_payload = lambda item: item,
        progress = True
    )
    return ism

In [10]:
PANELS_JSON = "data/panels.json"

In [11]:
panels = load_json(VIDEO_COLLECTION_BASEURL, PANELS_JSON)

100%|██████████| 135/135 [00:00<00:00, 227219.52it/s]


# Load Faces

In [12]:
video_meta_by_id = {
    vm.id: vm
    for vm in video_metadata
}
FACES_JSON = "data/face_dump.json"
req = requests.get(os.path.join(VIDEO_COLLECTION_BASEURL, FACES_JSON), verify=False)
faces_json = req.json()

In [13]:
faces_ism = ingest.ism_from_iterable_with_schema_bounds3D(
    faces_json,
    ingest.getter_accessor,
    {
        'key': 'video_id',
        't1': 'frame_number',
        't2': 'frame_number',
        'x1': 'x1',
        'x2': 'x2',
        'y1': 'y1',
        'y2': 'y2'
    },
    with_payload = lambda item: {
        'face': item,
        'video': video_meta_by_id[item['video_id']]
    },
    progress = True
).filter(
    lambda intrvl: (intrvl['t1'] % math.floor(intrvl['payload']['video'].fps * 3)) == 0
).map(
    lambda face: Interval(
        Bounds3D(
            face['t1'] / face['payload']['video'].fps - 1.5,
            face['t2'] / face['payload']['video'].fps + 1.5,
            face['x1'],
            face['x2'],
            face['y1'],
            face['y2']
        ),
        face['payload']['face']
    )
)

100%|██████████| 218006/218006 [00:01<00:00, 117874.16it/s]


In [14]:
hosts_ism = ingest.ism_from_iterable_with_schema_bounds3D(
    faces_json,
    ingest.getter_accessor,
    {
        'key': 'video_id',
        't1': 'frame_number',
        't2': 'frame_number',
        'x1': 'x1',
        'x2': 'x2',
        'y1': 'y1',
        'y2': 'y2'
    },
    with_payload = lambda item: {
        'face': item,
        'video': video_meta_by_id[item['video_id']]
    },
    progress = True
).filter(
    lambda intrvl: intrvl['payload']['face']['is_host']
).map(
    lambda face: Interval(
        Bounds3D(
            face['t1'] / face['payload']['video'].fps - 1.5,
            face['t2'] / face['payload']['video'].fps + 1.5,
            face['x1'],
            face['x2'],
            face['y1'],
            face['y2']
        ),
        face['payload']['face']
    )
)

100%|██████████| 218006/218006 [00:02<00:00, 89462.25it/s] 


# Load Captions

In [15]:
CAPTIONS_JSON = "data/captions.json"

In [16]:
def load_json(video_baseurl, json_path):
    req = requests.get(os.path.join(video_baseurl, json_path), verify=False)
    json_objs = req.json()
    ism = ingest.ism_from_iterable_with_schema_bounds3D(
        json_objs,
        ingest.getter_accessor,
        {
            'key': 'video_id',
            't1': 'start',
            't2': 'end'
        },
        with_payload = lambda item: item,
        progress = True
    )
    return ism

In [17]:
captions = load_json(VIDEO_COLLECTION_BASEURL, CAPTIONS_JSON).map(
    lambda caption: Interval(caption['bounds'], caption['payload']['caption'])
)

100%|██████████| 836546/836546 [00:04<00:00, 182318.41it/s]


In [18]:
def for_vgrid(caption_intrvl):
    return Interval(
        caption_intrvl['bounds'],
        {
            'spatial_type': SpatialType_Caption(caption_intrvl['payload']),
            'metadata': {}
        }
    )

# Evaluate Precision/Recall/F1

In [19]:
vm_by_video = {
    video_id: [vm for vm in video_metadata if vm.id == video_id][0]
    for video_id in dev_set + test_set
}

In [20]:
def frame_second_conversion(c, mode='f2s'):
    def second_to_frame(fps):
        def map_fn(intrvl):
            i2 = intrvl.copy()
            curr_bounds = intrvl['bounds'].copy()
            curr_bounds['t1'] = int(curr_bounds['t1']*fps)
            curr_bounds['t2'] = int(curr_bounds['t2']*fps)
            i2['bounds'] = curr_bounds
            return i2
        return map_fn
    
    def frame_to_second(fps):
        def map_fn(intrvl):
            i2 = intrvl.copy()
            curr_bounds = intrvl['bounds'].copy()
            curr_bounds['t1'] = int(curr_bounds['t1']/fps)
            curr_bounds['t2'] = int(curr_bounds['t2']/fps)
            i2['bounds'] = curr_bounds
            return i2
        return map_fn
    
    if mode=='f2s':
        fn = frame_to_second
    if mode=='s2f':
        fn = second_to_frame
    output = {}
    for vid, intervals in c.get_grouped_intervals().items():
        output[vid] = intervals.map(fn(vm_by_video[vid].fps))
    return IntervalSetMapping(output)

def frame_to_second_collection(c):
    return frame_second_conversion(c, 'f2s')

def second_to_frame_collection(c):
    return frame_second_conversion(c, 's2f')

In [21]:
interval = 30

In [22]:
segs_dict = {}
for video_id in dev_set + test_set:
    video = vm_by_video[video_id]
    iset = IntervalSet([
        Interval(Bounds3D(i, i), video.fps)
        for i in range(0, video.num_frames) if (i % (
            math.floor(video.fps * 3) * (interval / 3)
        )) == 0
    ])
    segs_dict[video_id] = iset
    
segments = frame_to_second_collection(IntervalSetMapping(segs_dict)).dilate(interval / 2)

In [23]:
segments_all_negative = segments.map(
    lambda intrvl: Interval(intrvl['bounds'], 0)
)

In [24]:
def filter_by_id(ism, valid_ids):
    return IntervalSetMapping({
        vid: ism.get_grouped_intervals()[vid]
        for vid in list(ism.get_grouped_intervals().keys()) if vid in valid_ids
    })

In [25]:
panel_segments = segments.filter_against(
    panels, predicate = overlaps()
).map(
    lambda intrvl: Interval(intrvl['bounds'], 1)
)

panel_labels = segments_all_negative.minus(
    panel_segments
).union(panel_segments)

print(panel_segments.size())
print(panel_labels.size())

{41480: 25, 48140: 7, 52749: 26, 529: 29, 38420: 23, 57384: 7, 6185: 46, 4143: 25, 17458: 37, 27188: 37, 34359: 18, 13927: 25, 54377: 21, 23184: 8, 8859: 23, 5281: 27, 13993: 12, 52945: 21, 32996: 29, 763: 28, 41725: 7, 13058: 11, 42756: 25, 9480: 13, 26386: 21, 9499: 69, 37170: 11, 16693: 11, 29001: 14, 3952: 44, 38275: 19, 36755: 8, 19882: 30, 20450: 12, 9215: 12}
{13058: 124, 45698: 124, 38275: 123, 42756: 124, 33541: 123, 9480: 123, 41480: 123, 48140: 123, 52749: 245, 24847: 123, 23184: 123, 529: 123, 26386: 123, 36755: 123, 38420: 124, 3730: 485, 9499: 366, 8859: 123, 20380: 124, 55711: 124, 5281: 123, 57384: 123, 6185: 123, 19882: 124, 13993: 123, 15916: 123, 4143: 123, 37170: 124, 17458: 123, 27188: 124, 16693: 123, 34359: 124, 11579: 123, 13247: 124, 29001: 124, 52945: 124, 34642: 124, 10323: 123, 16215: 63, 20450: 123, 32996: 123, 13927: 124, 54377: 124, 3952: 123, 19959: 124, 763: 124, 41725: 123, 9215: 123}


In [26]:
def evaluate_preds(predictions, panel_labels, video_ids):
    predictions = filter_by_id(predictions, video_ids)
    panel_labels = filter_by_id(panel_labels, video_ids)
    
    prediction_segments = segments.filter_against(
        predictions,
        predicate = overlaps()
    ).map(lambda intrvl: Interval(intrvl['bounds'], 1))

    prediction_labels = segments_all_negative.minus(
        prediction_segments
    ).union(prediction_segments)

    prediction_scores = prediction_labels.join(
        panel_labels,
        predicate = equal(),
        merge_op = lambda i1, i2: Interval(
            i1['bounds'],
            'tp' if i1['payload'] == i2['payload'] and i1['payload'] == 1 else
            'tn' if i1['payload'] == i2['payload'] and i1['payload'] == 0 else
            'fp' if i1['payload'] != i2['payload'] and i1['payload'] == 1 else
            'fn'
        )
    )
    
    def precision_recall_f1(pred_labels):
        def sum_values(obj):
            return sum([v for v in list(obj.values())])
        tp = sum_values(pred_labels.filter(payload_satisfies(lambda p: p == 'tp')).size())
        tn = sum_values(pred_labels.filter(payload_satisfies(lambda p: p == 'tn')).size())
        fp = sum_values(pred_labels.filter(payload_satisfies(lambda p: p == 'fp')).size())
        fn = sum_values(pred_labels.filter(payload_satisfies(lambda p: p == 'fn')).size())

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * precision * recall / (precision + recall)

        return (precision, recall, f1, tp, tn, fp, fn)
    
    return precision_recall_f1(prediction_scores)

In [27]:
evaluate_preds(filter_by_id(hosts_ism, dev_set), panel_labels, dev_set)

(0.15136054421768708,
 0.5933333333333334,
 0.24119241192411928,
 178,
 1482,
 998,
 122)

# Develop Query

In [28]:
faces_dev = filter_by_id(faces_ism, dev_set)
hosts_dev = filter_by_id(hosts_ism, dev_set)
captions_dev = filter_by_id(captions, dev_set)
panels_dev = filter_by_id(panels, dev_set)

In [None]:
vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = VideoBlockFormat(imaps = [
        ('panels', panels_dev),
        ('faces', faces_dev.filter(
            lambda intrvl: intrvl['payload']['labeler'] == 'mtcnn'
        )),
        ('hosts', hosts_dev),
        ('panel text', captions_dev.filter(
            lambda intrvl: ('panel' in intrvl['payload'].lower() or
                            'panelist' in intrvl['payload'].lower()
                           )
        ).dilate(10).map(
            lambda intrvl: Interval(
                intrvl['bounds'],
                { 'spatial_type': SpatialType_Bbox(), 'metadata': {
                    'caption': Metadata_Generic(intrvl['payload'])
                } }
            )
        )),
        ('panel text2', captions_dev.filter(
            lambda intrvl: ('bring' in intrvl['payload'].lower() and
                            'bringing' not in intrvl['payload'].lower())
        ).join(
            captions_dev.filter(
                lambda intrvl: ('in' == intrvl['payload'].lower().strip())
            ),
            predicate = before(max_dist=5),
            merge_op = lambda i1, i2: i1,
            window = 0.0
        ).dilate(10).map(
            lambda intrvl: Interval(
                intrvl['bounds'],
                { 'spatial_type': SpatialType_Bbox(), 'metadata': {
                    'caption': Metadata_Generic(intrvl['payload'])
                } }
            )
        )),
        ('panel text3', captions_dev.filter(
            lambda intrvl: ('joining' in intrvl['payload'].lower())
        ).dilate(10).map(
            lambda intrvl: Interval(
                intrvl['bounds'],
                { 'spatial_type': SpatialType_Bbox(), 'metadata': {
                    'caption': Metadata_Generic(intrvl['payload'])
                } }
            )
        )),
        ('panel text4', captions_dev.filter(
            lambda intrvl: ('in' in intrvl['payload'].lower())
        ).dilate(10).map(
            lambda intrvl: Interval(
                intrvl['bounds'],
                { 'spatial_type': SpatialType_Bbox(), 'metadata': {
                    'caption': Metadata_Generic(intrvl['payload'])
                } }
            )
        )),
        ('_captions', captions_dev.coalesce(
            ('t1', 't2'),
            Bounds3D.span,
            lambda p1, p2: p1 + ' ' + p2,
            predicate = lambda i1, i2: '>>' not in i2['payload'],
            epsilon = 1.0
        ).map(for_vgrid))
    ]),
    video_endpoint = os.path.join(VIDEO_COLLECTION_BASEURL, 'videos')
)
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

### Signal from the transcripts: "panel" or "panelist" or "bring" near "in"

In [30]:
evaluate_preds(
    captions_dev.filter(
        lambda intrvl: ('panel' in intrvl['payload'].lower() or
                        'panelist' in intrvl['payload'].lower())
    ).union(
        captions_dev.filter(
            lambda intrvl: ('bring' in intrvl['payload'].lower() and
                            'bringing' not in intrvl['payload'].lower())
        ).join(
            captions_dev.filter(
                lambda intrvl: ('in' == intrvl['payload'].lower().strip())
            ),
            predicate = overlaps(),
            merge_op = lambda i1, i2: i1,
            window = 0.0
        )
    ),
    panel_labels, dev_set)

(0.5555555555555556,
 0.029940119760479042,
 0.05681818181818182,
 10,
 2667,
 8,
 324)

In [None]:
vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = VideoBlockFormat(imaps = [
        ('panels', panels_dev),
        ('hosts', hosts_dev.coalesce(
            ('t1', 't2'),
            Bounds3D.span,
            epsilon = 120.0
        ).filter_size(min_size=60, max_size=1800)),
        ('panel text', captions_dev.filter(
            lambda intrvl: ('panel' in intrvl['payload'].lower() or
                            'panelist' in intrvl['payload'].lower() or
                            'thank' in intrvl['payload'].lower())
        ))
    ]),
    video_endpoint = os.path.join(VIDEO_COLLECTION_BASEURL, 'videos')
)
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

In [31]:
host_segment_with_panel_text = hosts_dev.coalesce(
    ('t1', 't2'),
    Bounds3D.span,
    epsilon = 120.0
).filter_size(
    min_size=60, max_size=1800
).filter_against(
    captions_dev.filter(
        lambda intrvl: ('panel' in intrvl['payload'].lower() or
                        'panelist' in intrvl['payload'].lower())
    ).union(
        captions_dev.filter(
            lambda intrvl: ('bring' in intrvl['payload'].lower() and
                            'bringing' not in intrvl['payload'].lower())
        ).join(
            captions_dev.filter(
                lambda intrvl: ('in' == intrvl['payload'].lower().strip())
            ),
            predicate = overlaps(),
            merge_op = lambda i1, i2: i1,
            window = 0.0
        )
    ),
    predicate = or_pred(overlaps(), before(max_dist=10.0), after(max_dist=10.0))
)

In [None]:
vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = VideoBlockFormat(imaps = [
        ('panels', panels_dev),
        ('panel_query', host_segment_with_panel_text)
    ]),
    video_endpoint = os.path.join(VIDEO_COLLECTION_BASEURL, 'videos')
)
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

In [32]:
evaluate_preds(filter_by_id(host_segment_with_panel_text, dev_set), panel_labels, dev_set)

(0.6451612903225806,
 0.2994011976047904,
 0.40899795501022485,
 100,
 2621,
 55,
 234)

### Signal from the number of faces on screen

In [33]:
faces_by_frame = faces_dev.filter(
    lambda intrvl: intrvl['payload']['labeler'] == 'mtcnn'
).group_by_axis(('t1', 't2'), Bounds3D(0, 1, 0, 1, 0, 1))

In [None]:
vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = VideoBlockFormat(imaps = [
        ('panels', panels_dev),
        ('panel_query', host_segment_with_panel_text),
        ('two faces', faces_by_frame.filter(lambda intrvl: len(intrvl['payload']) >= 2)),
        ('three faces', faces_by_frame.filter(lambda intrvl: len(intrvl['payload']) >= 3))
    ]),
    video_endpoint = os.path.join(VIDEO_COLLECTION_BASEURL, 'videos')
)
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

In [34]:
three_face_segments = faces_by_frame.filter(
    lambda intrvl: len(intrvl['payload']) >= 3
).coalesce(
    ('t1', 't2'),
    Bounds3D.span,
    epsilon = 60.0
)#.filter_size(min_size=180)

In [None]:
vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = VideoBlockFormat(imaps = [
        ('panels', panels_dev),
        ('panel_query', host_segment_with_panel_text),
        ('hosts', hosts_dev.coalesce(
            ('t1', 't2'),
            Bounds3D.span,
            epsilon = 120.0
        ).filter_size(min_size=60, max_size=1800)),
        ('three faces', three_face_segments)
    ]),
    video_endpoint = os.path.join(VIDEO_COLLECTION_BASEURL, 'videos')
)
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

In [None]:
vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = VideoBlockFormat(imaps = [
        ('panels', panels_dev),
        ('panel_query', host_segment_with_panel_text),
        ('three faces', three_face_segments),
        ('panels_query2', host_segment_with_panel_text.union(three_face_segments))
    ]),
    video_endpoint = os.path.join(VIDEO_COLLECTION_BASEURL, 'videos')
)
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

In [35]:
evaluate_preds(host_segment_with_panel_text.union(three_face_segments), panel_labels, dev_set)

(0.2541766109785203,
 0.6513761467889908,
 0.36566523605150214,
 213,
 1947,
 625,
 114)

In [None]:
vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = VideoBlockFormat(imaps = [
        ('panels', panels_dev),
        ('_faces', faces_dev.filter(
            lambda intrvl: intrvl['payload']['labeler'] == 'mtcnn'
        )),
        ('panel_query', host_segment_with_panel_text),
        ('hosts', hosts_dev.coalesce(
            ('t1', 't2'),
            Bounds3D.span,
            epsilon = 180.0
        ).filter_size(min_size=60, max_size=1800)),
        ('three faces', three_face_segments),
        ('panels_query2', host_segment_with_panel_text.union(
            three_face_segments.filter_size(min_size=180)))
    ]),
    video_endpoint = os.path.join(VIDEO_COLLECTION_BASEURL, 'videos')
)
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

In [36]:
evaluate_preds(
    host_segment_with_panel_text.union(three_face_segments.filter_size(min_size=180)),
    panel_labels, dev_set)

(0.5061728395061729,
 0.49101796407185627,
 0.4984802431610942,
 164,
 2514,
 160,
 170)

### Signal from faces on screen in certain position

In [37]:
def get_range(lst):
    return max(lst) - min(lst)

In [38]:
exactly_three_face_segments = faces_dev.filter(
    lambda intrvl: intrvl['payload']['labeler'] == 'mtcnn' and intrvl['payload']['score'] > 0.7
).group_by_axis(
    ('t1', 't2'), Bounds3D(0, 1, 0, 1, 0, 1)
).filter(
    lambda intrvl: (
        len(intrvl['payload']) == 3 and 
        get_range([face['y1'] for face in intrvl['payload'].get_intervals()]) < 0.1
    )
)

In [None]:
vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = VideoBlockFormat(imaps = [
        ('panels', panels_dev),
        ('_faces', faces_dev.filter(
            lambda intrvl: intrvl['payload']['labeler'] == 'mtcnn'
        )),
        ('three faces', exactly_three_face_segments.coalesce(
            ('t1', 't2'),
            Bounds3D.span,
            epsilon = 180.0
        ).filter_size(min_size=60, max_size=1800)),
        ('panels_query2', host_segment_with_panel_text.union(
            three_face_segments.filter_size(min_size=180)))
    ]),
    video_endpoint = os.path.join(VIDEO_COLLECTION_BASEURL, 'videos')
)
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

In [39]:
evaluate_preds(
    host_segment_with_panel_text.union(
        three_face_segments.filter_size(min_size=180)
    ).union(
        exactly_three_face_segments.coalesce(
            ('t1', 't2'),
            Bounds3D.span,
            epsilon = 180.0
        ).filter_size(min_size=60)
    ),
    panel_labels, dev_set)

(0.4166666666666667,
 0.6756756756756757,
 0.5154639175257731,
 225,
 2352,
 315,
 108)

In [40]:
panels_query3 = host_segment_with_panel_text.union(
    three_face_segments.filter_size(min_size=180)
).union(
    exactly_three_face_segments.coalesce(
        ('t1', 't2'),
        Bounds3D.span,
        epsilon = 180.0
    ).filter_size(min_size=60)
)

In [None]:
vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = VideoBlockFormat(imaps = [
        ('panels', panels_dev),
        ('_faces', faces_dev.filter(
            lambda intrvl: intrvl['payload']['labeler'] == 'mtcnn'
        )),
        ('panels_query3', panels_query3)
    ]),
    video_endpoint = os.path.join(VIDEO_COLLECTION_BASEURL, 'videos')
)
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

In [41]:
query_three_faces = three_face_segments.filter_size(min_size=180).union(
    exactly_three_face_segments.coalesce(
        ('t1', 't2'),
        Bounds3D.span,
        epsilon = 180.0
    ).filter_size(min_size=60)
)

In [None]:
vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = VideoBlockFormat(imaps = [
        ('panels', panels_dev),
        ('_faces', faces_dev.filter(
            lambda intrvl: intrvl['payload']['labeler'] == 'mtcnn'
        )),
        ('three_faces_loose', three_face_segments.filter_size(min_size=180)),
        ('three_faces_strict',
            exactly_three_face_segments.coalesce(
                ('t1', 't2'),
                Bounds3D.span,
                epsilon = 180.0
            ).filter_size(min_size=60)),
        ('three_faces', query_three_faces),
        ('panels_query3', panels_query3)
    ]),
    video_endpoint = os.path.join(VIDEO_COLLECTION_BASEURL, 'videos')
)
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

### Ignore presidential candidates

In [42]:
presidential_candidates = [
    'donald trump',
    'hillary clinton',
    'bernie sanders',
    'barack obama'
]

In [43]:
def not_presidential_candidate(face):
    for identity in face['identities']:
        if identity['score'] > .75 and identity['identity'] in presidential_candidates:
            return False
    return True

In [44]:
faces_dev[38275].get_intervals()[0]

<Interval t1:-1.5 t2:1.5 x1:0.260848045349121 x2:0.563493549823761 y1:0.15109895169735 y2:0.760732710361481 payload:{'genders': [{'labeler': 'knn-gender', 'gender': 'M', 'score': 1.0}, {'labeler': 'rudecarnie', 'gender': 'M', 'score': 0.999956130981445}], 'labeler': 'mtcnn', 'y2': 0.760732710361481, 'score': 1.0, 'frame_number': 0, 'x2': 0.563493549823761, 'id': 307400923, 'video_id': 38275, 'y1': 0.15109895169735, 'is_host': False, 'x1': 0.260848045349121, 'identities': [{'labeler': 'face-identity-rekognition', 'identity': 'géza m. tóth', 'score': 0.61}]}>

In [45]:
exactly_three_face_segments_non_pres = faces_dev.filter(
    lambda intrvl: (
        intrvl['payload']['labeler'] == 'mtcnn' and
        intrvl['payload']['score'] > 0.7 and
        not_presidential_candidate(intrvl['payload'])
    )
).group_by_axis(
    ('t1', 't2'), Bounds3D(0, 1, 0, 1, 0, 1)
).filter(
    lambda intrvl: (
        len(intrvl['payload']) == 3 and 
        get_range([face['y1'] for face in intrvl['payload'].get_intervals()]) < 0.1
    )
)

In [46]:
query_three_faces2 = three_face_segments.filter_size(min_size=180).union(
    exactly_three_face_segments_non_pres.coalesce(
        ('t1', 't2'),
        Bounds3D.span,
        epsilon = 180.0
    ).filter_size(min_size=60)
)

In [47]:
evaluate_preds(
    host_segment_with_panel_text.union(query_three_faces2),
    panel_labels, dev_set)

(0.4376237623762376,
 0.6636636636636637,
 0.5274463007159904,
 221,
 2385,
 284,
 112)

### Look for long segments where multiple people appear contiguously

In [48]:
def get_likeliest_identity(face):
    top_identity = None
    top_identity_score = 0
    
    for identity in face['identities']:
        if 'rekognition' not in identity['labeler']:
            continue
        if identity['score'] > top_identity_score and identity['score'] > .9:
            top_identity = identity['identity']
            top_identity_score = identity['score']
            
    return top_identity

In [49]:
def has_amazon_identities(face):
    for identity in face['identities']:
        if 'rekognition' in identity['labeler'] and identity['score'] > .9:
            return True
    return False

In [50]:
faces_same_person = faces_dev.filter(
    lambda intrvl: (
        intrvl['y2'] - intrvl['y1'] > .25 and
        has_amazon_identities(intrvl['payload']) and
        get_likeliest_identity(intrvl['payload']) not in presidential_candidates
    )
).coalesce(
    ('t1', 't2'),
    Bounds3D.span,
    predicate = lambda f1, f2:
        get_likeliest_identity(f1['payload']) == get_likeliest_identity(f2['payload']),
    epsilon = 180
)

In [51]:
faces_same_person.size()

{19882: 23,
 34642: 24,
 45698: 36,
 8859: 30,
 34359: 36,
 57384: 24,
 38420: 31,
 17458: 33,
 41725: 38,
 52749: 69,
 3952: 43,
 52945: 17,
 23184: 44,
 19959: 8,
 20450: 30,
 33541: 37,
 20380: 33,
 38275: 27,
 42756: 31,
 55711: 36,
 16215: 21,
 13927: 46,
 10323: 31,
 37170: 42}

In [None]:
vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = VideoBlockFormat(imaps = [
        ('panels', panels_dev),
        ('_faces', faces_dev.filter(
            lambda intrvl: intrvl['payload']['labeler'] == 'mtcnn'
        )),
        ('faces_same_person', faces_same_person.filter_size(min_size=60))
    ]),
    video_endpoint = os.path.join(VIDEO_COLLECTION_BASEURL, 'videos')
)
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

In [52]:
two_faces_same_person = faces_same_person.filter_size(
    min_size=60
).join(
    faces_same_person.filter_size(min_size=60),
    predicate = and_pred(
        overlaps(),
        lambda f1, f2:
            get_likeliest_identity(f1['payload']) != get_likeliest_identity(f2['payload']),
    ),
    merge_op = lambda f1, f2: Interval(
        Bounds3D.intersect_time_span_space(f1['bounds'], f2['bounds']),
        (f1['payload'], f2['payload'])
    )
).filter_size(min_size=60)

In [53]:
two_faces_same_person.size()

{45698: 10,
 38275: 6,
 42756: 14,
 33541: 18,
 19959: 2,
 52749: 2,
 23184: 8,
 52945: 16,
 34642: 14,
 10323: 12,
 38420: 10,
 8859: 8,
 20380: 8,
 55711: 6,
 20450: 8,
 13927: 36,
 57384: 8,
 19882: 8,
 3952: 28,
 17458: 4,
 37170: 16,
 34359: 10,
 41725: 14}

In [54]:
three_faces_same_person = two_faces_same_person.join(
    faces_same_person.filter_size(min_size=60),
    predicate = and_pred(
        overlaps(),
        lambda f1, f2: (
            get_likeliest_identity(f1['payload'][0]) != get_likeliest_identity(f2['payload']) and
            get_likeliest_identity(f1['payload'][1]) != get_likeliest_identity(f2['payload'])
        ),
    ),
    merge_op = lambda f1, f2: Interval(
        Bounds3D.intersect_time_span_space(f1['bounds'], f2['bounds']),
        (f1['payload'][0], f1['payload'][1], f2['payload'])
    )
).filter_size(min_size=60)

In [55]:
three_faces_same_person.size()

{45698: 6,
 42756: 6,
 33541: 6,
 52945: 24,
 34642: 12,
 13927: 36,
 3952: 18,
 37170: 6,
 41725: 6}

In [None]:
vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = VideoBlockFormat(imaps = [
        ('panels', panels_dev),
        ('_faces', faces_dev.filter(
            lambda intrvl: intrvl['payload']['labeler'] == 'mtcnn'
        )),
        ('faces_same_person', faces_same_person.filter_size(min_size=60).map(
            lambda intrvl: Interval(
                intrvl['bounds'],
                { 'spatial_type': SpatialType_Bbox(), 'metadata': {
                    'faces1': Metadata_Generic(get_likeliest_identity(intrvl['payload']))
                } }
            )
        )),
        ('two_faces_same_person', two_faces_same_person.filter_size(min_size=60).map(
            lambda intrvl: Interval(
                intrvl['bounds'],
                { 'spatial_type': SpatialType_Bbox(), 'metadata': {
                    'faces2': Metadata_Generic((
                        get_likeliest_identity(intrvl['payload'][0]),
                        get_likeliest_identity(intrvl['payload'][1])
                    ))
                } }
            )
        )),
        ('three_faces_same_person', three_faces_same_person.filter_size(min_size=60).map(
            lambda intrvl: Interval(
                intrvl['bounds'],
                { 'spatial_type': SpatialType_Bbox(), 'metadata': {
                    'faces3': Metadata_Generic((
                        get_likeliest_identity(intrvl['payload'][0]),
                        get_likeliest_identity(intrvl['payload'][1]),
                        get_likeliest_identity(intrvl['payload'][2])
                    ))
                } }
            )
        )),
        ('query', host_segment_with_panel_text.union(query_three_faces2))
    ]),
    video_endpoint = os.path.join(VIDEO_COLLECTION_BASEURL, 'videos')
)
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

In [56]:
evaluate_preds(
    host_segment_with_panel_text.union(query_three_faces2).union(three_faces_same_person),
    panel_labels, dev_set)

(0.4489795918367347,
 0.7245508982035929,
 0.5544100801832761,
 242,
 2374,
 297,
 92)

In [None]:
vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = VideoBlockFormat(imaps = [
        ('panels', panels_dev),
        ('_faces', faces_dev.filter(
            lambda intrvl: intrvl['payload']['labeler'] == 'mtcnn'
        )),
        ('query', host_segment_with_panel_text.union(
            query_three_faces2
        ).union(
            three_faces_same_person
        ))
    ]),
    video_endpoint = os.path.join(VIDEO_COLLECTION_BASEURL, 'videos')
)
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

# Evaluate on test

In [57]:
faces_test = filter_by_id(faces_ism, test_set)
hosts_test = filter_by_id(hosts_ism, test_set)
captions_test = filter_by_id(captions, test_set)
panels_test = filter_by_id(panels, test_set)

In [58]:
host_segment_with_panel_text_test = hosts_test.coalesce(
    ('t1', 't2'),
    Bounds3D.span,
    epsilon = 120.0
).filter_size(
    min_size=60, max_size=1800
).filter_against(
    captions_test.filter(
        lambda intrvl: ('panel' in intrvl['payload'].lower() or
                        'panelist' in intrvl['payload'].lower())
    ).union(
        captions_dev.filter(
            lambda intrvl: ('bring' in intrvl['payload'].lower() and
                            'bringing' not in intrvl['payload'].lower())
        ).join(
            captions_dev.filter(
                lambda intrvl: ('in' == intrvl['payload'].lower().strip())
            ),
            predicate = overlaps(),
            merge_op = lambda i1, i2: i1,
            window = 0.0
        )
    ),
    predicate = or_pred(overlaps(), before(max_dist=10.0), after(max_dist=10.0))
)

In [59]:
faces_by_frame_test = faces_test.filter(
    lambda intrvl: intrvl['payload']['labeler'] == 'mtcnn'
).group_by_axis(('t1', 't2'), Bounds3D(0, 1, 0, 1, 0, 1))

In [60]:
three_face_segments_test = faces_by_frame_test.filter(
    lambda intrvl: len(intrvl['payload']) >= 3
).coalesce(
    ('t1', 't2'),
    Bounds3D.span,
    epsilon = 60.0
)#.filter_size(min_size=180)

In [61]:
exactly_three_face_segments_test = faces_test.filter(
    lambda intrvl: intrvl['payload']['labeler'] == 'mtcnn' and intrvl['payload']['score'] > 0.7
).group_by_axis(
    ('t1', 't2'), Bounds3D(0, 1, 0, 1, 0, 1)
).filter(
    lambda intrvl: (
        len(intrvl['payload']) == 3 and 
        get_range([face['y1'] for face in intrvl['payload'].get_intervals()]) < 0.1
    )
)

In [62]:
exactly_three_face_segments_non_pres_test = faces_test.filter(
    lambda intrvl: (
        intrvl['payload']['labeler'] == 'mtcnn' and
        intrvl['payload']['score'] > 0.7 and
        not_presidential_candidate(intrvl['payload'])
    )
).group_by_axis(
    ('t1', 't2'), Bounds3D(0, 1, 0, 1, 0, 1)
).filter(
    lambda intrvl: (
        len(intrvl['payload']) == 3 and 
        get_range([face['y1'] for face in intrvl['payload'].get_intervals()]) < 0.1
    )
)

In [63]:
query_three_faces_test = three_face_segments_test.filter_size(min_size=180).union(
    exactly_three_face_segments_non_pres_test.coalesce(
        ('t1', 't2'),
        Bounds3D.span,
        epsilon = 180.0
    ).filter_size(min_size=60)
)

In [64]:
faces_same_person_test = faces_test.filter(
    lambda intrvl: (
        intrvl['y2'] - intrvl['y1'] > .25 and
        has_amazon_identities(intrvl['payload']) and
        get_likeliest_identity(intrvl['payload']) not in presidential_candidates
    )
).coalesce(
    ('t1', 't2'),
    Bounds3D.span,
    predicate = lambda f1, f2:
        get_likeliest_identity(f1['payload']) == get_likeliest_identity(f2['payload']),
    epsilon = 180
)

In [65]:
two_faces_same_person_test = faces_same_person_test.filter_size(
    min_size=60
).join(
    faces_same_person_test.filter_size(min_size=60),
    predicate = and_pred(
        overlaps(),
        lambda f1, f2:
            get_likeliest_identity(f1['payload']) != get_likeliest_identity(f2['payload']),
    ),
    merge_op = lambda f1, f2: Interval(
        Bounds3D.intersect_time_span_space(f1['bounds'], f2['bounds']),
        (f1['payload'], f2['payload'])
    )
).filter_size(min_size=60)

In [66]:
three_faces_same_person_test = two_faces_same_person_test.join(
    faces_same_person_test.filter_size(min_size=60),
    predicate = and_pred(
        overlaps(),
        lambda f1, f2: (
            get_likeliest_identity(f1['payload'][0]) != get_likeliest_identity(f2['payload']) and
            get_likeliest_identity(f1['payload'][1]) != get_likeliest_identity(f2['payload'])
        ),
    ),
    merge_op = lambda f1, f2: Interval(
        Bounds3D.intersect_time_span_space(f1['bounds'], f2['bounds']),
        (f1['payload'][0], f1['payload'][1], f2['payload'])
    )
).filter_size(min_size=60)

In [67]:
evaluate_preds(
    host_segment_with_panel_text_test.union(
        query_three_faces_test
    ).union(
        three_faces_same_person_test
    ),
    panel_labels, test_set)

(0.3927789934354486,
 0.8122171945701357,
 0.5294985250737463,
 359,
 2526,
 555,
 83)