# Export all original interviews with Bernie Sanders

Run this notebook from an `esper-tv2` environment.

In [50]:
from app.models import Video, Face, FaceIdentity, LabeledInterview
from rekall import Interval, IntervalSet, IntervalSetMapping, Bounds3D
from rekall.stdlib import ingest
from rekall.predicates import *
from app.captions import *
import math
import numpy as np
import random

In [99]:
from esperlib.widget import vgrid_widget
from vgrid import VideoBlockFormat
from app.models import Video

# Preview a single video in the VGrid widget.
# NOTE(review): 9901 is a hard-coded video id -- confirm it is still the video
# this preview is meant to show.
preview_video = Video.objects.filter(id=9901).all()[0]
video_meta = [preview_video.for_vgrid()]
preview_format = VideoBlockFormat(video_meta=video_meta)
vgrid_widget(video_meta=video_meta, vis_format=preview_format)

VGridWidget(vgrid_spec={'settings': {'blocks_per_page': 50, 'key_mode': 'Jupyter', 'show_timeline': True, 'spi…

In [3]:
def get_fps_map(vids):
    """Return a dict mapping each requested video id to that video's fps.

    Args:
        vids: Collection of video ids, used in a Django ``id__in`` lookup.

    Returns:
        ``{video.id: video.fps}`` for every Video row matched by the lookup.
    """
    fps_by_id = {}
    for video in Video.objects.filter(id__in=vids):
        fps_by_id[video.id] = video.fps
    return fps_by_id

def frame_second_conversion(c, mode='f2s'):
    """Convert the temporal bounds of every interval in ``c`` between frames and seconds.

    Args:
        c: Mapping-like collection exposing ``get_grouped_intervals()``, keyed
            by video id, whose interval sets support ``map``.
        mode: ``'f2s'`` divides ``t1``/``t2`` by the video's fps (frames ->
            seconds); ``'s2f'`` multiplies them by fps (seconds -> frames).

    Returns:
        A new IntervalSetMapping with the same keys, where each interval is a
        copy whose ``bounds['t1']`` and ``bounds['t2']`` have been converted.
        Both directions truncate with ``int()``, so a round trip is lossy.

    Raises:
        ValueError: If ``mode`` is neither ``'f2s'`` nor ``'s2f'``.
    """
    # Validate first: previously an unknown mode left the converter unbound and
    # only failed (with UnboundLocalError) once a non-empty mapping was walked.
    if mode not in ('f2s', 's2f'):
        raise ValueError(
            "mode must be 'f2s' or 's2f', got {!r}".format(mode))

    def second_to_frame(fps):
        def map_fn(intrvl):
            i2 = intrvl.copy()
            curr_bounds = intrvl['bounds'].copy()
            curr_bounds['t1'] = int(curr_bounds['t1'] * fps)
            curr_bounds['t2'] = int(curr_bounds['t2'] * fps)
            i2['bounds'] = curr_bounds
            return i2
        return map_fn

    def frame_to_second(fps):
        def map_fn(intrvl):
            i2 = intrvl.copy()
            curr_bounds = intrvl['bounds'].copy()
            curr_bounds['t1'] = int(curr_bounds['t1'] / fps)
            curr_bounds['t2'] = int(curr_bounds['t2'] / fps)
            i2['bounds'] = curr_bounds
            return i2
        return map_fn

    converters = {'f2s': frame_to_second, 's2f': second_to_frame}
    make_map_fn = converters[mode]

    # Group once and reuse the result for both the fps lookup and the mapping.
    grouped = c.get_grouped_intervals()
    fps_map = get_fps_map(set(grouped.keys()))

    return IntervalSetMapping({
        vid: intervals.map(make_map_fn(fps_map[vid]))
        for vid, intervals in grouped.items()
    })

def frame_to_second_collection(c):
    """Shorthand for ``frame_second_conversion`` in frames -> seconds mode."""
    return frame_second_conversion(c, mode='f2s')

def second_to_frame_collection(c):
    """Shorthand for ``frame_second_conversion`` in seconds -> frames mode."""
    return frame_second_conversion(c, mode='s2f')

In [4]:
# Queryset over every LabeledInterview row.
# NOTE(review): the `gt_` prefix suggests these are ground-truth (hand-labeled)
# interviews -- confirm.
gt_interviews_all = LabeledInterview.objects.all()

In [5]:
def _interview_payload(row):
    """Collect the labeled-interview fields carried as each interval's payload."""
    return {
        'original': row.original,
        'guest1': row.guest1,
        'guest2': row.guest2,
        'interviewer1': row.interviewer1,
        'interviewer2': row.interviewer2
    }


# Interval bounds come from each row's `start` / `end` fields.
interview_bounds_schema = {'t1': 'start', 't2': 'end'}
interviews_ism = ingest.ism_from_django_qs(
    gt_interviews_all,
    bounds_schema=interview_bounds_schema,
    with_payload=_interview_payload,
)

In [6]:
# Display the ingested mapping to eyeball the per-video intervals and payloads.
interviews_ism

{13058: [<Interval t1:1219.0 t2:1479.0 x1:0.0 x2:1.0 y1:0.0 y2:1.0 payload:{'guest1': None, 'original': True, 'guest2': None, 'interviewer1': None, 'interviewer2': None}>, <Interval t1:2344.0 t2:2692.0 x1:0.0 x2:1.0 y1:0.0 y2:1.0 payload:{'guest1': 'john mccain', 'original': True, 'guest2': 'lindsey graham', 'interviewer1': 'jake tapper', 'interviewer2': None}>], 4611: [<Interval t1:180.0 t2:975.0 x1:0.0 x2:1.0 y1:0.0 y2:1.0 payload:{'guest1': None, 'original': True, 'guest2': None, 'interviewer1': None, 'interviewer2': None}>, <Interval t1:1012.0 t2:1683.0 x1:0.0 x2:1.0 y1:0.0 y2:1.0 payload:{'guest1': 'kellyanne conway', 'original': True, 'guest2': None, 'interviewer1': 'jake tapper', 'interviewer2': None}>], 42756: [<Interval t1:112.0 t2:750.0 x1:0.0 x2:1.0 y1:0.0 y2:1.0 payload:{'guest1': None, 'original': True, 'guest2': None, 'interviewer1': None, 'interviewer2': None}>, <Interval t1:1020.0 t2:1675.0 x1:0.0 x2:1.0 y1:0.0 y2:1.0 payload:{'guest1': 'bernie sanders', 'original': Tru

In [7]:
# Ids of every video that has at least one labeled interview, in ascending order.
interviews_by_video = interviews_ism.get_grouped_intervals()
video_ids = sorted(interviews_by_video)

In [8]:
# NOTE(review): looks like a leftover smoke-test cell; nothing else in the
# notebook depends on it, so it can probably be deleted.
print('hi')

hi


In [9]:
# Expose the joined face/frame/video columns under flat names so they can be
# referenced by the bounds schema at ingest time. Each face sits on a single
# frame, so min_frame and max_frame are the same column.
jake_annotations = {
    'min_frame': F('face__frame__number'),
    'max_frame': F('face__frame__number'),
    'video_id': F('face__frame__video_id'),
    'bbox_x1': F('face__bbox_x1'),
    'bbox_y1': F('face__bbox_y1'),
    'bbox_x2': F('face__bbox_x2'),
    'bbox_y2': F('face__bbox_y2'),
    'fps': F('face__frame__video__fps'),
}
jake_faces = FaceIdentity.objects.filter(
    identity__name='jake tapper',
    face__frame__video_id__in=video_ids,
)
jake_qs = jake_faces.annotate(**jake_annotations)

In [10]:
def _jake_frame_on_sampling_grid(intrvl):
    """Keep a face only if its frame number is a multiple of floor(payload * 3).

    The payload is mapped from the `fps` annotation in the schema below, so the
    stride is roughly three seconds' worth of frames.
    """
    stride = math.floor(intrvl['payload'] * 3)
    return (intrvl['t1'] % stride) == 0


jake_bounds_schema = {
    't1': 'min_frame',
    't2': 'max_frame',
    'x1': 'bbox_x1',
    'x2': 'bbox_x2',
    'y1': 'bbox_y1',
    'y2': 'bbox_y2',
    'payload': 'fps'
}
jake_intervals = ingest.ism_from_django_qs(
    jake_qs,
    bounds_schema=jake_bounds_schema,
).filter(_jake_frame_on_sampling_grid)

In [11]:
# Show which video ids still have at least one 'jake tapper' face interval
# after the sampling filter.
jake_intervals.get_grouped_intervals().keys()

dict_keys([50561, 13058, 4611, 42756, 24193, 41480, 38275, 23181, 24847, 23184, 529, 26386, 13827, 38420, 60186, 9499, 8220, 16542, 45472, 5281, 11555, 12837, 2648, 13993, 19882, 15916, 37170, 4143, 9480, 17458, 27188, 34359, 3769, 17983, 24992, 8859, 29001, 57804, 45645, 52945, 13141, 45655, 32472, 10335, 11792, 11003, 32996, 53355, 51175, 52075, 16879, 3952, 59122, 36755, 13556, 6185, 57592, 763, 41725, 9215])

In [12]:
# Same flat column aliases as used for the other identity query: frame number
# (as both min_frame and max_frame), video id, face bounding box, and video fps.
bernie_annotations = {
    'min_frame': F('face__frame__number'),
    'max_frame': F('face__frame__number'),
    'video_id': F('face__frame__video_id'),
    'bbox_x1': F('face__bbox_x1'),
    'bbox_y1': F('face__bbox_y1'),
    'bbox_x2': F('face__bbox_x2'),
    'bbox_y2': F('face__bbox_y2'),
    'fps': F('face__frame__video__fps'),
}
bernie_faces = FaceIdentity.objects.filter(
    identity__name='bernie sanders',
    face__frame__video_id__in=video_ids,
)
bernie_qs = bernie_faces.annotate(**bernie_annotations)

In [13]:
def _bernie_frame_on_sampling_grid(intrvl):
    """Keep a face only if its frame number is a multiple of floor(payload * 3).

    The payload is mapped from the `fps` annotation in the schema below, so the
    stride is roughly three seconds' worth of frames.
    """
    stride = math.floor(intrvl['payload'] * 3)
    return (intrvl['t1'] % stride) == 0


bernie_bounds_schema = {
    't1': 'min_frame',
    't2': 'max_frame',
    'x1': 'bbox_x1',
    'x2': 'bbox_x2',
    'y1': 'bbox_y1',
    'y2': 'bbox_y2',
    'payload': 'fps'
}
bernie_intervals = ingest.ism_from_django_qs(
    bernie_qs,
    bounds_schema=bernie_bounds_schema,
).filter(_bernie_frame_on_sampling_grid)

In [15]:
def _widget_solo_bernie_original(p):
    """True-ish for original interviews whose only guest is Bernie Sanders.

    The interviewer1 / interviewer2 fields are not constrained here.
    """
    return (
        p['guest1'] == 'bernie sanders' and
        p['guest2'] is None and
        p['original']
    )


widget_video_meta = [
    Video.objects.get(id=vid).for_vgrid()
    for vid in video_ids
]
# Timeline tracks shown in the widget. A 'jake tapper' face track is not
# included at the moment.
widget_imaps = [
    ('all interviews', interviews_ism),
    ('bernie-jake interviews', interviews_ism.filter(
        payload_satisfies(_widget_solo_bernie_original))),
    ('bernie_sanders', frame_to_second_collection(bernie_intervals).dilate(1.5)),
]
widget = vgrid_widget(
    video_meta=widget_video_meta,
    vis_format=VideoBlockFormat(imaps=widget_imaps))
widget

VGridWidget(vgrid_spec={'settings': {'blocks_per_page': 50, 'key_mode': 'Jupyter', 'show_timeline': True, 'spi…

In [101]:
def _solo_bernie_original(p):
    """True-ish for original interviews whose only guest is Bernie Sanders.

    The interviewer1 / interviewer2 fields are not constrained here.
    """
    return (
        p['guest1'] == 'bernie sanders' and
        p['guest2'] is None and
        p['original']
    )


bernie_interviews = interviews_ism.filter(
    payload_satisfies(_solo_bernie_original))

In [102]:
# Per-video total duration of the selected interviews.
# NOTE(review): presumably in seconds, matching the start/end bounds -- confirm.
bernie_interviews.duration()

{763: 569.0,
 3769: 471.0,
 5281: 581.0,
 8220: 534.0,
 9901: 331.0,
 12837: 642.0,
 13141: 522.0,
 26386: 455.0,
 33004: 494.0,
 34642: 623.0,
 38275: 471.0,
 42756: 655.0,
 50164: 2111.0,
 52075: 322.0,
 52945: 615.0,
 54377: 1442.0,
 59122: 716.0,
 59398: 977.0}

In [103]:
# Per-video duration covered by the sampled Bernie Sanders face intervals after
# converting frames to seconds and dilating each by 1.5.
frame_to_second_collection(bernie_intervals).dilate(1.5).duration()

{763: 612.0,
 2648: 90.0,
 3459: 99.0,
 3769: 471.0,
 3952: 15.0,
 5281: 618.0,
 6185: 3.0,
 7262: 3.0,
 8220: 528.0,
 8697: 297.0,
 8859: 3.0,
 9215: 3.0,
 9480: 24.0,
 9499: 180.0,
 9901: 336.0,
 10335: 195.0,
 11003: 3.0,
 11555: 15.0,
 12837: 579.0,
 13058: 3.0,
 13141: 546.0,
 13247: 204.0,
 13556: 18.0,
 13927: 15.0,
 13993: 30.0,
 16215: 51.0,
 16542: 21.0,
 17983: 3.0,
 19959: 69.0,
 20380: 15.0,
 20450: 6.0,
 23181: 18.0,
 24193: 84.0,
 24847: 75.0,
 26386: 465.0,
 27188: 42.0,
 27410: 3.0,
 29001: 129.0,
 32996: 12.0,
 33004: 477.0,
 33387: 15.0,
 34642: 675.0,
 36755: 33.0,
 37107: 6.0,
 37113: 69.0,
 37170: 75.0,
 38275: 471.0,
 40203: 3.0,
 42756: 657.0,
 45472: 51.0,
 45655: 63.0,
 45698: 120.0,
 50164: 1710.0,
 50561: 6.0,
 51175: 9.0,
 52075: 450.0,
 52749: 9.0,
 52945: 630.0,
 53684: 9.0,
 54377: 1479.0,
 55711: 363.0,
 57384: 39.0,
 57592: 33.0,
 57708: 177.0,
 57804: 411.0,
 57990: 102.0,
 59122: 681.0,
 59398: 1080.0,
 60186: 6.0}

In [104]:
# Three nested populations of video ids: videos with a solo Bernie interview,
# videos with any sampled Bernie face, and every labeled video.
bernie_interview_videos = set(bernie_interviews.get_grouped_intervals())
bernie_videos = set(bernie_intervals.get_grouped_intervals())
all_videos = set(video_ids)

In [105]:
# Sizes of the three populations, one per line.
for id_group in (bernie_interview_videos, bernie_videos, all_videos):
    print(len(id_group))

18
69
97


In [106]:
# Per-video duration of Bernie Sanders face time (frames -> seconds, each
# sample dilated by 1.5).
bernie_face_seconds = frame_to_second_collection(bernie_intervals)
bernie_durations = bernie_face_seconds.dilate(1.5).duration()

In [107]:
# Face-time durations restricted to videos that do NOT contain a solo Bernie interview.
bernie_durations_no_interviews = {
    vid: seconds
    for vid, seconds in bernie_durations.items()
    if vid not in bernie_interview_videos
}

In [108]:
# Non-interview videos where the face-time duration is at least 60.
bernie_durations_no_interviews_minute_or_longer = {
    vid: seconds
    for vid, seconds in bernie_durations_no_interviews.items()
    if seconds >= 60
}
print(len(bernie_durations_no_interviews_minute_or_longer))
bernie_durations_no_interviews_minute_or_longer

18


{2648: 90.0,
 3459: 99.0,
 8697: 297.0,
 9499: 180.0,
 10335: 195.0,
 13247: 204.0,
 19959: 69.0,
 24193: 84.0,
 24847: 75.0,
 29001: 129.0,
 37113: 69.0,
 37170: 75.0,
 45655: 63.0,
 45698: 120.0,
 55711: 363.0,
 57708: 177.0,
 57804: 411.0,
 57990: 102.0}

In [109]:
# Non-interview videos where the face-time duration is below 60.
bernie_durations_no_interviews_less_than_minute = {
    vid: seconds
    for vid, seconds in bernie_durations_no_interviews.items()
    if seconds < 60
}
print(len(bernie_durations_no_interviews_less_than_minute))
bernie_durations_no_interviews_less_than_minute

33


{3952: 15.0,
 6185: 3.0,
 7262: 3.0,
 8859: 3.0,
 9215: 3.0,
 9480: 24.0,
 11003: 3.0,
 11555: 15.0,
 13058: 3.0,
 13556: 18.0,
 13927: 15.0,
 13993: 30.0,
 16215: 51.0,
 16542: 21.0,
 17983: 3.0,
 20380: 15.0,
 20450: 6.0,
 23181: 18.0,
 27188: 42.0,
 27410: 3.0,
 32996: 12.0,
 33387: 15.0,
 36755: 33.0,
 37107: 6.0,
 40203: 3.0,
 45472: 51.0,
 50561: 6.0,
 51175: 9.0,
 52749: 9.0,
 53684: 9.0,
 57384: 39.0,
 57592: 33.0,
 60186: 6.0}

In [110]:
# Labeled videos in which no sampled Bernie Sanders face was found.
videos_no_bernie = all_videos - bernie_videos
print(len(videos_no_bernie))
videos_no_bernie

28


{529,
 3730,
 4143,
 4611,
 10323,
 11579,
 11792,
 13827,
 14482,
 15916,
 16693,
 16879,
 17458,
 19882,
 23184,
 24992,
 25463,
 31378,
 32472,
 33541,
 34359,
 38420,
 41480,
 41725,
 45645,
 48140,
 49931,
 53355}

In [111]:
# Shuffle each stratum of video ids before slicing it into train/val/test.
#
# The shuffles use a dedicated, seeded generator so that Restart & Run All
# reproduces the same order. Sorting first fixes the starting order, which
# would otherwise depend on set/dict iteration order.
# NOTE(review): the seeded order differs from the split shown in the stored
# outputs; the frozen literals in the "Export images and videos" section
# remain the split that is actually exported.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)

ids_bernie_interviews = sorted(bernie_interview_videos)
ids_bernie_minute_or_longer = sorted(
    bernie_durations_no_interviews_minute_or_longer.keys())
ids_bernie_less_than_minute = sorted(
    bernie_durations_no_interviews_less_than_minute.keys())
ids_no_bernie = sorted(videos_no_bernie)

for id_group in (ids_bernie_interviews,
                 ids_bernie_minute_or_longer,
                 ids_bernie_less_than_minute,
                 ids_no_bernie):
    split_rng.shuffle(id_group)

In [112]:
# Stratified split: from each shuffled group the first `holdout` ids go to
# validation, the next `holdout` ids go to test, and the remainder to train.
split_groups = [
    (ids_bernie_interviews, 4),
    (ids_bernie_minute_or_longer, 4),
    (ids_bernie_less_than_minute, 8),
    (ids_no_bernie, 8),
]

train_set, val_set, test_set = [], [], []
for group_ids, holdout in split_groups:
    val_set += group_ids[:holdout]
    test_set += group_ids[holdout:2 * holdout]
    train_set += group_ids[2 * holdout:]

In [113]:
# Print the training ids; the same list appears as a frozen literal in the
# "Export images and videos" section.
print(train_set)

[8220, 59398, 13141, 50164, 33004, 9901, 59122, 12837, 3769, 52075, 10335, 57990, 45655, 57804, 24193, 3459, 37113, 2648, 8697, 57708, 57592, 11003, 7262, 27410, 60186, 17983, 45472, 33387, 50561, 13556, 16542, 40203, 53684, 11555, 37107, 51175, 23181, 49931, 24992, 14482, 4611, 16879, 13827, 32472, 11792, 25463, 31378, 45645, 53355]


In [114]:
# Print the validation ids; the same list appears as a frozen literal in the
# "Export images and videos" section.
print(val_set)

[38275, 42756, 52945, 34642, 19959, 37170, 55711, 45698, 20380, 3952, 20450, 52749, 13927, 16215, 57384, 8859, 41725, 10323, 33541, 38420, 23184, 19882, 17458, 34359]


In [115]:
# Print the test ids; the same list appears as a frozen literal in the
# "Export images and videos" section.
print(test_set)

[54377, 26386, 5281, 763, 9499, 24847, 13247, 29001, 9480, 9215, 27188, 13058, 32996, 6185, 36755, 13993, 4143, 3730, 15916, 529, 11579, 48140, 41480, 16693]


# Export images and videos

In [96]:
# Frozen train/val/test split. These literals match the lists printed by the
# split cells earlier in the notebook, and they overwrite whatever those cells
# computed, so the export does not depend on re-running the shuffles.
train_set = [8220, 59398, 13141, 50164, 33004, 9901, 59122, 12837, 3769, 52075,
             10335, 57990, 45655, 57804, 24193, 3459, 37113, 2648, 8697, 57708,
             57592, 11003, 7262, 27410, 60186, 17983, 45472, 33387, 50561, 13556,
             16542, 40203, 53684, 11555, 37107, 51175, 23181, 49931, 24992, 14482,
             4611, 16879, 13827, 32472, 11792, 25463, 31378, 45645, 53355]
val_set = [38275, 42756, 52945, 34642, 19959, 37170, 55711, 45698, 20380, 3952,
           20450, 52749, 13927, 16215, 57384, 8859, 41725, 10323, 33541, 38420,
           23184, 19882, 17458, 34359]
test_set = [54377, 26386, 5281, 763, 9499, 24847, 13247, 29001, 9480, 9215, 27188,
            13058, 32996, 6185, 36755, 13993, 4143, 3730, 15916, 529, 11579, 48140,
            41480, 16693]

In [116]:
# Total number of videos across the three splits.
print(len(train_set) + len(val_set) + len(test_set))

97


## Downsample

In [98]:
# Downsampling step: one frame is sampled every floor(fps * 3) * (interval / 3)
# frames (roughly `interval` seconds apart), and each sample is later dilated
# by interval / 2 to form a segment.
interval = 30

In [76]:
# Build one zero-length interval per sampled frame for every video in the
# splits, carrying the video's fps as payload.
segs_dict = {}
for video_id in train_set + val_set + test_set:
    video = Video.objects.get(id=video_id)
    sampled_frames = []
    for i in range(video.num_frames):
        # A frame is sampled when it falls on the floor(fps * 3) * (interval / 3) grid.
        frames_per_sample = math.floor(video.fps * 3) * (interval / 3)
        if (i % frames_per_sample) == 0:
            sampled_frames.append(Interval(Bounds3D(i, i), video.fps))
    segs_dict[video_id] = IntervalSet(sampled_frames)

# Convert the sample points to seconds, then widen each by half the step.
sample_points_seconds = frame_to_second_collection(IntervalSetMapping(segs_dict))
segments = sample_points_seconds.dilate(interval / 2)

In [85]:
def _as_negative(intrvl):
    """Rebuild the interval on the same bounds with payload 0 (the negative label)."""
    return Interval(intrvl['bounds'], 0)


segments_all_negative = segments.map(_as_negative)

In [87]:
def filter_by_id(ism, valid_ids):
    """Return a new IntervalSetMapping restricted to the given video ids.

    Args:
        ism: Mapping exposing ``get_grouped_intervals()``, keyed by video id.
        valid_ids: Container of ids to keep; only ``in`` membership is used.

    Returns:
        An IntervalSetMapping holding the interval sets of ``ism`` whose key is
        in ``valid_ids``. Ids in ``valid_ids`` that ``ism`` lacks are ignored.
    """
    # Group once; the original re-called get_grouped_intervals() for the key
    # list and again for every kept key.
    grouped = ism.get_grouped_intervals()
    return IntervalSetMapping({
        vid: intervals
        for vid, intervals in grouped.items()
        if vid in valid_ids
    })

In [117]:
def _segment_solo_bernie_original(p):
    """True-ish for original interviews whose only guest is Bernie Sanders.

    The interviewer1 / interviewer2 fields are not constrained here.
    """
    return (
        p['guest1'] == 'bernie sanders' and
        p['guest2'] is None and
        p['original']
    )


def _as_positive(intrvl):
    """Rebuild the interval on the same bounds with payload 1 (the positive label)."""
    return Interval(intrvl['bounds'], 1)


# Segments that overlap a qualifying interview become positives.
qualifying_interviews = filter_by_id(interviews_ism, video_ids).filter(
    payload_satisfies(_segment_solo_bernie_original))
interview_segments = segments.filter_against(
    qualifying_interviews, predicate=overlaps()
).map(_as_positive)

# Final labels: the all-negative segments minus the positive segments,
# unioned with the positive segments.
negatives_outside_interviews = segments_all_negative.minus(interview_segments)
interview_labels = negatives_outside_interviews.union(interview_segments)

for labelled in (interview_segments, interview_labels):
    print(labelled.size())

{5281: 20, 38275: 16, 42756: 23, 12837: 23, 59398: 37, 54377: 51, 52075: 12, 33004: 19, 9901: 12, 52945: 22, 34642: 22, 50164: 80, 13141: 19, 3769: 17, 59122: 26, 26386: 16, 763: 20, 8220: 19}
{13058: 124, 4611: 123, 42756: 124, 33541: 123, 59398: 124, 50561: 123, 9480: 123, 16215: 63, 40203: 364, 48140: 123, 52749: 245, 24847: 123, 11792: 123, 529: 123, 26386: 123, 13827: 124, 38420: 124, 60186: 123, 9499: 366, 8220: 123, 11555: 123, 12837: 124, 57384: 123, 6185: 123, 15916: 123, 17458: 123, 4143: 123, 41480: 123, 37170: 124, 27188: 124, 16693: 123, 34359: 124, 11579: 123, 17983: 124, 49931: 123, 29001: 124, 45645: 124, 34642: 124, 10323: 123, 13141: 124, 45655: 124, 2648: 245, 7262: 123, 10335: 366, 38275: 123, 13927: 124, 54377: 124, 33387: 123, 57708: 124, 14482: 124, 3952: 123, 25463: 123, 27410: 124, 24193: 124, 45698: 124, 3459: 124, 57990: 124, 23181: 123, 23184: 123, 3730: 485, 36755: 123, 53355: 123, 8859: 123, 20380: 124, 16542: 123, 55711: 124, 24992: 123, 5281: 123, 13993:

## Export Videos - Don't run, too many videos

In [77]:
# Storage path of every labeled video, in the same order as `video_ids`.
paths = []
for vid in video_ids:
    paths.append(Video.objects.get(id=vid).path)

In [78]:
# Display the collected video paths.
paths

['tvnews/videos/CNNW_20161023_130000_State_of_the_Union_With_Jake_Tapper.mp4',
 'tvnews/videos/CNNW_20160327_160000_State_of_the_Union_With_Jake_Tapper.mp4',
 'tvnews/videos/CNNW_20151026_210000_Situation_Room_With_Wolf_Blitzer.mp4',
 'tvnews/videos/MSNBCW_20160414_040000_The_Rachel_Maddow_Show.mp4',
 'tvnews/videos/FOXNEWSW_20150912_100000_Fox_and_Friends_Saturday.mp4',
 'tvnews/videos/CNNW_20160221_170000_State_of_the_Union_With_Jake_Tapper.mp4',
 'tvnews/videos/CNNW_20170108_170000_State_of_the_Union_With_Jake_Tapper.mp4',
 'tvnews/videos/FOXNEWSW_20170313_220000_Special_Report_With_Bret_Baier.mp4',
 'tvnews/videos/CNNW_20161031_060000_State_of_the_Union_With_Jake_Tapper.mp4',
 'tvnews/videos/CNNW_20170205_170000_State_of_the_Union_With_Jake_Tapper.mp4',
 'tvnews/videos/CNNW_20171206_210000_The_Lead_With_Jake_Tapper.mp4',
 'tvnews/videos/FOXNEWSW_20170720_060000_Tucker_Carlson_Tonight.mp4',
 'tvnews/videos/CNNW_20170618_160000_State_of_the_Union_With_Jake_Tapper.mp4',
 'tvnews/video

In [79]:
# Write one gs:// URL per video to a text file.
with open('/app/data/video_paths.txt', 'w') as f:
    f.writelines('gs://esper/{}\n'.format(path) for path in paths)

## Now, download the videos.

From the data folder (either within Docker or from the host), run these commands:

```
mkdir -p videos
cat video_paths.txt | gsutil -m cp -I videos
```

In [32]:
import os
# Inspect the file-name part of the first path; the rename cell below builds
# its source paths from os.path.basename in the same way.
os.path.basename(paths[0])

'CNNW_20150705_130000_State_of_the_Union_With_Jake_Tapper.mp4'

In [34]:
# Now, rename the files: each downloaded video is renamed from its original
# file name to `<video id>.mp4`.
import os

# `paths` is built from `video_ids` (one path per id, in the same order), so
# the two are paired positionally. The name `selected_ids` used here before is
# not defined in the visible notebook and raises NameError on a fresh kernel.
for vid, path in zip(video_ids, paths):
    src = '/app/data/videos/{}'.format(os.path.basename(path))
    dst = '/app/data/videos/{}.mp4'.format(vid)

    os.rename(src, dst)

# Export Frames

In [80]:
import hwang, storehouse

In [81]:
# Rebuild the downsampled segments for every labeled video: one zero-length
# interval per sampled frame, carrying the video's fps as payload.
interval = 30
segs_dict = {}
for video_id in video_ids:
    video = Video.objects.get(id=video_id)
    sampled_frames = []
    for i in range(video.num_frames):
        # A frame is sampled when it falls on the floor(fps * 3) * (interval / 3) grid.
        frames_per_sample = math.floor(video.fps * 3) * (interval / 3)
        if (i % frames_per_sample) == 0:
            sampled_frames.append(Interval(Bounds3D(i, i), video.fps))
    segs_dict[video_id] = IntervalSet(sampled_frames)

# Convert the sample points to seconds, then widen each by half the step.
sample_points_seconds = frame_to_second_collection(IntervalSetMapping(segs_dict))
segments = sample_points_seconds.dilate(interval / 2)

In [82]:
# Convert the dilated segments from seconds back to frame numbers so that a
# representative frame can be picked from each one.
segments_frames = second_to_frame_collection(segments)

In [83]:
from PIL import Image
from tqdm import tqdm

In [84]:
# Export one JPEG per segment: the frame at the midpoint of each segment is
# decoded and saved as /app/data/images/<video id>/<segment index>.jpg.

# The storage backend and the grouped intervals do not depend on the video,
# so build them once instead of on every iteration.
# NOTE(review): assumes one storehouse backend can serve several
# RandomReadFile instances -- confirm.
backend = storehouse.StorageBackend.make_from_config(
    storehouse.StorageConfig.make_gcs_config(os.environ.get('BUCKET')))
frame_segments_by_video = segments_frames.get_grouped_intervals()

for video_id in tqdm(video_ids):
    video = Video.objects.get(id=video_id)
    dec = hwang.Decoder(storehouse.RandomReadFile(backend, video.path))

    # Midpoint frame of every segment of this video.
    frame_nums = [
        int((intrvl['t1'] + intrvl['t2']) / 2)
        for intrvl in frame_segments_by_video[video_id].get_intervals()
    ]

    frames = dec.retrieve(frame_nums)

    out_dir = '/app/data/images/{}'.format(video_id)
    os.makedirs(out_dir, exist_ok=True)

    for i, frame in enumerate(frames):
        im = Image.fromarray(frame)
        im.save('{}/{:04d}.jpg'.format(out_dir, i))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 97/97 [1:19:29<00:00, 44.63s/it]


# Export train/test/val splits

In [118]:
# Write one label file per split. Each line is "<video id> <segment index> <label>",
# where the label is the segment's payload (1 for segments overlapping a
# qualifying Bernie Sanders interview, 0 otherwise).
# NOTE(review): the segment index presumably lines up with the image index
# used by the frame export -- confirm the interval ordering matches.
split_dir = '/app/data/data'
os.makedirs(split_dir, exist_ok=True)

# A single data-driven loop replaces three copy-pasted write loops; the files
# and their contents are unchanged.
for split_name, split_ids in (('train', train_set),
                              ('val', val_set),
                              ('test', test_set)):
    with open('{}/{}.txt'.format(split_dir, split_name), 'w') as f:
        for video_id in split_ids:
            for i, intrvl in enumerate(interview_labels[video_id].get_intervals()):
                f.write('{} {} {}\n'.format(video_id, i, intrvl['payload']))