In [38]:
from tqdm import tqdm
import pickle

In [2]:
# Ids of the videos selected for the sandbox export, kept in ascending order.
video_ids = [
    8220, 59398, 13141, 50164, 33004, 9901, 59122, 12837, 3769, 52075,
    10335, 57990, 45655, 57804, 24193, 3459, 37113, 2648, 8697, 57708,
    57592, 11003, 7262, 27410, 60186, 17983, 45472, 33387, 50561, 13556,
    16542, 40203, 53684, 11555, 37107, 51175, 23181, 49931, 24992, 14482,
    4611, 16879, 13827, 32472, 11792, 25463, 31378, 45645, 53355,
    38275, 42756, 52945, 34642, 19959, 37170, 55711, 45698, 20380, 3952,
    20450, 52749, 13927, 16215, 57384, 8859, 41725, 10323, 33541, 38420,
    23184, 19882, 17458, 34359,
    54377, 26386, 5281, 763, 9499, 24847, 13247, 29001, 9480, 9215, 27188,
    13058, 32996, 6185, 36755, 13993, 4143, 3730, 15916, 529, 11579, 48140,
    41480, 16693,
]
video_ids.sort()

# Export videos

In [3]:
def _stored_path(video_id):
    """Return the `path` field of the Video whose id is `video_id`."""
    return Video.objects.get(id=video_id).path


# One lookup per id, in the same order as video_ids.
paths = [_stored_path(video_id) for video_id in video_ids]

In [5]:
# Save the paths as a listing file: one gs://esper/ URI per line.
with open('/app/data/video_paths_sandbox.txt', 'w') as listing:
    listing.writelines('gs://esper/{}\n'.format(path) for path in paths)

## Now, download the videos.

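From the data folder (`/app/data` inside Docker, or the corresponding folder on the host), run these commands to copy every video listed in `video_paths_sandbox.txt` into a `videos` directory: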

```
mkdir -p videos
cat video_paths_sandbox.txt | gsutil -m cp -I videos
```

# Export Video Metadata

In [4]:
# Display one Video record to see which fields it carries.
Video.objects.get(id=8220)

Video(id: 8220, path: tvnews/videos/CNNW_20170618_160000_State_of_the_Union_With_Jake_Tapper.mp4, num_frames: 219437, fps: 59.94, width: 640, height: 360, has_captions: False, time: 2017-06-18 16:00:00, commercials_labeled: True, srt_extension: cc5, threeyears_dataset: True, duplicate: False, corrupted: False)

In [5]:
import json
import os

In [6]:
def _video_summary(video):
    """Collect the fields of one Video that are kept in video_meta."""
    return dict(
        id=video.id,
        # Only the file name is kept; the directory part of the path is dropped.
        path=os.path.basename(video.path),
        num_frames=video.num_frames,
        fps=video.fps,
        width=video.width,
        height=video.height,
    )


# A single query fetches all selected videos.
video_meta = [
    _video_summary(video)
    for video in Video.objects.filter(id__in=video_ids).all()
]

In [30]:
# Serialise the list of per-video records to one JSON file.
video_meta_path = '/app/data/video_meta_sandbox.json'
with open(video_meta_path, 'w') as meta_file:
    json.dump(video_meta, meta_file)

# Export faces, with all metadata

## Load up metadata

In [7]:
from app.models import Face

In [8]:
# Extra attributes copied onto each Face from its related frame and labeler.
face_extra_fields = {
    'video_id': F('frame__video_id'),
    'frame_number': F('frame__number'),
    'labeler_name': F('labeler__name'),
}

# Every face whose frame belongs to one of the selected videos.
all_faces = (
    Face.objects
    .filter(frame__video_id__in=video_ids)
    .annotate(**face_extra_fields)
    .all()
)

In [9]:
# How many faces the query above matches.
all_faces.count()

218006

In [10]:
# Collect the id of every selected face.
# The progress-bar total is read from the query instead of the literal 218006
# that was copied from an earlier output, so it stays correct when video_ids
# (and therefore the number of matching faces) changes.
all_face_ids = [
    face.id
    for face in tqdm(all_faces, total=all_faces.count())
]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 218006/218006 [07:56<00:00, 457.94it/s]


In [11]:
# Extra attributes copied onto each FaceIdentity from its identity and labeler.
identity_extra_fields = {
    'identity_name': F('identity__name'),
    'labeler_name': F('labeler__name'),
}

# Identity labels attached to any of the selected faces.
face_ids_qs = (
    FaceIdentity.objects
    .filter(face_id__in=all_face_ids)
    .annotate(**identity_extra_fields)
    .all()
)

In [12]:
def _identity_row(label):
    """Reduce one identity label to a (face id, identity, labeler, probability) tuple."""
    return (label.face_id, label.identity_name, label.labeler_name, label.probability)


face_ids = list(map(_identity_row, tqdm(face_ids_qs)))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 204929/204929 [00:00<00:00, 1608422.45it/s]


In [13]:
# Preview the first ten (face id, identity, labeler, probability) tuples.
face_ids[:10]

[(371244,
  'anthony bourdain',
  'face-identity:anthony bourdain',
  0.999534487724304),
 (371244,
  'donald trump',
  'face-identity-old:donald trump',
  0.00488064000000035),
 (371246,
  'hillary clinton',
  'face-identity:hillary clinton',
  0.673712015151978),
 (371246,
  'hillary clinton',
  'face-identity-old:hillary clinton',
  0.706798400000001),
 (371246,
  'mika brzezinski',
  'face-identity-old:mika brzezinski',
  0.00754991999999999),
 (371249, 'donald trump', 'face-identity:donald trump', 0.977125525474548),
 (371249, 'donald trump', 'face-identity-old:donald trump', 1.0),
 (371252, 'donald trump', 'face-identity-old:donald trump', 0.18628128),
 (371252, 'tucker carlson', 'face-identity-old:tucker carlson', 0.02),
 (371253, 'donald trump', 'face-identity:donald trump', 0.497754991054535)]

In [14]:
# Extra attributes copied onto each FaceGender from its gender and labeler.
gender_extra_fields = {
    'gender_name': F('gender__name'),
    'labeler_name': F('labeler__name'),
}

# Gender labels attached to any of the selected faces.
face_genders_qs = (
    FaceGender.objects
    .filter(face_id__in=all_face_ids)
    .annotate(**gender_extra_fields)
    .all()
)

In [15]:
def _gender_row(label):
    """Reduce one gender label to a (face id, gender, labeler, probability) tuple."""
    return (label.face_id, label.gender_name, label.labeler_name, label.probability)


face_genders = list(map(_gender_row, tqdm(face_genders_qs)))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 435157/435157 [00:00<00:00, 1454946.36it/s]


In [16]:
# Preview the first ten (face id, gender, labeler, probability) tuples.
face_genders[:10]

[(371244, 'M', 'knn-gender', 1.0),
 (371244, 'M', 'rudecarnie', 0.566955983638763),
 (371245, 'M', 'knn-gender', 1.0),
 (371245, 'M', 'rudecarnie', 0.999832034111023),
 (371246, 'F', 'knn-gender', 1.0),
 (371246, 'F', 'rudecarnie', 0.999978423118591),
 (371247, 'F', 'knn-gender', 0.714285714285714),
 (371247, 'M', 'rudecarnie', 0.999894618988037),
 (371248, 'M', 'knn-gender', 0.857142857142857),
 (371248, 'F', 'rudecarnie', 0.976446032524109)]

## Associate metadata with faces

In [17]:
def _empty_entry(face):
    """Start a record for one face with no identity or gender labels yet."""
    return {'face': face, 'identities': [], 'genders': []}


# Keyed by face id so that labels can be attached by id afterwards.
faces_with_metadata = {face.id: _empty_entry(face) for face in all_faces}

In [18]:
# Attach every identity label to the record of the face it belongs to.
# The lists are emptied first so that running this cell a second time does not
# append the same labels again.
for entry in faces_with_metadata.values():
    entry['identities'] = []
for fid, identity, labeler, score in face_ids:
    faces_with_metadata[fid]['identities'].append((identity, labeler, score))

In [19]:
# Attach every gender label to the record of the face it belongs to.
# The lists are emptied first so that running this cell a second time does not
# append the same labels again.
for entry in faces_with_metadata.values():
    entry['genders'] = []
for fid, gender, labeler, score in face_genders:
    faces_with_metadata[fid]['genders'].append((gender, labeler, score))

In [20]:
# Spot-check one face record: the Face plus its gender and identity label lists.
faces_with_metadata[371244]

{'face': Face(id: 371244, bbox_x1: 0.296029949188232, bbox_x2: 0.468071746826172, bbox_y1: 0.0534073299831814, bbox_y2: 0.396803622775608, background: False, is_host: False, blurriness: 114.526557177708, probability: 1.0),
 'genders': [('M', 'knn-gender', 1.0), ('M', 'rudecarnie', 0.566955983638763)],
 'identities': [('anthony bourdain',
   'face-identity:anthony bourdain',
   0.999534487724304),
  ('donald trump', 'face-identity-old:donald trump', 0.00488064000000035)]}

## Convert to JSON

In [32]:
def _label_dicts(value_key, labels):
    """Turn (value, labeler, score) tuples into dicts, storing the value under `value_key`."""
    return [
        {value_key: value, 'labeler': labeler, 'score': score}
        for value, labeler, score in labels
    ]


def _face_record(face_id, entry):
    """Build the JSON-ready dict for one face from its faces_with_metadata entry."""
    face = entry['face']
    return {
        'id': face_id,
        'frame_number': face.frame_number,
        'video_id': face.video_id,
        # Bounding-box corners are exported under shortened key names.
        'x1': face.bbox_x1,
        'x2': face.bbox_x2,
        'y1': face.bbox_y1,
        'y2': face.bbox_y2,
        'is_host': face.is_host,
        'score': face.probability,
        'labeler': face.labeler_name,
        'genders': _label_dicts('gender', entry['genders']),
        'identities': _label_dicts('identity', entry['identities']),
    }


faces_json = [
    _face_record(face_id, entry)
    for face_id, entry in faces_with_metadata.items()
]

In [59]:
# Serialise the list of face records to one JSON file.
face_dump_path = '/app/data/face_dump.json'
with open(face_dump_path, 'w') as dump_file:
    json.dump(faces_json, dump_file)

# Face Embeddings

In [22]:
from app.face_embeddings import *

In [29]:
# Look up the features for all selected face ids.
# NOTE(review): `get` is presumably provided by the star import from
# app.face_embeddings — confirm, and consider importing it by name.
face_features = get(all_face_ids)

In [33]:
# Map each exported face id to the id of the video it appears in.
face_to_video_id = {record['id']: record['video_id'] for record in faces_json}

In [36]:
# Group the (face id, feature vector) pairs by video:
# {video id: {face id: feature vector}}.
features_by_video_id = {}
for face_id, feature_vec in tqdm(face_features):
    owning_video = face_to_video_id[face_id]
    features_by_video_id.setdefault(owning_video, {})[face_id] = feature_vec

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 217983/217983 [00:00<00:00, 681961.71it/s]


In [40]:
# Write one pickle per video, named after the video id, holding that video's
# {face id: feature vector} dict.
features_dir = '/app/data/face_features'
# Create the output directory if needed; without this, open() fails on a
# data folder that does not have it yet.
os.makedirs(features_dir, exist_ok=True)
for video_id, video_features in features_by_video_id.items():
    with open(os.path.join(features_dir, '{}.pkl'.format(video_id)), 'wb') as f:
        pickle.dump(video_features, f)

# Segment Annotations

In [41]:
from app.models import LabeledCommercial, LabeledPanel, LabeledInterview

In [46]:
# Attributes copied from each LabeledCommercial; also used as the output keys.
commercial_fields = ('video_id', 'start', 'end')

# Note: this reads every LabeledCommercial, not only those of video_ids.
commercials = [
    {field: getattr(commercial, field) for field in commercial_fields}
    for commercial in LabeledCommercial.objects.all()
]

In [47]:
# Attributes copied from each LabeledPanel; also used as the output keys.
panel_fields = ('video_id', 'start', 'end', 'num_panelists')

# Note: this reads every LabeledPanel, not only those of video_ids.
panels = [
    {field: getattr(labeled_panel, field) for field in panel_fields}
    for labeled_panel in LabeledPanel.objects.all()
]

In [48]:
# Attributes copied from each LabeledInterview; also used as the output keys.
interview_fields = (
    'video_id',
    'start',
    'end',
    'interviewer1',
    'interviewer2',
    'guest1',
    'guest2',
    'original',
    'scattered_clips',
)

# Note: this reads every LabeledInterview, not only those of video_ids.
interviews = [
    {field: getattr(labeled_interview, field) for field in interview_fields}
    for labeled_interview in LabeledInterview.objects.all()
]

In [49]:
# Serialise the commercial segments to one JSON file.
commercials_path = '/app/data/commercials.json'
with open(commercials_path, 'w') as commercials_file:
    json.dump(commercials, commercials_file)

In [50]:
# Serialise the panel segments to one JSON file.
panels_path = '/app/data/panels.json'
with open(panels_path, 'w') as panels_file:
    json.dump(panels, panels_file)

In [51]:
# Serialise the interview segments to one JSON file.
interviews_path = '/app/data/interviews.json'
with open(interviews_path, 'w') as interviews_file:
    json.dump(interviews, interviews_file)