## Feature Extraction ##
Given an episode number, this notebook extracts features for every labeled interacting pair of tracks in that episode. Set `interacs_bool = False` to gather the pairs labeled as non-interacting instead.

In [1]:
# Mount Google Drive at /content/drive; the data, annotation and model paths
# used below are all given relative to drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Episode to process, and which label set to gather:
# True -> pairs labeled as interacting, False -> pairs labeled 'no interaction'.
episode_no = 14
interacs_bool = True

# Zero-padded episode tag, e.g. 'episode14'.
episode_str = f'episode{str(episode_no).zfill(2)}'

# Same tag, suffixed when the non-interacting pairs are being gathered.
full_episode_str = episode_str if interacs_bool else episode_str+ '_nointeracs'

## Preparing the Model and Data ##

In [3]:
# Input locations on the mounted Drive. episode_str is the zero-padded tag
# (e.g. 'episode14') built in the configuration cell above.
FRAME_PATH = f'drive/MyDrive/Friends/frames/{episode_str}.tar.gz' #change me
SHOTS_PATH = 'drive/MyDrive/Friends/shots/shots.tar.gz' #change me
TRACK_PATH = 'drive/MyDrive/Friends/tracks-features/Friends.pk' #change me
VIDEO_PATH = f'drive/MyDrive/Friends_Extra/{episode_str}.mp4' #change me
# Checkpoint file of the ResNet-18 model.
MODEL_PATH = 'drive/MyDrive/friends_annotations/resnet-18-kinetics.pth'

In [21]:
! git clone https://github.com/kenshohara/video-classification-3d-cnn-pytorch classification

Cloning into 'classification'...
remote: Enumerating objects: 121, done.[K
remote: Total 121 (delta 0), reused 0 (delta 0), pack-reused 121[K
Receiving objects: 100% (121/121), 158.63 KiB | 2.09 MiB/s, done.
Resolving deltas: 100% (63/63), done.


In [22]:
! mkdir data
! tar xzf $SHOTS_PATH -C data
! tar xzf $FRAME_PATH -C classification

mkdir: cannot create directory ‘data’: File exists


## Loading Data ##

In [7]:
def load_shots():
  """Read the shot boundaries of the current episode.

  Parses data/shots/season3/episode<NN>_shots.txt, where NN is the global
  episode_no zero-padded to two digits. Every non-blank line must hold exactly
  two whitespace-separated integers: the start and end of one shot.

  Returns:
    dict mapping the 0-based shot index (order of appearance in the file) to
    an (int start, int end) tuple.

  Raises:
    AssertionError: if a non-blank line does not contain exactly two fields.
  """
  shots = {}
  with open(f'data/shots/season3/episode{str(episode_no).zfill(2)}_shots.txt', 'r') as f:
    for line in f:
      # split() without an argument tolerates tabs, repeated spaces and the
      # trailing newline, which split(' ') does not.
      boundaries = line.split()
      if not boundaries:
        continue  # blank line (e.g. at the end of the file)
      assert len(boundaries) == 2, f'malformed shot line: {line!r}'
      start, end = int(boundaries[0]), int(boundaries[1])
      # Shots are numbered consecutively; blank lines do not consume an index.
      shots[len(shots)] = (start, end)
  return shots

In [8]:
def load_tracks():
  """Unpickle and return the track dictionary stored at the global TRACK_PATH.

  NOTE(review): pickle.load can execute arbitrary code from the file; only
  point TRACK_PATH at a file you trust.
  """
  import pickle
  with open(TRACK_PATH, 'rb') as track_file:
    return pickle.load(track_file)

In [9]:
def load_track_boundaries(tracks):
  """Collect (first_frame, last_frame) for every sufficiently long labeled track.

  For each track id listed under tracks[episode_str]['GT'], the 'body' entry is
  inspected first and the 'face' entry second. The first of the two whose frame
  span (last row, column 0 minus first row, column 0) exceeds 16 supplies the
  boundaries. Tracks with no such entry are left out.

  Args:
    tracks: track dictionary indexed as tracks[episode_str][kind][track_id],
      where each entry is a 2-D array whose column 0 holds the frame number.

  Returns:
    dict mapping track id -> (first_frame, last_frame).
  """
  episode_tracks = tracks[episode_str]
  track_boundaries = {}

  for track_id in episode_tracks['GT']:
    for source in ('body', 'face'):
      detections = episode_tracks[source]
      if track_id not in detections:
        continue
      rows = detections[track_id]
      first_frame, last_frame = rows[0,0], rows[-1,0]
      # An already-recorded track (from 'body') is never overwritten by 'face'.
      if last_frame - first_frame > 16 and track_id not in track_boundaries:
        track_boundaries[track_id] = (first_frame, last_frame)

  return track_boundaries

In [10]:
def load_tracks_to_shots(track_boundaries):
  """Assign each track to a shot that fully contains it.

  Uses the global `shots` mapping (shot id -> (start, end)). A track is assigned
  to a shot when start <= first_frame <= last_frame <= end. If several shots
  contain the track, the one iterated last wins. Tracks contained in no shot
  are omitted.

  Args:
    track_boundaries: dict mapping track id -> (first_frame, last_frame).

  Returns:
    dict mapping track id -> shot id.
  """
  tracks_to_shots = {}
  for track_id, (first_frame, last_frame) in track_boundaries.items():
    for shot_id, shot_span in shots.items():
      shot_start, shot_end = shot_span[0], shot_span[1]
      if shot_start <= first_frame <= last_frame <= shot_end:
        tracks_to_shots[track_id] = shot_id
  return tracks_to_shots

In [11]:
def load_interacc_annot(interacc_bool):
  """Load the annotations of the current episode (global episode_no).

  The helper module parse_csv_annotations is imported from
  drive/MyDrive/friends_annotations, so the working directory is switched
  there for the import and the parsing call, and always restored afterwards,
  even when either of them raises.

  Args:
    interacc_bool: True to return the four interaction categories
      concatenated ('full physical', 'partial physical', 'full non-physical',
      'partial non-physical'); False to return the 'no interaction' entries.

  Returns:
    The selected annotation entries as produced by gather_annotations.
  """
  import os

  previous_dir = os.getcwd()
  # NOTE(review): gather_annotations presumably reads its CSV files relative to
  # this folder as well, which is why the directory change covers the call too.
  os.chdir('drive/MyDrive/friends_annotations')
  try:
    from parse_csv_annotations import gather_annotations
    annotations = gather_annotations(episode_no)
  finally:
    os.chdir(previous_dir)

  if interacc_bool:
    return annotations['full physical'] + annotations['partial physical'] + annotations['full non-physical'] + annotations['partial non-physical']
  return annotations['no interaction']

In [12]:
def load_interactions(interacc_annot, tracks_to_shots):
  """Expand shot-level annotations into track-level pairs.

  For every annotation (shot id at index 1, the two person names at indices 2
  and 3), all tracks assigned to that shot are paired up whenever the first
  track is labeled as person 1 and the second as person 2 in the global
  tracks[episode_str]['GT'] mapping.

  Args:
    interacc_annot: iterable of annotation records.
    tracks_to_shots: dict mapping track id -> shot id.

  Returns:
    Sorted list of unique (track1, person1, track2, person2) tuples.
  """
  gt_labels = tracks[episode_str]['GT']

  # A set makes the duplicate check effective: the previous membership test
  # compared a 2-tuple against stored 4-tuples and therefore never matched, so
  # repeated annotations produced repeated entries.
  unique_pairs = set()

  for interacc in interacc_annot:
    shot_id = interacc[1]
    person1, person2 = interacc[2], interacc[3]

    # This one name is stored with literal double quotes in the annotations.
    person1 = 'Jack Geller' if person1 == '"Jack Geller"' else person1
    person2 = 'Jack Geller' if person2 == '"Jack Geller"' else person2

    shot_tracks = [t for t in tracks_to_shots if tracks_to_shots[t] == shot_id]

    for track_a in shot_tracks:
      if gt_labels[track_a] != person1:
        continue
      for track_b in shot_tracks:
        if gt_labels[track_b] == person2:
          unique_pairs.add((track_a, person1, track_b, person2))

  return sorted(unique_pairs)

In [13]:
def load_true_interaccs(tracks, interactions):
  """Find the frames shared by both tracks of each pair, split into contiguous runs.

  For each track the 'body' entry is used when the track id is present there,
  otherwise the 'face' entry. Column 0 of an entry holds the frame numbers.
  Frames present in both tracks are grouped into runs of consecutive frame
  numbers; a gap starts a new run.

  Args:
    tracks: track dictionary indexed as tracks[episode_str][kind][track_id].
    interactions: iterable of (track1, person1, track2, person2) tuples.

  Returns:
    dict mapping (track1, person1, track2, person2) -> list of runs, each run
    a list of frame numbers. A pair without shared frames maps to [[]].
  """
  episode_tracks = tracks[episode_str]
  body_info = episode_tracks['body']
  face_info = episode_tracks['face']

  true_interaccs = {}
  for track1, person1, track2, person2 in interactions:
    key = (track1, person1, track2, person2)

    info_1 = body_info if track1 in body_info else face_info
    info_2 = body_info if track2 in body_info else face_info

    # Frames of track1 that also occur in track2, in track1 order.
    shared = [frame for frame in info_1[track1][:,0] if frame in info_2[track2][:,0]]

    runs = [[]]
    last_pos = len(shared) - 1
    for pos, frame in enumerate(shared):
      runs[-1].append(frame)
      # Open a new run whenever the next shared frame is not the direct successor.
      if pos < last_pos and frame + 1 != shared[pos + 1]:
        runs.append([])
    true_interaccs[key] = runs
  return true_interaccs

In [14]:
def create_cropped_video(track_no, info, frames, FRAMES_PATH, VIDEO_PATH, video_name):
  if len(frames) < 2:
    return

  start, end = frames[0], frames[-1]

  TEMP_PATH = 'tmp'
  ! mkdir $TEMP_PATH
  
  count = 0
  for frame_bbx in info[track_no]:

    if start <= frame_bbx[0] <= end:
      pass
    else:
      continue

    frame_no = frame_bbx[0]
    x1, y1, x2, y2 = frame_bbx[1], frame_bbx[2], frame_bbx[3], frame_bbx[4]

    width = (x2 - x1)
    height = (y2 - y1)
  
    width_n = width * (1.15)**0.5
    height_n = height * (1.15)**0.5

    x_center = (x1 + x2) / 2
    y_center = (y1 + y2) / 2

    x1_n = x_center - (width_n / 2)
    x2_n = x_center + (width_n / 2)

    if x1_n < 0:
      width_n += 2 * x1_n
      x1_n = 0

    if x1_n + width_n > 1280:
      width_n -= x1_n + width_n - 1280

    y1_n = y_center - (height_n / 2) 
    y2_n = y_center + (height_n / 2)

    if y1_n < 0:
      height_n += 2 * y1_n
      y1_n = 0

    if y1_n + height_n > 720:
      height_n -= y1_n + height_n - 720

    crop_str = f'"crop={width_n}:{height_n}:{x1_n}:{y1_n}"'
    scale_str = "scale=112:112:force_original_aspect_ratio=decrease,pad=112:112:-1:-1:color=black"
    double_str= f"{crop_str},{scale_str}"

    img_fname = str(int(frame_no)).zfill(6) + ".jpg"
    in_img = os.path.join(FRAMES_PATH, img_fname)
    out_fno = str(int(count)).zfill(4) + ".jpg"
    out_img = os.path.join(TEMP_PATH, out_fno)

    ! ffmpeg -i $in_img -vf $double_str $out_img 

    count += 1

  out_video = VIDEO_PATH + video_name
  in_frames = TEMP_PATH + '/%4d.jpg'
  ! ffmpeg -f image2 -i $in_frames $out_video 

  ! rm -r $TEMP_PATH

In [23]:
# Shot index -> (start, end) for the selected episode; read as a global by load_tracks_to_shots.
shots = load_shots()

In [24]:
# Track dictionary unpickled from TRACK_PATH; read as a global by load_interactions.
tracks = load_tracks()

In [25]:
# Build the pair list step by step; each call consumes the result of the previous one.
# 1. (first_frame, last_frame) per labeled track spanning more than 16 frames.
track_boundaries = load_track_boundaries(tracks)
# 2. Shot that fully contains each of those tracks.
tracks_to_shots = load_tracks_to_shots(track_boundaries)
# 3. Annotation records for the chosen label set (interacting vs. 'no interaction').
interacc_annot = load_interacc_annot(interacs_bool)
# 4. Track-level (track1, person1, track2, person2) pairs for the annotated shots.
interactions = load_interactions(interacc_annot, tracks_to_shots)
# 5. Per pair: the frames present in both tracks, split into contiguous runs.
true_interaccs = load_true_interaccs(tracks, interactions)

/content/drive/MyDrive/friends_annotations
/content


## Create Tracks Video ##

In [None]:
import os
% cd classification

body_info = tracks[episode_str]['body']
face_info = tracks[episode_str]['face']

! mkdir video

for (track_i, person_i, track_j, person_j) in true_interaccs:

  frames = true_interaccs[(track_i, person_i, track_j, person_j)]

  info_i = body_info if track_i in body_info else face_info
  info_j = body_info if track_j in body_info else face_info


  for k in range(len(frames)):
    
    track_path = f'track_{str(track_i).zfill(3)}_{str(track_j).zfill(3)}_{k}'

    track_str_i = f'{str(track_i).zfill(3)}_{str(track_j).zfill(3)}_{k}.mp4'
    track_str_j = f'{str(track_j).zfill(3)}_{str(track_i).zfill(3)}_{k}.mp4'

    frame_path = f'datasets/friends/frames/season3/{episode_str}'

    create_cropped_video(track_i, info_i, frames[k], frame_path, 'video', track_str_i)
    create_cropped_video(track_j, info_j, frames[k], frame_path, 'video', track_str_j)

% cd -

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1;34m[swscaler @ 0x557010018000] [0m[0;33mdeprecated pixel format used, make sure you did set range correctly
[0mOutput #0, image2, to 'tmp/0117.jpg':
  Metadata:
    encoder         : Lavf57.83.100
    Stream #0:0: Video: mjpeg, yuvj420p(pc), 112x112 [SAR 12544:12483 DAR 12544:12483], q=2-31, 200 kb/s, 25 fps, 25 tbn, 25 tbc
    Metadata:
      encoder         : Lavc57.107.100 mjpeg
    Side data:
      cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: -1
frame=    1 fps=0.0 q=2.2 Lsize=N/A time=00:00:00.04 bitrate=N/A speed=5.78x    
video:3kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
ffmpeg version 3.4.8-0ubuntu0.2 Copyright (c) 2000-2020 the FFmpeg developers
  built with gcc 7 (Ubuntu 7.5.0-3ubuntu1~18.04)
  configuration: --prefix=/usr --extra-version=0ubuntu0.2 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu 

## Extracting Feature Vectors ##
Finally, we use the pretrained 3D ResNet-18 model to extract a feature vector from each cropped track video.

In [26]:
! cp drive/MyDrive/friends_annotations/resnet-18-kinetics.pth classification/models # change to location of model

In [20]:
! mkdir classification/video
import os

for f in os.listdir('classification'):
  if f[:5] == 'video' and f != 'video':
    file = f'classification/{f}'
    !mv $file classification/video

mkdir: cannot create directory ‘classification/video’: No such file or directory


FileNotFoundError: ignored

In [None]:
! 

In [None]:
# add opt.model_depth = 18 in line 17 in classification/main.py
import os
% cd classification
! mkdir outputs
video_path = 'video'
for f in os.listdir(video_path):
  with open('input', 'w') as file:
    file.write(f)
    file.write('\n')
  output_loc = f'outputs/output_{f}.json'
  ! touch $output_loc
  ! python main.py --input input --video_root $video_path --output $output_loc --model ./models/resnet-18-kinetics.pth --mode feature
% cd -

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  Duration: 00:00:10.20, start: 0.000000, bitrate: 75 kb/s
    Stream #0:0(und): Video: h264 (High) (avc1 / 0x31637661), yuvj420p(pc), 112x112 [SAR 1974:1969 DAR 1974:1969], 72 kb/s, 25 fps, 25 tbr, 12800 tbn, 50 tbc (default)
    Metadata:
      handler_name    : VideoHandler
Stream mapping:
  Stream #0:0 -> #0:0 (h264 (native) -> mjpeg (native))
Press [q] to stop, [?] for help
Output #0, image2, to 'tmp/image_%05d.jpg':
  Metadata:
    major_brand     : isom
    minor_version   : 512
    compatible_brands: isomiso2avc1mp41
    encoder         : Lavf57.83.100
    Stream #0:0(und): Video: mjpeg, yuvj420p(pc), 112x112 [SAR 1974:1969 DAR 1974:1969], q=2-31, 200 kb/s, 25 fps, 25 tbn, 25 tbc (default)
    Metadata:
      handler_name    : VideoHandler
      encoder         : Lavc57.107.100 mjpeg
    Side data:
      cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: -1
frame=  255 fps=0.0 q=10.5 Lsize=N/A time=00:

In [None]:
import json
import os

final_dicts = []

for current_file in sorted(os.listdir('drive/MyDrive/friends_annotations/outputs_final')):

  curr_episode = current_file[:9]

  loc = f'drive/MyDrive/friends_annotations/outputs_final/{current_file}'
  ! unzip -qq $loc

  outputs_path = 'classification/outputs/'
  for outputs in os.listdir(outputs_path):
    if os.path.getsize(outputs_path + outputs) == 0:
      continue
    values = outputs[12:-9].split('_')
    track1 = int(values[0])
    track2 = int(values[1])
    version = int(values[2])


    person1 = tracks[curr_episode]['GT'][track1]
    person2 = tracks[curr_episode]['GT'][track2]
    with open(outputs_path + outputs) as f:
      js_obj = json.load(f)
      for i in js_obj[0]['clips']:
        curr_dict = {'track1' : track1,
                    'track2' : track2,
                    'person1' : person1,
                    'person2' : person2,
                    'version' : version,
                    'episode' : curr_episode,
                    'segment': i['segment'],
                    'features': i['features'],
                    'video' : js_obj[0]['video']}
        final_dicts.append(curr_dict)
      
  ! rm -r classification