In [1]:
import sys, os
import pandas as pd
import numpy as np
import shutil
from zipfile import ZipFile
from utils.helper import get_all_files, get_all_dirs, make_new_dir
from utils.extractor import Extractor
from PIL import Image
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from utils.video_tools import FeatExtractor
from datetime import timedelta

In [2]:
def fix_path(my_dir):
    """Replace spaces with underscores in every file and directory name under my_dir.

    Directories are renamed first, deepest paths first, so that renaming a
    parent never invalidates the path of a child that is still waiting to be
    renamed. Files are renamed afterwards, once every directory name is final.

    Note: only files/dirs inside my_dir are checked. Parent directories above
    my_dir are ignored.

    Args:
        my_dir: root directory whose contents are renamed in place.

    Side effects:
        Renames entries on disk and prints one line per rename plus a summary.
        An entry whose underscore name already exists is skipped (and reported)
        instead of being moved onto the existing entry.
    """
    sep = os.path.sep

    # Deepest first: a child must be renamed before any of its ancestors.
    # sorted() is stable, so the helper's order is kept within one depth level.
    # An empty listing (no subdirectories) simply skips the loop.
    # NOTE(review): assumes get_all_dirs(..., trim=1) yields paths relative to
    # my_dir -- confirm against utils.helper.
    dir_lst = sorted(get_all_dirs(my_dir, trim=1),
                     key=lambda p: len(p.split(sep)), reverse=True)
    dcount = 0
    for p in dir_lst:
        leaf = p.split(sep)[-1]
        new_leaf = leaf.replace(' ', '_')
        if new_leaf == leaf:
            continue
        src = os.path.join(my_dir, p)
        if not os.path.exists(src):
            continue
        new_rel = os.path.join(os.path.dirname(p), new_leaf)
        dst = os.path.join(my_dir, new_rel)
        if os.path.exists(dst):
            # shutil.move would nest src inside an existing directory.
            print('Skipping "%s": "%s" already exists' % (p, new_rel))
            continue
        print('Renaming "%s" to "%s"' % (p, new_rel))
        shutil.move(src, dst)
        dcount += 1

    # Listed after the directory pass so file paths use the final directory names.
    # NOTE(review): assumes get_all_files(..., trim=0) yields paths that can be
    # used as-is -- confirm against utils.helper.
    file_lst = get_all_files(my_dir, trim=0)
    fcount = 0
    for f in file_lst:
        leaf = f.split(sep)[-1]
        new_leaf = leaf.replace(' ', '_')
        if new_leaf == leaf:
            continue
        dst = os.path.join(os.path.dirname(f), new_leaf)
        if os.path.exists(dst):
            # Moving onto an existing file could silently overwrite it.
            print('Skipping "%s": "%s" already exists' % (f, dst))
            continue
        print('Renaming "%s" to "%s"' % (f, dst))
        shutil.move(f, dst)
        fcount += 1
    print('Done. Rename %d dirs and %d files' % (dcount, fcount))

In [3]:
# Input video directory and output directory for extracted features.
# Set the TNA_DATA_ROOT / TNA_OUT environment variables to run on another
# machine; the defaults are the paths originally hard-coded here.
DATA_ROOT = os.environ.get('TNA_DATA_ROOT', '/home/tb0035/projects/tna_datathon/data/LOC_Videos')
OUT = os.environ.get('TNA_OUT', '/home/tb0035/projects/tna_datathon/data/out')
# NOTE(review): presumably creates OUT if it is missing; the meaning of the
# second argument should be confirmed in utils.helper.
make_new_dir(OUT, False)

In [None]:
# Replace spaces with underscores in file and directory names under DATA_ROOT.
# This renames entries on disk in place.
fix_path(DATA_ROOT)

In [None]:
# Index every video under DATA_ROOT and save the index as DATA_ROOT/list.txt,
# a CSV with the columns 'path' and 'type'.
accepted_img_formats = ['jpg', 'png', 'gif', 'bmp']  # not used in this cell
accepted_vid_formats = ['mp4', 'avi', 'flv', 'mov', 'mkv', 'mpeg']
# Match on the real extension (text after the last dot of the file name), so a
# file with no extension that is merely named e.g. "mp4" is not picked up.
# NOTE(review): assumes get_all_files(..., trim=1) yields paths relative to
# DATA_ROOT -- confirm against utils.helper.
loc_vid = [vid for vid in get_all_files(DATA_ROOT, trim=1)
           if os.path.splitext(vid)[1][1:].lower() in accepted_vid_formats]
loc_vid_tab = pd.DataFrame({'path': loc_vid, 'type': ['video'] * len(loc_vid)},
                           columns=['path', 'type'])
loc_vid_tab.to_csv(os.path.join(DATA_ROOT, 'list.txt'), index=False)

In [4]:
# Reload the saved video index and keep only its first five entries.
list_file = os.path.join(DATA_ROOT, 'list.txt')
lst = pd.read_csv(list_file)
paths = lst['path'].to_list()[:5]

In [None]:
# Feature-extractor settings, kept as named variables so they can be tuned here.
batchsize, min_frames = 2, 3
arch = 'ResNet50'
stride_in_sec = 0.5
extractor_kwargs = dict(batchsize=batchsize, min_frames=min_frames, arch=arch,
                        verbose=False, stride_in_sec=stride_in_sec)
extractor = FeatExtractor(**extractor_kwargs)

In [7]:
sep = os.path.sep  # no longer needed in this cell; kept in case another cell uses it
feat_paths = []
for path in paths:
    print('Processing %s' % path)
    # Strip only the final extension: a name such as "clip.v2.mp4" keeps
    # "clip.v2" instead of collapsing to "clip" and colliding with other videos.
    name = os.path.splitext(os.path.basename(path))[0]
    np_out = os.path.join(OUT, name + '.npy')
    # Extract only when the feature file is missing, so a fresh run still works
    # and existing features are reused.
    # NOTE(review): assumes extract() writes its result to np_out; the meaning
    # of the third argument should be confirmed in utils.video_tools.
    if not os.path.exists(np_out):
        extractor.extract(os.path.join(DATA_ROOT, path), np_out, False)
    feat_paths.append(name + '.npy')
    

Processing Volunteering_message_JONATHAN_EDWARDS.mp4
Processing Journey_-_Test_Event_montage.mp4
Processing Pin_badges.mp4
Processing Overall_Highlights_FINAL.mp4
Processing The_Olympic_Torch_Relay_education_film.mp4


## Search: rank videos by similarity to a query image

In [5]:
def dist2(q, data):
    """Squared Euclidean distance between one query vector and every row of data.

    Args:
        q: query feature holding D values in total. Any shape is accepted
            (e.g. (D,), (1, D) or (D, 1)); it is flattened into a single row.
        data: array of shape (N, D), one feature vector per row.

    Returns:
        1-D array of length N whose i-th entry is sum((q - data[i]) ** 2).
    """
    # Force the query into one row so it broadcasts against the rows of data.
    # A column-shaped query would otherwise broadcast the wrong way and give
    # wrong distances without raising an error.
    query = np.asarray(q).reshape(1, -1)
    return np.sum((query - np.asarray(data)) ** 2, axis=1)
        

In [8]:
# Load each saved feature array from OUT, in the same order as feat_paths.
feats = [np.load(os.path.join(OUT, feat_file)) for feat_file in feat_paths]

In [9]:
# Extractor instance used to compute the feature of the query image.
hasher = Extractor()

Instructions for updating:
Colocations handled automatically by placer.




In [22]:
# Query image; its feature is compared against the per-video features in `feats`.
test_im_path = '/home/tb0035/projects/tna_datathon/data/test/jonathan.png'
test_feat = hasher.extract_from_path(test_im_path)

In [23]:
# For every video, find the Nframe features closest to the query and score the
# video by the mean distance of those features.
Nframe = 5
per_video_dists = [dist2(test_feat, vid_feat) for vid_feat in feats]
nearest_ids = [np.argsort(frame_d)[:Nframe] for frame_d in per_video_dists]
res = {
    'ids': nearest_ids,
    'dist': np.array([frame_d[top].mean()
                      for frame_d, top in zip(per_video_dists, nearest_ids)]),
}

In [24]:
K = 3
# Indices (into paths / res) of the K videos with the smallest mean distance,
# best match first.
vidid = np.argsort(res['dist'])[:K]
viddist = res['dist'][vidid]

selpaths = [paths[i] for i in vidid]
for rank, vid in enumerate(vidid):
    # res['ids'] is indexed by video, not by rank, so look up this video's own
    # matched frames through its index.
    frame_ids = np.sort(res['ids'][vid])
    # Median of the matched frame indices; len() // 2 stays in range even when
    # a video contributed fewer than Nframe frames.
    time_pos = frame_ids[len(frame_ids) // 2]
    # NOTE(review): dividing by 2 presumably reflects two features per second
    # (stride_in_sec = 0.5) -- confirm against how the features were extracted.
    time_step = timedelta(seconds=int(time_pos / 2))
    print('%s: %f. Position around: %s' % (selpaths[rank], viddist[rank], time_step))
    

Volunteering_message_JONATHAN_EDWARDS.mp4: 200.743835. Position around: 0:00:19
Journey_-_Test_Event_montage.mp4: 1084.150635. Position around: 0:01:11
Pin_badges.mp4: 1174.289429. Position around: 0:01:44
