## Essential imports and path settings

In [None]:
import sys, os
import pandas as pd
import numpy as np
import shutil
from zipfile import ZipFile
from utils.helper import get_all_files, get_all_dirs, make_new_dir
from utils.extractor import Extractor
from PIL import Image
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [None]:
# Absolute path of the dataset root; every other data path in this notebook is
# built from it. It is machine-specific, so edit it before running elsewhere.
DATA_ROOT = '/home/tb0035/projects/tna_datathon/data'

In [None]:
def visualise(lst_imgs, target_shape=(200,200)):
    """Build a horizontal strip of thumbnails from a list of image files.

    Args:
        lst_imgs: sequence of image file paths readable by PIL.
        target_shape: (width, height) every image is resized to.

    Returns:
        A uint8 array of shape (height, width * len(lst_imgs), 3) holding the
        resized images side by side, in input order.

    Raises:
        ValueError: if lst_imgs is empty.
    """
    lst_imgs = list(lst_imgs)
    if not lst_imgs:
        raise ValueError('visualise() needs at least one image path')
    out = []
    for p in lst_imgs:
        # The context manager closes the underlying file once the pixels are copied.
        with Image.open(p) as im:
            # Resize first, then force 3 channels. convert('RGB') drops an alpha
            # channel and also expands grayscale / palette images, which would
            # otherwise produce 2-D arrays that cannot be concatenated.
            thumb = im.resize(target_shape, Image.BILINEAR).convert('RGB')
        out.append(np.array(thumb))
    # Join along the width axis so the thumbnails sit next to each other.
    return np.concatenate(out, axis=1)

## Find and replace white space with underscore

In [None]:
def fix_path(my_dir):
    """Replace spaces with underscores in file and directory names under my_dir.

    Only entries inside my_dir are renamed; my_dir itself and its parent
    directories are left untouched. Directories are renamed first (deepest
    first), then files. An entry whose underscore name already exists is
    skipped with a message instead of being merged into / overwriting the
    existing entry. Every rename and a final summary are printed.

    Args:
        my_dir: root directory to clean up.
    """
    sep = os.path.sep

    # Directory paths are assumed to be relative to my_dir (they are joined onto
    # it below). Handle the deepest ones first: renaming a directory then never
    # invalidates the still-unrenamed path of an ancestor processed later, so a
    # single pass is enough regardless of the order the helper lists them in.
    dir_lst = sorted(get_all_dirs(my_dir, trim=1),
                     key=lambda p: p.count(sep), reverse=True)
    dcount = 0
    for p in dir_lst:
        parent, leaf = os.path.split(p)
        new_leaf = leaf.replace(' ', '_')
        src = os.path.join(my_dir, p)
        if new_leaf == leaf or not os.path.exists(src):
            continue
        dst = os.path.join(my_dir, parent, new_leaf)
        if os.path.exists(dst):
            # shutil.move would nest src inside an existing directory.
            print('Skipping "%s": "%s" already exists' % (p, os.path.join(parent, new_leaf)))
            continue
        print('Renaming "%s" to "%s"' % (p, os.path.join(parent, new_leaf)))
        shutil.move(src, dst)
        dcount += 1

    # Files are listed only after the directory pass so their paths already
    # reflect the renamed directories.
    fcount = 0
    for f in get_all_files(my_dir, trim=0):
        parent, leaf = os.path.split(f)
        new_leaf = leaf.replace(' ', '_')
        if new_leaf == leaf:
            continue
        dst = os.path.join(parent, new_leaf)
        if os.path.exists(dst):
            # Avoid silently overwriting a file that already has the target name.
            print('Skipping "%s": "%s" already exists' % (f, dst))
            continue
        print('Renaming "%s" to "%s"' % (f, dst))
        shutil.move(f, dst)
        fcount += 1
    print('Done. Rename %d dirs and %d files' % (dcount, fcount))

In [None]:
# Renames entries on disk in place (fix_path uses shutil.move) across the whole dataset tree.
fix_path(DATA_ROOT)


## Extract zip files

In [None]:
def zip_extract(in_dir, out_dir=None):
    """Unpack every .zip archive found under in_dir.

    Each archive is extracted into out_dir joined with the archive's own parent
    directory (as listed by get_all_files; presumably relative to in_dir).
    When out_dir is None the archives are unpacked under in_dir itself.

    Args:
        in_dir: directory searched for .zip files.
        out_dir: root directory for the extracted content, or None for in_dir.
    """
    dest_root = out_dir if out_dir is not None else in_dir
    for rel_zip in get_all_files(in_dir, trim=1, extension='zip'):
        print('Extracting %s' % rel_zip)
        target_dir = os.path.join(dest_root, os.path.dirname(rel_zip))
        # Make sure the destination exists before extracting into it.
        make_new_dir(target_dir, False)
        with ZipFile(os.path.join(in_dir, rel_zip), 'r') as archive:
            archive.extractall(target_dir)
        

In [None]:
# Unpack all zip archives under BT_images; no out_dir is given, so the content
# is extracted under BT_IMG itself.
BT_IMG = os.path.join(DATA_ROOT, 'BT_images')
zip_extract(BT_IMG)

## Get list of files

In [None]:
# Dataset sub-directories, all located under DATA_ROOT.
UKSC_VID = os.path.join(DATA_ROOT, 'UKSC_Videos/10_minutes_cuts')
LOC_VID = os.path.join(DATA_ROOT, 'LOC_Videos')
LOC_IMG = os.path.join(DATA_ROOT, 'LOC_images/LOC_SAMPLE_IMGS')
BT_IMG = os.path.join(DATA_ROOT, 'BT_images')  # same value as in the zip-extraction cell

In [None]:
# Lower-case file extensions (without the leading dot) used to filter file listings.
accepted_img_formats = ['jpg', 'png', 'gif', 'bmp']
accepted_vid_formats = ['mp4', 'avi', 'flv', 'mov', 'mkv', 'mpeg']

# uksc_vid = get_all_files(UKSC_VID, trim=1)
# uksc_vid = [vid for vid in uksc_vid if vid.split('.')[-1].lower() in accepted_vid_formats]
# uksc_vid_tab = pd.DataFrame(columns=['path', 'type'])
# uksc_vid_tab['path'] = uksc_vid
# uksc_vid_tab['type'] = ['video',] * len(uksc_vid)
# uksc_vid_tab.to_csv(os.path.join(UKSC_VID, 'list.txt'), index=None)

# loc_vid = get_all_files(LOC_VID, trim=1)
# loc_vid = [vid for vid in loc_vid if vid.split('.')[-1].lower() in accepted_vid_formats]
# loc_vid_tab = pd.DataFrame(columns=['path', 'type'])
# loc_vid_tab['path'] = loc_vid
# loc_vid_tab['type'] = ['video',] * len(loc_vid)
# loc_vid_tab.to_csv(os.path.join(LOC_VID, 'list.txt'), index=None)

# uksc_vid = get_all_files(LOC_IMG, trim=1)
# uksc_vid = [vid for vid in uksc_vid if vid.split('.')[-1].lower() in accepted_img_formats]
# uksc_vid_tab = pd.DataFrame(columns=['path', 'type'])
# uksc_vid_tab['path'] = uksc_vid
# uksc_vid_tab['type'] = ['image',] * len(uksc_vid)
# uksc_vid_tab.to_csv(os.path.join(LOC_IMG, 'list.txt'), index=None)

# List every accepted image under BT_IMG and write the listing to BT_IMG/list.txt.
# NOTE: the `uksc_vid*` names are inherited from the commented-out cells above;
# here they hold BT image paths, not UKSC videos.
uksc_vid = get_all_files(BT_IMG, trim=1)
# splitext only reports a genuine extension, so an extension-less file that
# merely happens to be called e.g. "png" is not mistaken for an image.
uksc_vid = [vid for vid in uksc_vid
            if os.path.splitext(vid)[1][1:].lower() in accepted_img_formats]
uksc_vid_tab = pd.DataFrame({'path': uksc_vid, 'type': ['image'] * len(uksc_vid)})
# index=False: only the 'path' and 'type' columns are written.
uksc_vid_tab.to_csv(os.path.join(BT_IMG, 'list.txt'), index=False)

## Extract features

In [None]:
# Output file for the extracted features.
OUT = os.path.join(DATA_ROOT, 'loc_img.npz')

# LOC images: read the listing and prefix every entry with its data directory.
DATA_DIR, IMG_LST = LOC_IMG, os.path.join(LOC_IMG, 'list.txt')
lst = pd.read_csv(IMG_LST)
paths1 = [os.path.join(DATA_DIR, rel) for rel in lst['path'].to_list()]

# BT images: same procedure. DATA_DIR, IMG_LST and lst keep the BT values afterwards.
DATA_DIR, IMG_LST = BT_IMG, os.path.join(BT_IMG, 'list.txt')
lst = pd.read_csv(IMG_LST)
paths2 = [os.path.join(DATA_DIR, rel) for rel in lst['path'].to_list()]

# Combined list: LOC entries first, then BT entries.
paths = [*paths1, *paths2]

In [None]:
# Display the combined LOC + BT path list as the cell output.
paths

In [None]:
# Extract one feature per image. Images that fail are skipped and their index
# in `paths` is recorded in exclude_lst so paths and feats can be re-aligned later.
hasher = Extractor()
feats = []
exclude_lst = []
for i, path in enumerate(paths):
    print('processing %d/%d: %s' % (i, len(paths), path))
    try:
        feat = hasher.extract_from_path(path)
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the run,
        # but report why it failed so the problem can be diagnosed.
        print('Error processing %s: %r' % (path, e))
        exclude_lst.append(i)
    else:
        feats.append(feat)


In [None]:
# Turn the list of per-image features into a single array and show its shape.
feats = np.array(feats)
print(feats.shape)

In [None]:
# Indices of the images whose features were extracted successfully, ascending.
# Set difference avoids a linear scan of exclude_lst for every index.
sel_lst = sorted(set(range(len(paths))) - set(exclude_lst))
# Keep the unfiltered list in paths0; `paths` is narrowed so it lines up with feats.
# NOTE: `paths` is rebound here, so running this cell twice would apply
# exclude_lst to the already-filtered list.
paths0 = paths
paths = [paths0[i] for i in sel_lst]
np.savez(OUT, feats=feats)
# Written to the current working directory.
np.save('sel_lst.npy', np.array(sel_lst))

## Search

In [None]:
# Sanity check: print the number of paths next to the number of features.
print(len(paths), len(feats))

In [None]:
# paths = pd.read_csv(IMG_LST)['path'].to_list()
# feats = np.load(OUT)['feats']
# Index the features for 5-nearest-neighbour queries using a ball tree.
nbrs = NearestNeighbors(algorithm='ball_tree', n_neighbors=5)
nbrs.fit(feats)

In [None]:
# test_im_path = '/home/tb0035/projects/tna_datathon/data/test/Flag_bearers02.jpg'
test_im_path = '/home/tb0035/projects/tna_datathon/data/test/field.png'
# Feature of the query image; a leading axis is added because kneighbors
# expects a batch of queries.
im = hasher.extract_from_path(test_im_path)
dist, inds = nbrs.kneighbors(im[None,...])
# Entries of `paths` were already joined with their data directory when the list
# was built, so they are used as-is. inds has one row per query; take row 0
# (unlike squeeze(), this also works when only one neighbour is returned).
# The query image goes first so it appears leftmost in the visualisation.
selected_paths = [test_im_path] + [paths[i] for i in inds[0]]

In [None]:
# Visualise results: tile the selected images side by side at 200x200 each,
# print the matching paths and distances, then open the strip in an image viewer.
canvas = visualise(selected_paths, (200,200))
print('Results: %s' % selected_paths)
print('Distance: %s' % dist.squeeze())
Image.fromarray(canvas).show()

In [None]:
# Re-open the query image with PIL; this rebinds `im`, which previously held its feature.
im = Image.open(test_im_path)

In [None]:
# Print the array shape of the query image.
print(np.array(im).shape)