In [1]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import pandas as pd

import heapq
import gzip
import os

### Set the variables

In [None]:
# Name of the submission file to create. The scoring cell writes it through
# gzip, so the content is gzip-compressed CSV.
OUTPUTFILE = "3x_geometric_dbof_2vlad_blend_maskedembeddlocalization_and_2018normed_200k_labels.csv"
# Filename prefixes of the 5-frame predictions to blend. The suffixes
# 'preds.npy' and 'ids.npy' are appended directly to each prefix when loading.
FILENAMES = ["dbof_finetune_wmask_cp752", "VLAD5frames_53525_542", "VLAD5frames_50842_686"]
# Location of the 'segment_label_ids.csv' file (its "Index" column is read).
LABELS = "./segment_label_ids.csv"

# Folder holding the per-class localization prediction files.
LOCALIZATION_FOLDER = "./localization_preds"
# Filename prefix of the video-level model predictions. The suffixes
# 'preds.npy' and 'ids.npy' are appended directly to this prefix.
VIDEOMODEL = "./2018model_whole_video"

### Parallel preparation of localization predictions

In [None]:
# Parallel preparation of localization predictions: build the lookup tables
# shared by the worker function below.

# Read the label file once: CSV position -> label index, and label index ->
# rank of that index in sorted order.
label_index = pd.read_csv(LABELS)["Index"]
mapping_def_inx = {i: idx for i, idx in enumerate(label_index)}
mapping_inx_sorted = {idx: i for i, idx in enumerate(sorted(label_index))}

os.makedirs("./processed", exist_ok=True)

# Load the segment ids here instead of relying on `my_ids` from the scoring
# cell, which does not exist yet on a fresh kernel. They are sorted the same
# way the scoring cell sorts them, so row i of every ./processed/arr*.npy file
# lines up with row i of the blended predictions.
segment_ids = np.load("{}ids.npy".format(FILENAMES[0]))
segment_ids = segment_ids[segment_ids.argsort()]

# Each id is bytes of the form b"<video id>:<offset>"; keep (offset, video id).
ref_table = []
for xid in tqdm(segment_ids):
    ref_id, pred_loc = xid.decode().split(":")
    ref_table.append([int(pred_loc), ref_id])

# NOTE(review): these two patterns were bound to each other's names in the
# original (loc_preds -> "_ids.npy", loc_ids -> "_preds.npy"). They are now
# matched to the file suffix; confirm the files on disk are named accordingly.
loc_preds = os.path.join(LOCALIZATION_FOLDER, "predfile_{}_preds.npy")
loc_ids = os.path.join(LOCALIZATION_FOLDER, "predfile_{}_ids.npy")
def par_proc(j):
    """Compute the localization score of every segment for one prediction file.

    Loads the localization ids and predictions for file index ``j``, takes the
    log of the predictions clipped to [1e-7, 1], and for each row of the
    module-level ``ref_table`` averages those logs over the five positions
    starting at the segment offset. The result (one float32 per ``ref_table``
    row) is saved to ``./processed/arr{j}.npy``.

    Relies on the module-level ``loc_ids``, ``loc_preds`` and ``ref_table``.

    Args:
        j: Index used to format the localization file names and the output name.

    Raises:
        KeyError: If a video id in ``ref_table`` is missing from the loaded ids.
    """
    # NOTE(review): allow_pickle=True can execute arbitrary code when loading a
    # malicious file; only use prediction files from a trusted source.
    video_ids = np.load(loc_ids.format(j), allow_pickle=True)
    log_probs = np.log(np.clip(np.load(loc_preds.format(j),
                                       allow_pickle=True), 10**-7, 1))
    row_of_video = {xid: i for i, xid in enumerate(video_ids)}

    # NOTE(review): the original computed
    # mapping_inx_sorted[mapping_def_inx[j]] here but never used it; the output
    # is keyed by j. Confirm that j already matches the column order expected
    # by the scoring cell.

    # Size the output from ref_table rather than a hard-coded segment count.
    scores = np.zeros(len(ref_table), dtype=np.float32)
    for i, (pred_loc, ref_id) in enumerate(ref_table):
        start = int(pred_loc)
        window = log_probs[row_of_video[ref_id]][start:start + 5]

        # Factor for localization: mean of the logs, i.e. the log of the
        # geometric mean of the clipped predictions in the window.
        scores[i] = np.mean(window)  # TODO: should we sum or multiply?!
    print("Done {}".format(j))
    np.save("./processed/arr{}".format(j), scores)

from multiprocessing import Pool

# One task per localization file (indices 0..999) spread over 8 worker
# processes. map() blocks until every task is done; the context manager then
# shuts the workers down instead of leaving them alive in the kernel.
with Pool(8) as my_pool:
    my_pool.map(par_proc, range(1000))

### Execute the scoring

In [None]:

weights = [2/3., 2/3., 2/3.]
top_k = 20
segment_max_pred = 200000

# zip() would silently drop models if the two lists differed in length.
assert len(weights) == len(FILENAMES)


def _load_weighted_log_preds(prefix, weight):
    """Load one model's predictions as weighted logs, sorted by segment id.

    Args:
        prefix: Filename prefix; 'preds.npy' and 'ids.npy' are appended to it.
        weight: Factor applied to the log of the clipped predictions.

    Returns:
        Tuple ``(ids, preds)`` with both arrays reordered by ascending id.
    """
    print("loading file", prefix)
    preds = weight * np.log(np.clip(np.load("{}preds.npy".format(prefix)), 10**-7, 1))
    ids = np.load("{}ids.npy".format(prefix))

    assert ids.shape == (2062258,)
    assert preds.shape == (2062258, 1000)

    order = ids.argsort()
    return ids[order], preds[order]


# The first model defines the row order; the others are added onto it.
my_ids, my_preds = _load_weighted_log_preds(FILENAMES[0], weights[0])

for weight, new_file in zip(weights[1:], FILENAMES[1:]):
    new_ids, new_preds = _load_weighted_log_preds(new_file, weight)

    # Row-wise addition is only meaningful if every model covers exactly the
    # same segments in the same (sorted) order.
    assert np.array_equal(new_ids, my_ids)

    my_preds += new_preds

# NOTE(review): allow_pickle=True can execute arbitrary code when loading a
# malicious file; only use prediction files from a trusted source.
# The lower bound matches the 10**-7 clip applied to the other models and
# avoids log(0) = -inf; no upper bound is imposed on these predictions.
vl_2018preds = np.log(np.clip(np.load(VIDEOMODEL + "preds.npy", allow_pickle=True),
                              10**-7, None))
vl_2018ids = np.load(VIDEOMODEL + "ids.npy", allow_pickle=True)
video_mapping = {xid: i for i, xid in enumerate(vl_2018ids)}

for i, xid in enumerate(tqdm(my_ids)):
    # Factor for video level: the segment id is b"<video id>:<offset>", so the
    # part before the colon selects the row of the video-level model.
    ref_id = xid.decode().split(":")[0]
    my_preds[i] += vl_2018preds[video_mapping[ref_id]]

# Add the localization factors written by the preparation cell, one
# ./processed/arr{i}.npy file per prediction column.
for i in trange(1000):
    my_preds[:, i] += np.load("./processed/arr{}.npy".format(i))
    

# Get dictionary order: column i of my_preds is written under the i-th label
# index in sorted order.
idx_mapping = {i: idx for i, idx in enumerate(sorted(pd.read_csv(LABELS)["Index"]))}

# The file content is gzip-compressed, so make sure the name ends in ".gz".
out_file = OUTPUTFILE if OUTPUTFILE.endswith(".gz") else OUTPUTFILE + ".gz"

# The context manager closes (and flushes) the archive even if a write fails.
with gzip.open(out_file, "wb") as final_out_file:
    final_out_file.write("Class,Segments\n".encode())

    for i, cls in tqdm(idx_mapping.items()):
        pred_group = my_preds[:, i]
        # Highest score first, keeping at most segment_max_pred segment ids.
        label_top_hits = my_ids[pred_group.argsort()[-segment_max_pred:][::-1]]

        wstring = "%d,%s\n" % (cls, " ".join(x.decode() for x in label_top_hits))
        final_out_file.write(wstring.encode())