In [None]:
#default_exp approach

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Approach


> This module contains all the code for defining the various approaches

In [None]:
#export
import copy
import cv2
import multiprocessing
import pickle
import time

import numpy as np

from collections import defaultdict, OrderedDict
from itertools import combinations, combinations_with_replacement, permutations
from joblib import Parallel, delayed
from pathlib import Path
from tango.eval import *
from tango.features import *
from tango.model import *
from tango.prep import *
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# export
def flatten_dict(d_in, d_out, parent_key):
    """Flatten a nested dict into `d_out`, keyed by the tuple of keys leading to each leaf.

    Args:
        d_in: dictionary to flatten; values that are themselves dicts are descended into.
        d_out: dictionary that is filled in place (nothing is returned).
        parent_key: tuple of the keys traversed so far; pass `tuple()` at the top level.
    """
    for key, value in d_in.items():
        path = parent_key + (key,)
        if not isinstance(value, dict):
            # Leaf value: store it under the full key path.
            d_out[path] = value
            continue
        # Nested mapping: recurse, carrying the path accumulated so far.
        flatten_dict(value, d_out, path)

In [None]:
# export
def gen_extracted_features(vid_ds, mdl, fps, ftk):
    """Run `extract_features` on every video in `vid_ds`, grouped by app / bug / report.

    Args:
        vid_ds: dataset whose `.labels` is iterated for app names and which supports
            nested indexing `vid_ds[app][bug][report]` to reach a single video.
        mdl: forwarded unchanged to `extract_features`.
        fps: forwarded unchanged to `extract_features`.
        ftk: forwarded to `extract_features` as `frames_to_keep`.

    Returns:
        dict of the form `{app: {bug: {report: {'features': ...}}, 'elapsed_time': secs}}`,
        where `elapsed_time` is the wall-clock time spent on that app.
    """
    features_by_app = {}
    for app in tqdm(vid_ds.labels):
        started = time.time()
        per_bug = {}
        for bug in vid_ds[app]:
            per_bug[bug] = {
                report: {
                    'features': extract_features(vid_ds[app][bug][report], mdl, fps, frames_to_keep = ftk)
                }
                for report in vid_ds[app][bug]
            }
        # Timing is stored next to the bug entries, under a reserved key.
        per_bug['elapsed_time'] = time.time() - started
        features_by_app[app] = per_bug

    return features_by_app

In [None]:
# export
def gen_tfidfs(vid_ds_features, vw, codebook, df, ftk):
    """Turn the extracted features of every report into the output of `calc_tf_idf`.

    Args:
        vid_ds_features: nested dict `{app: {bug: {report: {'features': ...}}}}`; a
            per-app `'elapsed_time'` entry is skipped.
        vw: forwarded to `new_get_bovw`.
        codebook: forwarded to `new_get_bovw`.
        df: forwarded to `calc_tf_idf`.
        ftk: unused here; kept so the signature matches the other generators.

    Returns:
        Nested defaultdict `{app: {bug: {report: calc_tf_idf(...)}}}`.
    """
    tfidfs = defaultdict(lambda: defaultdict(dict))

    for app, bugs in vid_ds_features.items():
        for bug, reports in bugs.items():
            # 'elapsed_time' is the per-app timing entry, not a bug.
            if bug == 'elapsed_time':
                continue
            for report, entry in reports.items():
                bovw = new_get_bovw(entry['features'], codebook, vw)
                tfidfs[app][bug][report] = calc_tf_idf(bovw, df)

    return tfidfs

In [None]:
# export
def gen_bovw_similarity(vid_ds, vid_ds_features, mdl, codebook, vw, ftk):
    """Compute the pairwise cosine similarity between the per-report vectors from `gen_tfidfs`.

    Args:
        vid_ds: dataset whose `.labels` maps app -> bug -> iterable of reports.
        vid_ds_features: nested feature dict with a per-app `'elapsed_time'` entry.
        mdl: unused in this function.
        codebook: object exposing `labels_`; also forwarded to `gen_tfidfs`.
        vw: number of histogram bins (`range(vw + 1)` gives the bin edges).
        ftk: forwarded to `gen_tfidfs`.

    Returns:
        `(df, results)`: `df` is the histogram of `codebook.labels_`, and
        `results[app][bug_i][report_i][bug_j][report_j]['bovw']` holds the cosine
        similarity of each unordered pair of distinct reports. `results[app]['elapsed_time']`
        is the time spent on the pairs plus the app's feature-extraction time.
    """
    def _nested(levels):
        # Factory for a defaultdict nested `levels` deep whose leaves default to float().
        if levels == 1:
            return lambda: defaultdict(float)
        inner = _nested(levels - 1)
        return lambda: defaultdict(inner)

    results = _nested(6)()

    vid_ds_features = copy.deepcopy(vid_ds_features)
    df = np.histogram(codebook.labels_, bins = range(vw + 1))[0]
    vid_tfids = gen_tfidfs(vid_ds_features, vw, codebook, df, ftk)
    for app, bugs in vid_ds.labels.items():
        started = time.time()
        videos = [(bug, report) for bug in bugs for report in bugs[bug] if bug != 'elapsed_time']
        for first, second in combinations_with_replacement(videos, 2):
            # Skip the (x, x) pairs that "with replacement" adds.
            if first == second:
                continue
            (bug_i, report_i), (bug_j, report_j) = first, second
            vec_i = vid_tfids[app][bug_i][report_i]
            vec_j = vid_tfids[app][bug_j][report_j]
            cosine = np.dot(vec_i, vec_j) / (np.linalg.norm(vec_i) * np.linalg.norm(vec_j))
            results[app][bug_i][report_i][bug_j][report_j]['bovw'] = cosine
        finished = time.time()
        results[app]['elapsed_time'] = finished - started + vid_ds_features[app]['elapsed_time']

    return df, results

In [None]:
# export
# Modified from geeksforgeeks: https://www.geeksforgeeks.org/longest-common-substring-dp-29/
def fuzzy_LCS(X, Y, m, n, sim_func, codebook, df, vw, mdl_frame_threshold = 0.0):
    """Fuzzy longest-common-substring similarity between two sequences.

    Two elements "match" when `sim_func(x, y, codebook, df, vw)` is strictly greater
    than `mdl_frame_threshold`; a run of consecutive matches accumulates the
    similarity values instead of a plain count.

    Args:
        X, Y: indexable sequences; elements `0..m-1` of `X` and `0..n-1` of `Y` are compared.
        m, n: number of elements of `X` / `Y` to use.
        sim_func: callable `(x, y, codebook, df, vw) -> number`.
        codebook, df, vw: passed through to `sim_func` untouched.
        mdl_frame_threshold: minimum (exclusive) similarity for two elements to match.

    Returns:
        `(lcs, weighted_lcs)`: the best accumulated run divided by `min(m, n)`, and the
        position-weighted score of that same run divided by the weight sum computed
        below. `(0.0, 0.0)` is returned when either sequence is empty.
    """
    # An empty sequence shares no substring with anything. Return early, otherwise
    # the normalisations below divide by min(m, n) == 0 and by a zero weight sum.
    if m == 0 or n == 0:
        return 0.0, 0.0

    # Row/column 0 are the empty-prefix base case; every cell starts at 0, so only
    # matching cells need to be written.
    LCSuff = [[0] * (n + 1) for _ in range(m + 1)]
    LCSuff_weighted = [[0] * (n + 1) for _ in range(m + 1)]

    # Best accumulated run so far, and the weighted score of that same run.
    result = result_weighted = 0

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            sim = sim_func(X[i - 1], Y[j - 1], codebook, df, vw)
            if sim > mdl_frame_threshold:
                LCSuff[i][j] = LCSuff[i - 1][j - 1] + sim
                # Matches closer to the end of both sequences weigh more.
                LCSuff_weighted[i][j] = LCSuff_weighted[i - 1][j - 1] + sim * (i / m) * (j / n)
                if LCSuff[i][j] > result:
                    result = LCSuff[i][j]
                    result_weighted = LCSuff_weighted[i][j]

    mini, maxi = min(m, n), max(m, n)
    # Normaliser for the weighted score.
    # NOTE(review): max_v starts at maxi + 1, so the first weight is (maxi + 1) / maxi > 1
    # and two identical sequences score below 1.0 on the weighted measure. Starting at
    # maxi would make the normaliser equal the best attainable weighted run. Left
    # unchanged because it would shift every existing w-LCS score - confirm intent.
    sum_w = 0
    max_v = maxi + 1
    for i in reversed(range(1, mini + 1)):
        sum_w += (i / mini) * (max_v / maxi)
        max_v -= 1
    return result / mini, result_weighted / sum_w

Some examples showing the differences between f-LCS and w-LCS

This example shows a case where a large overlap makes the f-LCS similarity high, but because the overlap is not toward the end of both sequences, the w-LCS similarity is lower.

In [None]:
# The shared run 3, 4, 5 ends X but starts Y.
X = [1, 2, 3, 4, 5]
Y = [3, 4, 5, 6, 7, 8]
# Toy similarity: 1 when the two elements are equal, 0 otherwise (extra args are ignored).
test_sim_func = lambda *args: int(args[0] == args[1])
lcs, weighted_lcs = fuzzy_LCS(X, Y, len(X), len(Y), test_sim_func, None, None, None)
# Last expression of the cell: displays both scores.
lcs, weighted_lcs

This example shows that if the overlap is toward the end of both sequences, w-LCS yields a higher similarity than f-LCS.

In [None]:
# The shared run 3, 4, 5 sits at the end of both sequences.
X = [1, 2, 3, 4, 5]
Y = [11, 12, 3, 4, 5]
# Reuses `test_sim_func` defined in the previous example cell.
lcs, weighted_lcs = fuzzy_LCS(X, Y, len(X), len(Y), test_sim_func, None, None, None)
# Last expression of the cell: displays both scores.
lcs, weighted_lcs

In [None]:
# export
def gen_lcs_similarity(vid_ds, vid_ds_features, sim_func, mdl, codebook, df, vw, ftk):
    """Compute `fuzzy_LCS` scores for every unordered pair of distinct reports in each app.

    Args:
        vid_ds: dataset whose `.labels` maps app -> bug -> iterable of reports.
        vid_ds_features: nested dict `{app: {bug: {report: {'features': ...}}}}` with a
            per-app `'elapsed_time'` entry.
        sim_func, codebook, df, vw: forwarded to `fuzzy_LCS`.
        mdl, ftk: unused in this function.

    Returns:
        Nested defaultdict where `results[app][bug_i][report_i][bug_j][report_j]` holds
        the keys `'lcs'` and `'weighted_lcs'`, and `results[app]['elapsed_time']` is the
        time spent on the pairs plus the app's feature-extraction time.
    """
    def _nested(levels):
        # Factory for a defaultdict nested `levels` deep whose leaves default to float().
        if levels == 1:
            return lambda: defaultdict(float)
        inner = _nested(levels - 1)
        return lambda: defaultdict(inner)

    results = _nested(6)()

    vid_ds_features = copy.deepcopy(vid_ds_features)
    for app, bugs in vid_ds.labels.items():
        started = time.time()
        videos = [(bug, report) for bug in bugs for report in bugs[bug] if bug != 'elapsed_time']
        # "With replacement" also yields (x, x); those self-pairs are filtered out.
        pairs = [pair for pair in combinations_with_replacement(videos, 2) if pair[0] != pair[1]]
        for (bug_i, report_i), (bug_j, report_j) in tqdm(pairs):
            feats_i = vid_ds_features[app][bug_i][report_i]['features']
            feats_j = vid_ds_features[app][bug_j][report_j]['features']
            lcs_sim, weighted_lcs_sim = fuzzy_LCS(
                feats_i, feats_j, len(feats_i), len(feats_j),
                sim_func, codebook, df, vw
            )
            entry = results[app][bug_i][report_i][bug_j][report_j]
            entry['lcs'] = lcs_sim
            entry['weighted_lcs'] = weighted_lcs_sim

        finished = time.time()
        results[app]['elapsed_time'] = finished - started + vid_ds_features[app]['elapsed_time']

    return results

In [None]:
# export
def fix_sims(vid_sims, vid_ds):
    """Mirror the pairwise similarities in place so both directions of a pair are filled.

    For every ordered pair of distinct reports `(i, j)` of an app, the entry
    `vid_sims[sim_type][app][i...][j...]` is overwritten with the `[j...][i...]` entry.
    Pairs are visited in reverse permutation order, so the (later, earlier) entries are
    filled from their (earlier, later) counterparts first; the remaining visits then
    copy each value back onto itself. This assumes the input only holds the
    (earlier, later) direction, as the pair generation in `approach` produces.

    Args:
        vid_sims: nested mapping `sim_type -> app -> bug -> report -> bug -> report -> value`.
        vid_ds: mapping-like dataset, `vid_ds[app]` -> bug -> iterable of reports.

    Returns:
        The same `vid_sims` object, after mutation.
    """
    for sim_type in vid_sims:
        for app in vid_sims[sim_type]:
            app_sims = vid_sims[sim_type][app]
            videos = [(bug, report) for bug in vid_ds[app] for report in vid_ds[app][bug] if bug != 'elapsed_time']
            ordered_pairs = [pair for pair in permutations(videos, 2) if pair[0] != pair[1]]
            for target, source in ordered_pairs[::-1]:
                (bug_i, report_i), (bug_j, report_j) = target, source
                app_sims[bug_i][report_i][bug_j][report_j] = app_sims[bug_j][report_j][bug_i][report_i]

    return vid_sims

In [None]:
# export
def sort_rankings(vid_sims):
    """Rank, for every report, all other reports by descending similarity.

    Args:
        vid_sims: nested mapping `sim_type -> app -> bug -> report -> {nested scores}`;
            each app also carries an `'elapsed_time'` entry, which is copied through.

    Returns:
        dict `sim_type -> app -> bug -> report -> OrderedDict`, where each OrderedDict
        maps the flattened key path produced by `flatten_dict` to its score, best first.
        Ties keep their original relative order; NaN scores are placed last.
    """
    def _score(item):
        # Sort on the numeric value. Sorting on str(value) is lexicographic, which
        # ranks e.g. '1e-05' and 'nan' above '0.9'.
        value = float(item[1])
        # NaN has no defined order; treat it as the worst possible score.
        return float('-inf') if np.isnan(value) else value

    sorted_rankings = {}
    for sim_type, apps in vid_sims.items():
        sorted_rankings[sim_type] = {}
        for app, app_sims in apps.items():
            ranked = {'elapsed_time': app_sims['elapsed_time']}
            for bug, reports in app_sims.items():
                if bug == 'elapsed_time': continue
                ranked[bug] = {}
                for report, sims in reports.items():
                    d_out = {}
                    flatten_dict(sims, d_out, tuple())
                    ranked[bug][report] = OrderedDict(
                        sorted(d_out.items(), key = _score, reverse = True)
                    )
            sorted_rankings[sim_type][app] = ranked

    return sorted_rankings

In [None]:
# export
def approach(
    vid_ds, vid_ds_features, bovw_vid_ds_sims, lcs_vid_ds_sims,
    mdl, sim_func, codebook, df, vw, fps = 30, ftk = 1
):
    """Combine precomputed BoVW and LCS similarities into per-technique rankings.

    Five similarity tables are produced: `'bovw'`, `'lcs'`, `'weighted_lcs'`, and the
    averages `'bovw_lcs'` and `'bovw_weighted_lcs'`. Each is mirrored with `fix_sims`
    and ranked with `sort_rankings`.

    Args:
        vid_ds: dataset whose `.labels` maps app -> bug -> iterable of reports; also
            passed to `fix_sims`.
        vid_ds_features: not used by this function; kept for interface compatibility.
        bovw_vid_ds_sims: `[app][bug_i][report_i][bug_j][report_j]['bovw']` scores plus
            `[app]['elapsed_time']`.
        lcs_vid_ds_sims: same layout with `'lcs'` and `'weighted_lcs'` scores.
        mdl, sim_func, codebook, df, vw, fps, ftk: not used by this function; kept for
            interface compatibility.

    Returns:
        The result of `sort_rankings` on the mirrored similarity tables.
    """
    vid_ds_sims = defaultdict(
        lambda: defaultdict(
            lambda: defaultdict(
                lambda: defaultdict(
                    lambda: defaultdict(
                        lambda: defaultdict(
                            lambda: defaultdict(float)
                        )
                    )
                )
            )
        )
    )

    # The similarity tables are copied before being read (reading a defaultdict inserts
    # missing keys). `vid_ds_features` is never read here, so it is no longer
    # deep-copied - that copy duplicated every extracted feature for nothing.
    bovw_vid_ds_sims = copy.deepcopy(bovw_vid_ds_sims)
    lcs_vid_ds_sims = copy.deepcopy(lcs_vid_ds_sims)
    for app, bugs in vid_ds.labels.items():
        l = [(bug, report) for bug in bugs for report in bugs[bug] if bug != 'elapsed_time']
        # Unordered pairs of distinct reports; the reverse direction is filled by fix_sims.
        pairs = list(x for x in combinations_with_replacement(l, 2) if x[0] != x[1])
        for (bug_i, report_i), (bug_j, report_j) in tqdm(pairs):
            lcs = lcs_vid_ds_sims[app][bug_i][report_i][bug_j][report_j]['lcs']
            weighted_lcs = lcs_vid_ds_sims[app][bug_i][report_i][bug_j][report_j]['weighted_lcs']
            vid_ds_sims['lcs'][app][bug_i][report_i][bug_j][report_j] = lcs
            vid_ds_sims['weighted_lcs'][app][bug_i][report_i][bug_j][report_j] = weighted_lcs

            bovw = bovw_vid_ds_sims[app][bug_i][report_i][bug_j][report_j]['bovw']
            vid_ds_sims['bovw'][app][bug_i][report_i][bug_j][report_j] = bovw
            # Combined techniques are the plain average of the two scores.
            vid_ds_sims['bovw_lcs'][app][bug_i][report_i][bug_j][report_j] = (bovw + lcs) / 2
            vid_ds_sims['bovw_weighted_lcs'][app][bug_i][report_i][bug_j][report_j] = (bovw + weighted_lcs) / 2

        bovw_time = bovw_vid_ds_sims[app]['elapsed_time']
        lcs_time = lcs_vid_ds_sims[app]['elapsed_time']

        # Combined techniques are charged the time of both components.
        vid_ds_sims['bovw'][app]['elapsed_time'] = bovw_time
        vid_ds_sims['lcs'][app]['elapsed_time'] = lcs_time
        vid_ds_sims['weighted_lcs'][app]['elapsed_time'] = lcs_time
        vid_ds_sims['bovw_lcs'][app]['elapsed_time'] = bovw_time + lcs_time
        vid_ds_sims['bovw_weighted_lcs'][app]['elapsed_time'] = bovw_time + lcs_time

    fixed_vid_ds_sims = fix_sims(vid_ds_sims, vid_ds)
    rankings = sort_rankings(fixed_vid_ds_sims)
    return rankings

In [None]:
# Build the dataset used by the cells below from the validation-set directory.
# NOTE(review): absolute, environment-specific path - consider a configurable data root.
path = Path("/tf/data/datasets/validation_set")
vid_ds = VideoDataset.from_path(path).label_from_paths()
# Last expression of the cell, so its return value is displayed.
vid_ds.get_labels()

In [None]:
# Settings shared by the cells below.
fps = 30
ftk = 1      # passed on as `frames_to_keep` during feature extraction
vw = 1_000   # number of visual words; also selects the codebook file
model_01 = 'M01'
# NOTE(review): absolute, environment-specific paths - consider a configurable data root.
simclr = SimCLRModel.load_from_checkpoint(checkpoint_path = '/tf/data/models/simclr/checkpointepoch=98.ckpt').eval()
M01 = SimCLRExtractor(simclr)

fname = f'/tf/data/models/codebooks/M01/cookbook_M01_{vw}vw.model'
# Close the file deterministically instead of leaking the handle.
# NOTE: unpickling executes arbitrary code - only load codebooks from a trusted source.
with open(fname, 'rb') as codebook_file:
    codebook_01 = pickle.load(codebook_file)

In [None]:
# Extract features once, then derive both similarity tables from them.
# `df` returned by the BoVW step is reused by the LCS step and by `approach` below.
vid_ds_features = gen_extracted_features(vid_ds, M01, fps, ftk)
df, bovw_vid_ds_sims = gen_bovw_similarity(vid_ds, vid_ds_features, M01, codebook_01, vw, ftk)
lcs_vid_ds_sims = gen_lcs_similarity(vid_ds, vid_ds_features, simclr_frame_sim, M01, codebook_01, df, vw, ftk)

In [None]:
# Combine the precomputed BoVW and LCS similarities into per-technique rankings.
# NOTE(review): the two commented-out lines below look like leftovers - `gen_similarity`
# is not defined in this notebook; consider deleting them.
# vid_ds_features = gen_extracted_features(vid_ds, M01, fps, ftk)
# df, vid_ds_sims = gen_similarity(vid_ds, vid_ds_features, M01, codebook_01, vw, ftk)
rankings_01 = approach(
    vid_ds, vid_ds_features, bovw_vid_ds_sims, lcs_vid_ds_sims, M01, simclr_frame_sim,
    codebook_01, df, vw, fps = fps, ftk = ftk,
)

In [None]:
# Display the rankings produced from the weighted LCS scores.
rankings_01['weighted_lcs']

In [None]:
# Score the LCS rankings; `evaluate` is not defined in this notebook
# (presumably it comes from the `tango.eval` star import - confirm).
evals = evaluate(rankings_01['lcs'])

In [None]:
# export
def compute_sims(q_vid, vid_ds, model, codebook, vw, fps, ftk):
    """Rank every video in `vid_ds` by cosine similarity to a query video.

    Args:
        q_vid: query video, passed to `extract_features`.
        vid_ds: dataset whose `.labels` is iterated for app names and which supports
            nested iteration/indexing `vid_ds[app][bug]` -> reports.
        model: forwarded to `extract_features` / `gen_extracted_features`.
        codebook: object exposing `labels_`; also forwarded to the BoVW helpers.
        vw: number of histogram bins (`range(vw + 1)` gives the bin edges).
        fps: forwarded to the feature extractors.
        ftk: forwarded as `frames_to_keep`.

    Returns:
        OrderedDict mapping `(app, bug, report)` to its similarity with the query,
        best first. Ties keep their original relative order; NaN scores (e.g. from a
        zero-norm vector) are placed last.
    """
    def _score(item):
        # Sort on the numeric value. Sorting on str(value) is lexicographic, which
        # ranks e.g. '1e-05' and 'nan' above '0.9'.
        value = float(item[1])
        return float('-inf') if np.isnan(value) else value

    df = np.histogram(codebook.labels_, bins = range(vw + 1))[0]

    q_features = extract_features(q_vid, model, fps, frames_to_keep = ftk)
    bovw = new_get_bovw(
        q_features,
        codebook, vw
    )
    q_tfids = calc_tf_idf(bovw, df)
    # The query norm is the same for every comparison; compute it once.
    q_norm = np.linalg.norm(q_tfids)

    vid_ds_features = gen_extracted_features(vid_ds, model, fps, ftk)
    vid_ds_tfids = gen_tfidfs(vid_ds_features, vw, codebook, df, ftk)
    results = {}
    for app in tqdm(vid_ds.labels):
        results[app] = {}
        for bug in vid_ds[app]:
            results[app][bug] = {}
            for report in vid_ds[app][bug]:
                report_tfids = vid_ds_tfids[app][bug][report]
                results[app][bug][report] = np.dot(q_tfids, report_tfids) / (q_norm * np.linalg.norm(report_tfids))

    d_out = {}
    flatten_dict(results, d_out, tuple())
    sorted_rankings = OrderedDict(
        sorted(d_out.items(), key = _score, reverse = True)
    )

    return sorted_rankings

In [None]:
# nbdev export step, kept in the final cell of the notebook; presumably this writes the
# cells marked `# export` to the module named by `#default_exp` - see the nbdev docs.
from nbdev.export import notebook2script
notebook2script()