# System setup

## Install missing dependencies in the docker file

In [1]:
!pip install bitstring
!pip install ipywidgets
!pip install plotly
!pip install textdistance[extras]
!pip install tqdm

Collecting textdistance[extras]
  Using cached textdistance-4.5.0-py3-none-any.whl (31 kB)
Collecting rapidfuzz>=2.6.0
  Downloading rapidfuzz-2.13.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting python-Levenshtein
  Downloading python_Levenshtein-0.20.8-py3-none-any.whl (9.4 kB)
Collecting abydos
  Downloading abydos-0.5.0-py2.py3-none-any.whl (886 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m886.0/886.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting jellyfish
  Downloading jellyfish-0.9.0.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pyxDamerauLevenshtein
  Using cached pyxDamerauLevenshtein-1.7.1.tar.gz 

## Import libraries and modules

In [2]:
import os
import sys
import math
import time

from typing import NamedTuple

import bitstring
import itertools
import json

import pandas as pd
from tqdm import tqdm
import numpy as np
import numba
import multiprocessing
import subprocess

import concurrent.futures as cf
import random
from collections import deque

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction import FeatureHasher

from collections import defaultdict

import textdistance

import plotly.offline as py
import plotly.graph_objs as go
import plotly.io as pio
from plotly.offline import init_notebook_mode, iplot
import plotly.express as px
import plotly.figure_factory as ff

# Define auxiliary functions

In [3]:
class FFProbeResult(NamedTuple):
    return_code: int
    json: str
    error: str


def get_video_data(file_path) -> FFProbeResult:
    command_array = ["ffprobe",
                     "-v", "quiet",
                     "-print_format", "json",
                     "-show_format",
                     "-show_streams",
                     file_path]
    result = subprocess.run(command_array, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    return FFProbeResult(return_code=result.returncode,
                         json=result.stdout,
                         error=result.stderr)

In [4]:
def parse_bitstream_signature(s):
    start_time = time.time()
    # MPEG7 Signature export can be found in https://github.com/FFmpeg/FFmpeg/blob/45ab5307a6e8c04b4ea91b1e1ccf71ba38195f7c/libavfilter/vf_signature.c#L485

    # NumOfSpatial Regions, only 1 supported (32 bits)
    # SpatialLocationFlag, always the whole image (1 bit)
    # PixelX,1 PixelY,1, 0,0 (16+16 bits)
    # PixelX,2 -> W (16 bits)
    # PixelY,2 -> H (16 bits
    # StartFrameOfSpatialRegion (32 bits)
    skipped, width, height, start_frame_spatial_region = s.readlist([65,16,16,32])
    
    # NumOfFrames (32 bits)
    # MediaTimeUnit (16 bits)
    # MediaTimeFlagOfSpatialRegion (1 bit)
    # StartMediaTimeOfSpatialRegion (32 bits)
    # EndMediaTimeOfSpatialRegion (32 bits)
    # NumOfSegments (32 bits)
    last_index, time_base_denominator, media_time_flag, start_media_time_flag, last_coarse_pts, num_segments = s.readlist([32,16,1,32, 32, 32])

    # Read first coarse signature
    segments_dict = {}
    # Iterate through segments (45-frame long each at extraction time)
    for i in range(num_segments.uint):
        segment_signatures = []    
        # StartFrameOfSegment (32 bits)
        # EndFrameOfSegment (32 bits)
        # MediaTimeFlagOfSegment (1 bit)
        # StartMediaTimeOfSegment (32 bits)
        # EndMediaTimeOfSegment (32 bits)
        first_index, last_index, media_time_flag, first_pts, last_pts = s.readlist([32,32,1,32,32])
        for j in range(5):
            # put 243 bits ( = 7 * 32 + 19 = 8 * 28 + 19) into buffer 
            #for k in range(30):
            #    segment_signatures.append(s.read(8))
            segment_signatures.append(s.read(243))
        
        segments_dict[str(i)] = segment_signatures

    # CompressionFlag (1 bit)
    compression_flag = s.read(1)
    
    # Then come the fine signature elements
    # The “frame confidence” c , the “words” w and the “frame
    # signature” x make up the fine signature s of the Video
    # Signature.
    frames_dict = {}
    words_dict = {}
    confidence_dict = {}
    # Iterate through frames one by one
    for i in range(last_index.uint):
        words = []
        frame_signature = []
        # MediaTimeFlagOfFrame (1 bit)
        # MediaTimeOfFrame (32 bits)
        # FrameConfidence (8 bits)
        media_time_flag, pts, confidence = s.readlist([1, 32, 8])
        
        # Words
        for l in range(5):
            # Word (8 bits)
            words.append(s.read(8).uint)
        # Frame Signature
        for l in range(76):
            # framesignature (8 bits)
            frame_signature.append(s.read(8).uint)

        frames_dict[str(i)] = frame_signature
        words_dict[str(i)] = words
        confidence_dict[str(i)] = confidence.uint
    process_time = time.time() - start_time
    
    return last_index.uint32 + 1, width.uint16 + 1, height.uint16 + 1, process_time, (last_index.uint32 + 1) / process_time , segments_dict, confidence_dict, words_dict, frames_dict 

def extract_title_data(df):
    signatures_df = pd.DataFrame(columns=['Path', 'Title', 'duration', 'framerate', 'clipsize', 'L', 'W', 'H', 'elapsed_time', 'process_FPS', 'coarse_sign', 'confidence', 'words', 'fine_sign'])

    for f in tqdm(df[0]['Path']):

        s = bitstring.ConstBitStream(filename=f)

        lenght, width, height, elapsed_time, process_time, coarse, confidence, words, fine = parse_bitstream_signature(s)
        signatures_list = []
        coarse = np.array(tuple(coarse.values()), dtype=np.uint8).flatten()
        confidence = np.array(tuple(confidence.values()), dtype=np.uint8).flatten()
        words = np.array(tuple(words.values()), dtype=np.uint8).flatten()
        fine = np.array(tuple(fine.values()), dtype=np.uint8).flatten()
        
        clip_filename = f.replace('.bin','').replace(signatures_directory, clips_directory)
    
        file_size = os.path.getsize(clip_filename)
        video_data = get_video_data(clip_filename)
        d = json.loads(video_data.json)
        for stream in d['streams']:
            if stream['codec_type'] == 'video':
                duration = d['format']['duration']
                framerate = float(stream['avg_frame_rate'].split('/')[0]) / float(stream['avg_frame_rate'].split('/')[1])
                break
                
        signatures_list.append(f)        
        signatures_list.append(f.replace('.bin','').replace(signatures_directory,''))
        signatures_list.append(duration)
        signatures_list.append(framerate)
        signatures_list.append(file_size)
        signatures_list.append(lenght)
        signatures_list.append(width)
        signatures_list.append(height)
        signatures_list.append(elapsed_time)
        signatures_list.append(process_time)
        signatures_list.append(coarse)
        signatures_list.append(confidence)
        signatures_list.append(words)
        signatures_list.append(fine)

        signatures_df.loc[len(signatures_df)] = signatures_list

    return signatures_df

def search_str(file_path, word):
    with open(file_path, 'r') as file:
        # read all content of a file
        content = file.read()
        # check if string present in a file

    if word in content:
        return True
    
    return False
    
def find_topic(annotations_directory, title):
    for filename in os.listdir(annotations_directory):
        f = os.path.join(annotations_directory, filename)

        # checking if it is a file
        if os.path.isfile(f):
            if search_str(f, title):
                return filename.replace('.txt','')
    return ('Topic not found')



In [5]:
def multiproc(df1):
    cpus = multiprocessing.cpu_count()
    procs = deque()
    df1_splits = np.array_split(df1, cpus)

    with cf.ProcessPoolExecutor(max_workers=cpus) as executor:
        for df1_slice in zip(df1_splits):
            procs.append(
                executor.submit(extract_title_data, df1_slice)
            )

    return (future.result() for future in cf.as_completed(procs))


def create_titles_df():
    
    titles_list = []
    for filename in os.listdir(signatures_directory):
        f = os.path.join(signatures_directory, filename)
        # checking if it is a file
        if os.path.isfile(f):
            titles_list.append((f, filename))
    return pd.DataFrame(data=titles_list, columns=['Path', 'Title'])

### Define environment variables

In [6]:
np.set_printoptions(threshold=sys.maxsize)
py.init_notebook_mode(connected=True)
pio.renderers.default = 'colab'

In [7]:
annotations_directory = '/home/jovyan/work/core_dataset/annotations/'
clips_directory = '/home/jovyan/work/core_dataset/videos/'
signatures_directory = 'processed_signatures/'
matches_directory = 'processed_matches/'

In [8]:
metrics = ['jaccard', 'cosine', 'tanimoto', 'bag', 'tversky',  'hamming']
signatures = ['coarse_sign', 'confidence']
#signatures = ['coarse_sign', 'fine_sign', 'words', 'confidence']

## Extract signatures 

In [None]:
df1 = create_titles_df()

In [None]:
df1

In [None]:
results = multiproc(df1)

signatures_df = pd.DataFrame()
for df_slice in results:
    signatures_df = pd.concat([signatures_df, df_slice], ignore_index=True)

In [None]:
display(signatures_df)

In [None]:
signatures_df['Topic'] = signatures_df['Title'].apply(lambda x: find_topic(annotations_directory, x))

In [None]:
def tokenize(doc):
    """Extract tokens from doc.

    This uses a simple regex that matches word characters to break strings
    into tokens. For a more principled approach, see CountVectorizer or
    TfidfVectorizer.
    """
    return np.unique(doc, return_counts=True)

def token_freqs(doc):
    """Extract a dict mapping tokens from doc to their occurrences."""
    freq_dict = defaultdict(int)
    values , frequencies = tokenize(doc)

    for A, B in zip(values, frequencies):
        freq_dict[str(A)] = B

    return freq_dict

In [None]:
n = 255
binary = False
for signature in signatures:
    if signature == 'coarse_sign':
        n = 1215
        binary = True
    data = signatures_df[signature].to_numpy()
    h = FeatureHasher(n_features=n, input_type="dict")
    f = h.fit_transform(token_freqs(d) for d in data).toarray().astype(int)

    signatures_df[f'{signature}_hashed'] =  list(f)

In [None]:
signatures_df.to_hdf('signatures.h5', key='key', mode='w')

### Load pre-computed data from csv (if available)

In [9]:
signatures_df_loaded = pd.read_hdf('signatures.h5', key='key', mode='r')
signatures_df = signatures_df_loaded
display(signatures_df.head())

Unnamed: 0,Path,Title,duration,framerate,clipsize,L,W,H,elapsed_time,process_FPS,coarse_sign,confidence,words,fine_sign,Topic,coarse_sign_hashed,confidence_hashed
0,processed_signatures/127dab55025984673f65d3a23...,127dab55025984673f65d3a23b1fea99ecc79b15.mp4,300.845,29.968921,103385387,9016,1440,1080,25.66618,351.279389,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[121, 121, 121, 121, 121, 182, 36, 48, 154, 15...","[121, 121, 121, 121, 121, 121, 121, 121, 121, ...",dove_evolution_commercial,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, -92, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1,processed_signatures/5ac8651ce41781440c8595d17...,5ac8651ce41781440c8595d174748f1a6cad1ff7.flv,64.9,30.0,5923082,1948,640,360,5.942043,327.833383,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[129, 130, 129, 129, 126, 130, 138, 127, 134, ...","[194, 119, 225, 95, 215, 194, 122, 234, 107, 2...","[152, 151, 162, 66, 231, 55, 115, 130, 52, 142...",david_beckham_lights_the_olympic_torch,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, -8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
2,processed_signatures/2fe7b38b1cbdfdf5d6075ee2b...,2fe7b38b1cbdfdf5d6075ee2b2cd9fee2cf7d7c1.flv,142.56,25.0,7721187,3565,424,320,11.831021,301.326486,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[111, 111, 111, 111, 111, 108, 110, 110, 109, ...","[106, 215, 175, 202, 236, 106, 215, 175, 202, ...","[117, 36, 12, 0, 84, 84, 19, 33, 209, 136, 35,...",scent_of_woman_tango,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, -59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
3,processed_signatures/caec7bae88f5b5d60a08487e9...,caec7bae88f5b5d60a08487e9d9a8cd27251c64e.flv,166.233,29.969697,5991444,4983,320,240,16.579916,300.54435,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[121, 121, 121, 121, 121, 233, 92, 120, 145, 1...","[121, 121, 121, 121, 121, 121, 121, 121, 121, ...",dove_evolution_commercial,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, -9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,processed_signatures/7b2158aad4ea2e6c8f149bb86...,7b2158aad4ea2e6c8f149bb869801b96734c1b1c.flv,99.366,29.969697,9638711,2979,640,480,10.320174,288.657931,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[121, 121, 121, 121, 121, 121, 121, 121, 121, ...","[121, 121, 121, 121, 121, 121, 121, 121, 121, ...",dove_evolution_commercial,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, -17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."


## Generate charts for dataset analysis

In [None]:
fig = px.histogram(signatures_df, x=['Topic'])
fig.update_layout( autosize=False,
                width=600,
                height=550,
                margin=dict(
                    l=10,
                    r=10,
                    b=10,
                    t=55,
                    pad=4
                ),
                paper_bgcolor="LightSteelBlue",
                showlegend=False,
                title={
                            'text': f'Video clip count by topic',
                            'y':0.95,
                            'x':0.5,
                            'xanchor': 'center',
                            'yanchor': 'top'},
                xaxis_title='',
                yaxis_title="Count"
            )
fig.show()

In [None]:
data_dict = {}
for topic in signatures_df['Topic'].unique():
    if topic is not None:
        data_dict[topic] = signatures_df[signatures_df['Topic']==topic]['L'].sum()
    
data_dict
topic_frames_df = pd.DataFrame(index=data_dict.keys(), data=data_dict.values())
topic_frames_df.columns = ['Total topic frames']

fig = px.bar(topic_frames_df, x=['Total topic frames'])
fig.update_layout( autosize=False,
                    width=600,
                    height=550,
                   margin=dict(
                        l=10,
                        r=10,
                        b=10,
                        t=55,
                        pad=4
                    ),
                    paper_bgcolor="LightSteelBlue",
                                  showlegend=False,
                                  title={
                                        'text': "Total frame count per topic",
                                        'y':0.95,
                                        'x':0.5,
                                        'xanchor': 'center',
                                        'yanchor': 'top'},
                    xaxis_title="Number of frames",
                    yaxis_title=""
                )

fig.show()


In [None]:
framerates = pd.to_numeric(signatures_df[signatures_df['framerate']<60]['duration'])
fig = px.histogram(framerates, nbins=30)
fig.update_layout( autosize=False,
                    width=600,
                    height=550,
                   margin=dict(
                        l=10,
                        r=10,
                        b=10,
                        t=55,
                        pad=4
                    ),
                    paper_bgcolor="LightSteelBlue",
                                  showlegend=False,
                                  title={
                                        'text': "Video clip count by duration",
                                        'y':0.95,
                                        'x':0.5,
                                        'xanchor': 'center',
                                        'yanchor': 'top'},
                    xaxis_title="Duration (s)",
                    yaxis_title="Count"
                )
fig.show()

In [None]:
fig = px.histogram(signatures_df, x=['L'], nbins=25)
fig.update_layout( autosize=False,
                width=600,
                height=550,
                margin=dict(
                    l=10,
                    r=10,
                    b=10,
                    t=55,
                    pad=4
                ),
                paper_bgcolor="LightSteelBlue",
                showlegend=False,
                title={
                            'text': f'Video clip count by number of frames',
                            'y':0.95,
                            'x':0.5,
                            'xanchor': 'center',
                            'yanchor': 'top'},
                xaxis_title='Lenght (in frames)',
                yaxis_title="Count"
            )
fig.show()

In [None]:
framerates = pd.to_numeric(signatures_df[signatures_df['framerate']<60]['framerate'])
fig = px.histogram(framerates, nbins=25)
fig.update_layout( autosize=False,
                    width=600,
                    height=550,
                   margin=dict(
                        l=10,
                        r=10,
                        b=10,
                        t=55,
                        pad=4
                    ),
                    paper_bgcolor="LightSteelBlue",
                                  showlegend=False,
                                  title={
                                        'text': "Video clip count by frame rate",
                                        'y':0.95,
                                        'x':0.5,
                                        'xanchor': 'center',
                                        'yanchor': 'top'},
                    xaxis_title="Frame rate (fps)",
                    yaxis_title="Count"
                )
fig.show()

# Obtain dataset with true positive matches

In [None]:
frames_col = 'detected-frames'
colnames=['Video-A', 'Video-B', 'Start-A', 'Start-B', frames_col]
detections_df = pd.DataFrame(columns=colnames)
for filename in os.listdir(matches_directory):
    f = os.path.join(matches_directory, filename)
    # checking if it is a file
    if os.path.isfile(f):
        print(f)
        topic_detections_df = pd.read_csv(f,index_col=[0],names=colnames, header=0)

        topic_detections_df['topic'] = filename

        detections_df = pd.concat([detections_df, topic_detections_df.head(20)], axis=0)

        detections_df = detections_df.reset_index(drop=True)
        detections_df = detections_df.drop(detections_df[detections_df[frames_col].astype(str).str.contains('error')].index)

        detections_df[frames_col] = detections_df[frames_col].astype(int)
display(detections_df)

In [None]:
def find_signature(x, signature_df, signature_column):
    signature = signature_df[signature_df['Title'].str.contains(x)][signature_column].values

    if len(signature) > 0:
        return signature[0]
    
    return 'No signature found'

In [None]:

for element in ['A', 'B']:
    detections_df[f'{element}_framerate'] = detections_df.apply(lambda x: find_signature(x[f'Video-{element}'], signatures_df, 'framerate'), axis=1)
    for signature_col in signatures:
    # Get signatures from signatures datataframe
        detections_df[f'{element}_{signature_col}'] = detections_df.apply(lambda x: find_signature(x[f'Video-{element}'], signatures_df, signature_col), axis=1)
        detections_df[f'{element}_{signature_col}_hashed'] = detections_df.apply(lambda x: find_signature(x[f'Video-{element}'], signatures_df, f'{signature_col}_hashed'), axis=1)

        # Cleanup dataframe
        detections_df = detections_df.drop(detections_df[detections_df[f'{element}_{signature_col}']=='No signature found'].index)
        detections_df = detections_df.drop(detections_df[detections_df[f'{element}_{signature_col}_hashed']=='No signature found'].index)


In [None]:
detections_df['framerate_diff'] = pd.to_numeric(detections_df['A_framerate'] - detections_df['B_framerate'])

In [None]:
def hamming_distance(a, b):
    return len([i for i in filter(lambda x: x[0] != x[1], zip(a, b))])

@numba.jit(forceobj=True)
def apply_integrate_f_numba(col_a, col_b, metric):
    n = len(col_a)
    
    result = np.empty(n, dtype="float64")
    if metric == 'jaccard':
        for i in range(n):
            result[i] = textdistance.jaccard(list(col_a[i]), list(col_b[i]))
    elif metric == 'cosine':
        for i in range(n):
            result[i] = textdistance.cosine(list(col_a[i]), list(col_b[i]))
    elif metric == 'tanimoto':
        for i in range(n):
            result[i] = textdistance.tanimoto(list(col_a[i]), list(col_b[i]))
    elif metric == 'bag':
        for i in range(n):
            result[i] = textdistance.bag(list(col_a[i]), list(col_b[i]))
    elif metric == 'tversky':
        for i in range(n):
            result[i] = textdistance.tversky(list(col_a[i]), list(col_b[i]))
    elif metric == 'hamming':
        for i in range(n):
            result[i] = hamming_distance(list(col_a[i]), list(col_b[i]))
    else:
        return f'Metric not found:{metric}'
    return result

In [None]:

def compute_numba(df, metric, signature_cols):

    result = tqdm(apply_integrate_f_numba(
        df[signature_cols[0]].values, df[signature_cols[1]].values, metric
    ))
    return pd.Series(result, index=df.index, name="result")
    


In [None]:
for signature_col in signatures:
    print(f'{signature_col}')
    for distance in metrics:
        detections_df[f'{signature_col}-{distance}'] = compute_numba(detections_df, distance, [f'A_{signature_col}', f'B_{signature_col}'])
        detections_df[f'{signature_col}-{distance}_hashed'] = compute_numba(detections_df, distance, [f'A_{signature_col}_hashed', f'B_{signature_col}_hashed'])

display(detections_df.describe())

In [None]:
topics = list(signatures_df['Topic'].unique())

detections_df['class'] = detections_df.apply(lambda x: (topics.index(x['topic'].replace('.csv',''))), axis=1)
detections_df['attack'] = 1

In [None]:
detections_df.to_hdf('detections.h5', key='key', mode='w')

### Load pre-computed data from csv (if available)

In [10]:
detections_df_loaded = pd.read_hdf('detections.h5', key='key', mode='r')
detections_df = detections_df_loaded
display(detections_df.head(5))

Unnamed: 0,Video-A,Video-B,Start-A,Start-B,detected-frames,topic,A_framerate,A_coarse_sign,A_coarse_sign_hashed,A_confidence,...,confidence-tanimoto,confidence-tanimoto_hashed,confidence-bag,confidence-bag_hashed,confidence-tversky,confidence-tversky_hashed,confidence-hamming,confidence-hamming_hashed,class,attack
0,0c3c3c22cd9caef7491e07a3176e6e27a0ef06a2.flv,0c3c3c22cd9caef7491e07a3176e6e27a0ef06a2.flv,1.533,1.533,2957,ronaldinho_ping_pong.csv,30.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[105, 97, 95, 95, 92, 93, 93, 93, 91, 93, 91, ...",...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,18,1
1,0c3c3c22cd9caef7491e07a3176e6e27a0ef06a2.flv,275ca13a391d89d08b7de46c23c4430070c3251c.flv,97.3,176.4,23,ronaldinho_ping_pong.csv,30.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[105, 97, 95, 95, 92, 93, 93, 93, 91, 93, 91, ...",...,-0.78489,-0.24041,1839.0,101.0,0.580396,0.846505,2913.0,157.0,18,1
2,0c3c3c22cd9caef7491e07a3176e6e27a0ef06a2.flv,4f1435d832285500d7663d893b9eeba69262321a.flv,97.267,158.68,93,ronaldinho_ping_pong.csv,30.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[105, 97, 95, 95, 92, 93, 93, 93, 91, 93, 91, ...",...,-0.827389,-0.187875,1576.0,79.0,0.563548,0.877898,2932.0,143.0,18,1
3,0c3c3c22cd9caef7491e07a3176e6e27a0ef06a2.flv,5567a1bb5b9767342dc99baea8d67508ca22c046.flv,97.267,158.68,93,ronaldinho_ping_pong.csv,30.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[105, 97, 95, 95, 92, 93, 93, 93, 91, 93, 91, ...",...,-0.827389,-0.187875,1576.0,79.0,0.563548,0.877898,2932.0,143.0,18,1
4,0c3c3c22cd9caef7491e07a3176e6e27a0ef06a2.flv,64c86092addbb5b3de44a4412d52509fdda5c38c.flv,40.133,119.4,89,ronaldinho_ping_pong.csv,30.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[105, 97, 95, 95, 92, 93, 93, 93, 91, 93, 91, ...",...,-0.772349,-0.159274,800.0,67.0,0.585463,0.895476,2907.0,144.0,18,1


## Generate charts with results

In [12]:

fig = go.Figure()

x_label='detected-frames'
y_label='coarse_sign-cosine_hashed'
for topic in np.unique(detections_df['topic']):
    chart_df = detections_df[(detections_df['topic']==topic)]
    fig.add_trace(
        go.Scatter(
            x=pd.to_numeric(chart_df[x_label]),
            y=pd.to_numeric(chart_df[y_label]),
            mode="markers",
            name=topic,
            text=chart_df[['Video-A', 'Video-B', y_label]],
            hovertemplate='<b>%{text}</b>',
            showlegend=True
        )
)

fig.update_layout(
    autosize=False,
    width=1500,
    height=800,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    paper_bgcolor="LightSteelBlue",
)
fig.update_xaxes(
        tickangle = 90,
        title_text = x_label,
        title_font = {"size": 20},
        title_standoff = 25)

fig.update_yaxes(
        title_text = y_label,
        title_font = {"size": 20},
        title_standoff = 25)
fig.show()

In [None]:
for i in detections_df.index:
    print((detections_df['A_coarse_sign_hashed'] - detections_df['B_coarse_sign_hashed'])[i])


In [None]:
import plotly.figure_factory as ff

for hashing in ['', '_hashed']:
    measurements = []
    for signature_col in signatures:
        for distance in metrics:
            measurements.append(f'{signature_col}-{distance}{hashing}')

    metrics_cols = [metric for metric in measurements]
    metrics_cols.append('detected-frames')

    corr = detections_df[metrics_cols].corr()
    mask = np.triu(np.ones_like(corr, dtype=bool))
    df_mask = corr.mask(mask)

    z = corr.values.tolist()

    z_text = [[str(round(y, 1)) for y in x] for x in z]

    fig = ff.create_annotated_heatmap(z=df_mask.to_numpy(), 
                                      x=df_mask.columns.tolist(),
                                      y=df_mask.columns.tolist(),
                                      colorscale=px.colors.diverging.RdBu,
                                      hoverinfo="none", #Shows hoverinfo for null values
                                      annotation_text=z_text,
                                      showscale=False, ygap=1, xgap=1
                                     )

    fig.update_xaxes(side="bottom")

    fig.update_layout(
        title_text=f'Correlation between distances', 
        title_x=0.5, 
        width=1000, 
        height=1000,
        xaxis_showgrid=False,
        yaxis_showgrid=False,
        xaxis_zeroline=False,
        yaxis_zeroline=False,
        yaxis_autorange='reversed',
        template='plotly_white'
    )

    # NaN values are not handled automatically and are displayed in the figure
    # So we need to get rid of the text manually
    for i in range(len(fig.layout.annotations)):
        if fig.layout.annotations[i].text == 'nan':
            fig.layout.annotations[i].text = ""

    fig.show()

# Obtain dataset with true negative matches

In [None]:
true_negatives_df = pd.DataFrame()

A_titles = signatures_df['Title'].sample(frac=1)
B_titles = signatures_df['Title'].sample(frac=1)

true_negatives_df['Video-A'] = A_titles.values
true_negatives_df['Video-B'] = B_titles.values

true_negatives_df['Topic-A'] = true_negatives_df['Video-A'].apply(lambda x: find_topic(annotations_directory, x.replace('.bin','').replace(signatures_directory,'')))
true_negatives_df['Topic-B'] = true_negatives_df['Video-B'].apply(lambda x: find_topic(annotations_directory, x.replace('.bin','').replace(signatures_directory,'')))

true_negatives_df = true_negatives_df.drop(true_negatives_df[true_negatives_df['Topic-A']==true_negatives_df['Topic-B']].index).reset_index()
true_negatives_df = true_negatives_df.dropna(axis='rows')

In [None]:
for element in ['A', 'B']:
    true_negatives_df[f'{element}_framerate'] = true_negatives_df.apply(lambda x: find_signature(x[f'Video-{element}'], signatures_df, 'framerate'), axis=1)
    for signature_col in signatures:

        # Get signatures from coarse datataframe
        true_negatives_df[f'{element}_{signature_col}'] = true_negatives_df.apply(lambda x: find_signature(x[f'Video-{element}'], signatures_df, signature_col), axis=1)
        true_negatives_df[f'{element}_{signature_col}_hashed'] = true_negatives_df.apply(lambda x: find_signature(x[f'Video-{element}'], signatures_df, f'{signature_col}_hashed'), axis=1)

        # Cleanup dataframe
        true_negatives_df = true_negatives_df.drop(true_negatives_df[true_negatives_df[f'{element}_{signature_col}']=='No signature found'].index)
        true_negatives_df = true_negatives_df.drop(true_negatives_df[true_negatives_df[f'{element}_{signature_col}_hashed']=='No signature found'].index)


In [None]:
for signature_col in signatures:
    for distance in tqdm(metrics):
        true_negatives_df[f'{signature_col}-{distance}'] = compute_numba(true_negatives_df, distance, [f'A_{signature_col}', f'B_{signature_col}'])
        true_negatives_df[f'{signature_col}-{distance}_hashed'] = compute_numba(true_negatives_df, distance, [f'A_{signature_col}_hashed', f'B_{signature_col}_hashed'])

In [None]:
topics = list(true_negatives_df['Topic-A'].unique())
true_negatives_df['framerate_diff'] = pd.to_numeric(true_negatives_df['A_framerate'] - true_negatives_df['B_framerate'])

true_negatives_df['Video-A'] =  true_negatives_df.apply(lambda x: (x['Video-A'].replace(signatures_directory, '')), axis=1)
true_negatives_df['Video-B'] =  true_negatives_df.apply(lambda x: (x['Video-B'].replace(signatures_directory, '')), axis=1)
true_negatives_df['class'] =  true_negatives_df.apply(lambda x: (topics.index(x['Topic-A'].replace('.csv',''))), axis=1)
true_negatives_df['attack'] = 0
true_negatives_df['detected-frames'] = 0

In [None]:
true_negatives_df.to_hdf('true_negatives.h5', key='key', mode='w')

### Load pre-computed data from csv (if available)

In [None]:
true_negatives_df_loaded = pd.read_hdf('true_negatives.h5', key='key', mode='r')
true_negatives_df = true_negatives_df_loaded
display(true_negatives_df.head(5))

## Generate charts with results

In [None]:

fig = go.Figure()

for topic in np.unique(true_negatives_df['Topic-A']):
    fig.add_trace(
        go.Scatter(
            x=pd.to_numeric(true_negatives_df[(true_negatives_df['Topic-A']==topic)][x_label]),
            y=pd.to_numeric(true_negatives_df[(true_negatives_df['Topic-A']==topic)][y_label]),
            mode="markers",
            #marker=dict(size=0.01*detections_df[detections_df['topic']==topic]['Frames']),
            name=topic,
            text=true_negatives_df[(true_negatives_df['Topic-A']==topic)][['Video-A', 'Video-B', y_label]],
            hovertemplate='<b>%{text}</b>',
            showlegend=True
        )
)

fig.update_layout(
    autosize=False,
    width=1500,
    height=800,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    paper_bgcolor="LightSteelBlue",
)
fig.update_xaxes(
        tickangle = 90,
        title_text = x_label,
        title_font = {"size": 20},
        title_standoff = 25)

fig.update_yaxes(
        title_text = y_label,
        title_font = {"size": 20},
        title_standoff = 25)
fig.show()

In [None]:
for hashing in ['', '_hashed']:
    measurements = []
    for signature_col in signatures:
        for distance in metrics:
            measurements.append(f'{signature_col}-{distance}{hashing}')

    metrics_cols = [metric for metric in measurements]

    corr = true_negatives_df[metrics_cols].corr()
    display(true_negatives_df[metrics_cols])
    mask = np.triu(np.ones_like(corr, dtype=bool))
    df_mask = corr.mask(mask)

    z = corr.values.tolist()

    z_text = [[str(round(y, 1)) for y in x] for x in z]

    fig = ff.create_annotated_heatmap(z=df_mask.to_numpy(), 
                                      x=df_mask.columns.tolist(),
                                      y=df_mask.columns.tolist(),
                                      colorscale=px.colors.diverging.RdBu,
                                      hoverinfo="none", #Shows hoverinfo for null values
                                      annotation_text=z_text,
                                      showscale=False, ygap=1, xgap=1
                                     )

    fig.update_xaxes(side="bottom")

    fig.update_layout(
        title_text=f'Correlation between distances', 
        title_x=0.5, 
        width=1000, 
        height=1000,
        xaxis_showgrid=False,
        yaxis_showgrid=False,
        xaxis_zeroline=False,
        yaxis_zeroline=False,
        yaxis_autorange='reversed',
        template='plotly_white'
    )

    # NaN values are not handled automatically and are displayed in the figure
    # So we need to get rid of the text manually
    for i in range(len(fig.layout.annotations)):
        if fig.layout.annotations[i].text == 'nan':
            fig.layout.annotations[i].text = ""

    fig.show()

In [None]:
fig = go.Figure()

for topic in np.unique(true_negatives_df['Topic-A']):
    fig.add_trace(
        go.Scatter(
            x=pd.to_numeric(true_negatives_df[(true_negatives_df['Topic-A']==topic)][x_label]),
            y=pd.to_numeric(true_negatives_df[(true_negatives_df['Topic-A']==topic)][y_label]),
            mode="markers",
            #marker=dict(color=),
            name=topic,
            text=true_negatives_df[(true_negatives_df['Topic-A']==topic)][['Video-A', 'Video-B', y_label]],
            hovertemplate='<b>%{text}</b>',
            showlegend=True
        )
    )

for topic in np.unique(detections_df['topic']):
    fig.add_trace(
        go.Scatter(
            x=pd.to_numeric(detections_df[(detections_df['topic']==topic) & (detections_df['detected-frames']!=0)][x_label]),
            y=pd.to_numeric(detections_df[(detections_df['topic']==topic) & (detections_df['detected-frames']!=0)][y_label]),
            mode="markers",
            #marker=dict(color="red"),
            name=topic,
            text=detections_df[(detections_df['topic']==topic) & (detections_df['detected-frames']!=0)][['Video-A', 'Video-B', y_label]],
            hovertemplate='<b>%{text}</b>',
            showlegend=True
        )
)

fig.update_layout(
    autosize=False,
    width=1500,
    height=800,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    paper_bgcolor="LightSteelBlue",
)
fig.update_xaxes(
        tickangle = 90,
        title_text = x_label,
        title_font = {"size": 20},
        title_standoff = 25)

fig.update_yaxes(
        title_text = y_label,
        title_font = {"size": 20},
        title_standoff = 25)
fig.show()

# Merge datasets with true negative matches and true positives

In [None]:
combined_cols = metrics_cols + ['Video-A', 'Video-B', 'attack', 'class']
combined_df = pd.concat([true_negatives_df[combined_cols], detections_df[combined_cols]], axis=0, ignore_index=True)

combined_df.to_csv('combined.csv', index=False)

In [None]:
combined_df_loaded = pd.read_csv('combined.csv')
display(combined_df)

In [None]:
from sklearn.preprocessing import StandardScaler
# Separating out the features
pd.set_option('use_inf_as_na', True)

combined_df = combined_df_loaded.dropna(axis='rows')

x = combined_df[metrics_cols].values

# Standardizing the features
x = StandardScaler().fit_transform(x)

In [None]:
from sklearn.decomposition import PCA
num_components = 2
pca = PCA(n_components=num_components)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = [f'principal component {n}' for n in range(num_components)])

In [None]:
total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter_matrix(
    principalComponents,
    dimensions=range(num_components),
    color=combined_df['attack'],
    title=f'Total Explained Variance: {total_var:.2f}%',
)
fig.update_layout(
    autosize=False,
    width=1500,
    height=1500,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    paper_bgcolor="LightSteelBlue",
)
fig.update_traces(diagonal_visible=False)
fig.show()