In [1]:
import pandas as pd
import numpy as np
from os.path import exists
from tqdm import tqdm
import re
import datatable as dt
# Variables that contains the file location
from files import *
from functions import *
from sklearn.preprocessing import normalize
from collections import defaultdict

In [2]:
# Datatype used to reduce memory required
DTYPE = np.float32

In [3]:
# string ids are inefficient, let's use integers and a lookup table
def getRelationIdsToNumbers(df):
    """Map every index label of ``df`` to its positional number.

    Returns a ``(lookup, numbers)`` tuple: ``lookup`` is a dict from each
    label in ``df.index`` to its 0-based position, and ``numbers`` is the
    array ``0 .. len(df) - 1``.  If a label occurs more than once, the dict
    keeps the position of its last occurrence.
    """
    labels = df.index.values
    numbers = np.arange(len(labels))
    lookup = {label: position for position, label in enumerate(labels)}
    return lookup, numbers

def change_id_to_keys(df, id_to_keys):
    """Replace the original ids in ``df``'s index with their numeric ids, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the original ids.  It is modified in place
        and nothing is returned.
    id_to_keys : Mapping
        Lookup from original id to numeric id.

    Raises
    ------
    KeyError
        If an index label of ``df`` is missing from ``id_to_keys``.
    """
    new_ids = np.asarray(
        [id_to_keys[original] for original in df.index.values],
        dtype=np.int32,
    )
    # The earlier code called ``df.index.astype(np.int32, copy=False)`` and
    # threw the result away, so the index was never narrowed.  Building the
    # index from an int32 array makes the intended dtype stick on pandas
    # versions that support 32-bit indexes; older versions upcast to int64.
    df.index = pd.Index(new_ids, name="id")

In [4]:
# The original string ids are replaced by compact numeric ids to save memory.
# This cell builds that mapping, or loads it from disk when it was already
# generated.  Every dataframe must use the same mapping to stay consistent,
# so the index of the info table is the reference for all of them.

file_original_new_ids = "./data/relation_original_new_ids.csv"

info = dt.fread(file_info_2).to_pandas().set_index('id')
id_to_key, indexes = getRelationIdsToNumbers(info)

# NOTE(review): the numeric ids are stored as DTYPE (a float type), which is
# why they display as 0.0, 1.0, ...; an integer dtype looks more appropriate.
if not exists(file_original_new_ids):
    # First run: turn the lookup into a table and cache it on disk.
    relation_ids = (
        pd.DataFrame({
            'original': list(id_to_key.keys()),
            'newId': list(id_to_key.values()),
        })
        .set_index('original')
        .astype(DTYPE)
    )
    relation_ids.to_csv(file_original_new_ids)
else:
    # Later runs: reuse the cached table.
    relation_ids = pd.read_csv(file_original_new_ids).set_index('original').astype(DTYPE)

In [5]:
# Preview the mapping: each original id (index) next to its new numeric id
relation_ids.head(2)

Unnamed: 0_level_0,newId
original,Unnamed: 1_level_1
0009fFIM1eYThaPg,0.0
0010xmHR6UICBOYT,1.0


## Load datasets

Now that we have a mapping from original ids to new ids, we can load the datasets.

In [6]:
# Genre table, indexed by the original 'id' column (not cast to DTYPE)
genres  = dt.fread(file_genres_2).to_pandas().set_index('id')

In [7]:
# Text feature matrices (tf-idf, word2vec, BERT), each indexed by 'id' and
# cast to DTYPE.  The files are read in the order listed.
tf_idf, word2vec, bert = (
    dt.fread(path, header=True).to_pandas().set_index('id').astype(DTYPE)
    for path in (file_tfidf_2, file_word2vec_2, file_bert_2)
)

In [8]:
# BLF, Essentia and MFCC feature matrices, each indexed by 'id' and cast to
# DTYPE.  Targets and paths are listed in the same order.
(
    blf_correlation,
    blf_deltaspectral,
    blf_spectral,
    blf_spectralcontrast,
    blf_vardeltaspectral,
    essentia,
    mfcc_bow,
    mfcc_stats,
) = (
    dt.fread(path, header=True).to_pandas().set_index('id').astype(DTYPE)
    for path in (
        file_blf_correlation,
        file_blf_deltaspectral,
        file_blf_spectral,
        file_blf_spectralcontrast,
        file_blf_vardeltaspectral,
        file_essentia,
        file_mfcc_bow,
        file_mfcc_stats,
    )
)

In [9]:
# The blf_logfluc csv has one extra column name in its header, so the table is
# repaired while it is still a datatable Frame (before converting to pandas):
# keep 'id' plus every name from the third one on, drop the last column, and
# apply the corrected names.  This lets the original dataset load unchanged.
blf_logfluc = dt.fread(file_blf_logfluc)
new_cols = ('id',) + tuple(blf_logfluc.names[2:])
del blf_logfluc[:, -1]
blf_logfluc.names = new_cols

blf_logfluc = blf_logfluc.to_pandas().set_index('id').astype(DTYPE)

In [10]:
# Video feature matrices, each indexed by 'id' and cast to DTYPE.
incp, resnet, vgg19 = (
    dt.fread(path, header=True).to_pandas().set_index('id').astype(DTYPE)
    for path in (file_incp, file_resnet, file_vgg19)
)

In [11]:
# Registry of every feature frame under the short name used in output file
# names.  Insertion order is the order in which features are processed later.
datasets = dict(
    tfidf=tf_idf,
    word2vec=word2vec,
    bert=bert,
    mfcc_bow=mfcc_bow,
    mfcc_stats=mfcc_stats,
    essentia=essentia,
    blf_delta_spectral=blf_deltaspectral,
    blf_correlation=blf_correlation,
    blf_logfluc=blf_logfluc,
    blf_spectral=blf_spectral,
    blf_spectral_contrast=blf_spectralcontrast,
    blf_vardelta_spectral=blf_vardeltaspectral,
    incp=incp,
    vgg19=vgg19,
    resnet=resnet,
)

#### Change the original ids to the new ids of the loaded datasets

In [12]:
# Re-key the genre table in place with the numeric ids.
# NOTE(review): running this cell twice will presumably fail, because the
# index then already holds numeric ids that are not keys of id_to_key.
change_id_to_keys(genres,id_to_key)

In [13]:
# Re-key every registered feature frame in place with the numeric ids
for frame in datasets.values():
    change_id_to_keys(frame, id_to_key)

In [14]:
# Report the (rows, columns) shape of every feature frame
for name, frame in datasets.items():
    print(f"Feature {name}", frame.shape)

Feature tfidf (68641, 1000)
Feature word2vec (68641, 300)
Feature bert (68641, 768)
Feature mfcc_bow (68641, 500)
Feature mfcc_stats (68641, 104)
Feature essentia (68641, 1034)
Feature blf_delta_spectral (68641, 1372)
Feature blf_correlation (68641, 1326)
Feature blf_logfluc (68641, 3626)
Feature blf_spectral (68641, 980)
Feature blf_spectral_contrast (68641, 800)
Feature blf_vardelta_spectral (68641, 1344)
Feature incp (68641, 4096)
Feature vgg19 (68641, 8192)
Feature resnet (68641, 4096)


# Function to compute distance and only retrieve the top ids

In [15]:
def compute_top(data: np.ndarray, simfunction, idx_values, batches: int = 1, top: int = 100):
    """Return, for every row of ``data``, the ids of its ``top`` most similar rows.

    Similarities are computed batch by batch so that only a
    ``(batch, n_rows)`` similarity slice exists at any time.

    Parameters
    ----------
    data : np.ndarray
        2-D feature matrix, one row per document.
    simfunction : callable
        Called as ``simfunction(data, batch)``.  Its transposed result is
        used as a ``(len(batch), len(data))`` array in which a larger value
        means "more similar".
    idx_values : array-like
        Id of each row of ``data``, in row order.  The ids are stored in an
        int32 array, so they must be integer-like.
    batches : int
        Number of chunks the rows are split into.
    top : int
        How many neighbours to keep per row.

    Returns
    -------
    np.ndarray
        int32 array of shape ``(len(data), top)``; row ``r`` lists the ids of
        the rows most similar to row ``r``, best first.

    Raises
    ------
    ValueError
        If ``top`` exceeds the number of rows in ``data``.
    """
    n_rows = data.shape[0]
    if top > n_rows:
        # Previously this surfaced as an opaque broadcast error on assignment.
        raise ValueError(f"top={top} is larger than the number of rows ({n_rows})")

    idx_values = np.asarray(idx_values)
    top_values = np.zeros((n_rows, top), dtype=np.int32)

    batch_data = np.array_split(data, batches, axis=0)
    batch_rows = np.array_split(np.arange(n_rows), batches, axis=0)
    for batch, rows in tqdm(list(zip(batch_data, batch_rows))):
        # Similarity of every batch row (axis 0) to every document (axis 1).
        results = simfunction(data, batch).T

        # A document must not be returned as its own neighbour: overwrite its
        # self-similarity with -1 so it is not ranked at the start.
        results[(np.arange(batch.shape[0]), rows)] = -1

        # Cut the ranking down to `top` columns BEFORE translating positions
        # into ids; indexing first would build a full (batch, n_rows) id
        # matrix only to discard almost all of it.
        ranking = np.argsort(-results, axis=1)[:, :top]
        top_values[rows, :] = idx_values[ranking]

    return top_values

In [16]:
# Example on how to use the function
# topExample = compute_top(
#     tf_idf.to_numpy(dtype=DTYPE),
#     get_cosine_similarity,
#     tf_idf.index.values,
#     batches=100,
#     top=100
# )

## Computation of Top 100 ids with different feature vectors cosine similarity

In [19]:
# Print the csv path that holds the cosine top-100 ids of every feature set.
# The generation itself is disabled: uncomment the block below to recompute
# the ids with compute_top and write them to that path.
features = datasets.keys() 
for feature in features:
    file_name = f'./data/top_ids_cosine_{feature}.csv'
    print(file_name)
#Uncomment to generate ids
#     topData = compute_top(
#         datasets[feature].to_numpy(dtype=DTYPE), 
#         get_cosine_similarity,
#         datasets[feature].index.values,
#         batches=100,
#         top=100
#     )
#     dt.Frame(pd.DataFrame(topData, index=datasets[feature].index.values).reset_index()).to_csv(file_name)

./data/top_ids_cosine_tfidf.csv
./data/top_ids_cosine_word2vec.csv
./data/top_ids_cosine_bert.csv
./data/top_ids_cosine_mfcc_bow.csv
./data/top_ids_cosine_mfcc_stats.csv
./data/top_ids_cosine_essentia.csv
./data/top_ids_cosine_blf_delta_spectral.csv
./data/top_ids_cosine_blf_correlation.csv
./data/top_ids_cosine_blf_logfluc.csv
./data/top_ids_cosine_blf_spectral.csv
./data/top_ids_cosine_blf_spectral_contrast.csv
./data/top_ids_cosine_blf_vardelta_spectral.csv
./data/top_ids_cosine_incp.csv
./data/top_ids_cosine_vgg19.csv
./data/top_ids_cosine_resnet.csv


## TOP 100 Ids using each song as a query with jaccard similarity

In [20]:
# Print the csv path that holds the jaccard top-100 ids of every feature set.
# The generation itself is disabled: uncomment the block below to recompute
# the ids with compute_top and write them to that path.
features = datasets.keys() 
for feature in features:
    file_name = f'./data/top_ids_jaccard_{feature}.csv'
    print(file_name)
#Uncomment to generate ids
#     topData = compute_top(
#         datasets[feature].to_numpy(dtype=DTYPE), 
#         get_jaccard_similarity,
#         datasets[feature].index.values,
#         batches=100,
#         top=100
#     )
#     dt.Frame(pd.DataFrame(topData, index=datasets[feature].index.values).reset_index()).to_csv(file_name)

./data/top_ids_jaccard_tfidf.csv
./data/top_ids_jaccard_word2vec.csv
./data/top_ids_jaccard_bert.csv
./data/top_ids_jaccard_mfcc_bow.csv
./data/top_ids_jaccard_mfcc_stats.csv
./data/top_ids_jaccard_essentia.csv
./data/top_ids_jaccard_blf_delta_spectral.csv
./data/top_ids_jaccard_blf_correlation.csv
./data/top_ids_jaccard_blf_logfluc.csv
./data/top_ids_jaccard_blf_spectral.csv
./data/top_ids_jaccard_blf_spectral_contrast.csv
./data/top_ids_jaccard_blf_vardelta_spectral.csv
./data/top_ids_jaccard_incp.csv
./data/top_ids_jaccard_vgg19.csv
./data/top_ids_jaccard_resnet.csv


## Generate Baseline

The baseline we consider is a random selection of songs for each query, without repeating an id within the same query.

In [22]:
# Random baseline: for each of the 68641 rows, draw 100 distinct ids from
# 0..68640.  The global NumPy seed is fixed so the draw is repeatable; the
# result depends on the exact order of the np.random calls below.
# NOTE(review): a row's own id is not excluded from its draw, whereas
# compute_top deliberately keeps a query from ranking itself first; confirm
# that this is intended for the baseline.
top_random_ids = np.empty((68641, 100), dtype=np.int32)
print(top_random_ids.shape)
np.random.seed(42)
for i in tqdm(range(68641)):
    top_random_ids[i] = np.random.choice(68641, 100,replace=False)

(68641, 100)


100%|███████████████████████████████████| 68641/68641 [01:02<00:00, 1097.35it/s]


In [23]:
# First five random ids drawn for row 0
top_random_ids[0,:5]

array([51329, 19980, 15110, 63047,  9736], dtype=int32)

In [24]:
# Sanity check: one row per query, 100 ids each
top_random_ids.shape

(68641, 100)

In [28]:
# Persist the baseline: reset_index puts the query row number in the first
# column, followed by that row's 100 random ids.
baseline_frame = pd.DataFrame(top_random_ids, index=list(range(68641))).reset_index()
dt.Frame(baseline_frame).to_csv('./data/top_ids_baseline.csv')

In [24]:
# baseline_topids = np.take(tf_idf.index.values, top_random_ids.astype(int), axis=0)
# baseline_topids[0,:5]

In [25]:
# dt.Frame(pd.DataFrame(baseline_topids, index=tf_idf.index.values ).reset_index()).to_csv('./TopIdsTask2/top_ids_baseline_complete.csv')

## Early fusion datasets combining Lyrics Audio Video

### Computations concatenating each dataset and then compute the metrics

In [17]:
# Feature sets, per modality, that are combined for the jaccard-based fusion.
# Every (lyrics, audio, video) triple of these names is evaluated.
audio_jaccard = ["essentia", "blf_logfluc", "mfcc_stats"]
lyrics_jaccard = ["bert", "tfidf"]
video_jaccard = ["vgg19", "resnet"]

In [22]:
# Early fusion with jaccard similarity: print the csv path of every
# lyrics x audio x video combination.  The generation itself is disabled; the
# commented block joins the three feature frames column-wise and ranks the
# concatenated vectors with compute_top.
# NOTE(review): the join uses on='id', i.e. it relies on the index being named
# 'id'; if two feature frames share column names the join presumably needs
# suffixes - verify before enabling.
for lyrics in lyrics_jaccard:
    for audio in audio_jaccard:
        for video in video_jaccard:
            file_name = f'./data/top_ids_jaccard_earlyfusion_{lyrics}_{audio}_{video}.csv'
            print(file_name)
#             index_values = datasets[lyrics].index.values
#             topData = compute_top(
#                 datasets[lyrics].join(datasets[audio], on='id').join(datasets[video], on="id").to_numpy(dtype=DTYPE),
#                 get_jaccard_similarity,
#                 index_values,
#                 batches=100,
#                 top=100
#             )
#             dt.Frame(pd.DataFrame(topData, index=index_values).reset_index()).to_csv(file_name)

./data/top_ids_jaccard_earlyfusion_bert_essentia_vgg19.csv
./data/top_ids_jaccard_earlyfusion_bert_essentia_resnet.csv
./data/top_ids_jaccard_earlyfusion_bert_blf_logfluc_vgg19.csv
./data/top_ids_jaccard_earlyfusion_bert_blf_logfluc_resnet.csv
./data/top_ids_jaccard_earlyfusion_bert_mfcc_stats_vgg19.csv
./data/top_ids_jaccard_earlyfusion_bert_mfcc_stats_resnet.csv
./data/top_ids_jaccard_earlyfusion_tfidf_essentia_vgg19.csv
./data/top_ids_jaccard_earlyfusion_tfidf_essentia_resnet.csv
./data/top_ids_jaccard_earlyfusion_tfidf_blf_logfluc_vgg19.csv
./data/top_ids_jaccard_earlyfusion_tfidf_blf_logfluc_resnet.csv
./data/top_ids_jaccard_earlyfusion_tfidf_mfcc_stats_vgg19.csv
./data/top_ids_jaccard_earlyfusion_tfidf_mfcc_stats_resnet.csv


In [20]:
# Feature sets, per modality, that are combined for the cosine-based fusion.
# Every (lyrics, audio, video) triple of these names is evaluated.
audio_cosine = ["blf_spectral", "blf_logfluc", "mfcc_bow"]
lyrics_cosine = ["bert", "tfidf"]
video_cosine = ["incp", "resnet"]

In [23]:
# Early fusion with cosine similarity: print the csv path of every
# lyrics x audio x video combination.  The generation itself is disabled; the
# commented block joins the three feature frames column-wise and ranks the
# concatenated vectors with compute_top.
# NOTE(review): the join uses on='id', i.e. it relies on the index being named
# 'id'; if two feature frames share column names the join presumably needs
# suffixes - verify before enabling.
for lyrics in lyrics_cosine:
    for audio in audio_cosine:
        for video in video_cosine:
            file_name = f'./data/top_ids_cosine_earlyfusion_{lyrics}_{audio}_{video}.csv'
            print(file_name)
#             index_values = datasets[lyrics].index.values
#             topData = compute_top(
#                 datasets[lyrics].join(datasets[audio], on='id').join(datasets[video], on="id").to_numpy(dtype=DTYPE),
#                 get_cosine_similarity,
#                 index_values,
#                 batches=100,
#                 top=100
#             )
#             dt.Frame(pd.DataFrame(topData, index=index_values).reset_index()).to_csv(file_name)

./data/top_ids_cosine_earlyfusion_bert_blf_spectral_incp.csv
./data/top_ids_cosine_earlyfusion_bert_blf_spectral_resnet.csv
./data/top_ids_cosine_earlyfusion_bert_blf_logfluc_incp.csv
./data/top_ids_cosine_earlyfusion_bert_blf_logfluc_resnet.csv
./data/top_ids_cosine_earlyfusion_bert_mfcc_bow_incp.csv
./data/top_ids_cosine_earlyfusion_bert_mfcc_bow_resnet.csv
./data/top_ids_cosine_earlyfusion_tfidf_blf_spectral_incp.csv
./data/top_ids_cosine_earlyfusion_tfidf_blf_spectral_resnet.csv
./data/top_ids_cosine_earlyfusion_tfidf_blf_logfluc_incp.csv
./data/top_ids_cosine_earlyfusion_tfidf_blf_logfluc_resnet.csv
./data/top_ids_cosine_earlyfusion_tfidf_mfcc_bow_incp.csv
./data/top_ids_cosine_earlyfusion_tfidf_mfcc_bow_resnet.csv


### Late fusion combining only 3 features (still needs to be checked)

In [14]:
def _normalize_similarity_rows(similarities):
    """Scale every row of a similarity matrix to unit L2 norm (all-zero rows stay zero)."""
    # A single NaN/inf would turn the row norm, and with it the whole row,
    # into NaN, which then poisons the fused score of that query.  Non-finite
    # entries are therefore treated as "no evidence" (0).
    # NOTE(review): assumes a non-finite similarity means "undefined" (e.g. a
    # 0/0 for an all-zero feature vector) - confirm against the simfunction.
    cleaned = np.nan_to_num(similarities, nan=0.0, posinf=0.0, neginf=0.0)
    norms = np.linalg.norm(cleaned, axis=1, keepdims=True)
    return np.divide(cleaned, norms, out=np.zeros_like(cleaned), where=norms != 0)


def compute_top_late(d1: np.ndarray, d2: np.ndarray, d3: np.ndarray, simfunction, idx_values, batches: int = 1, top: int = 100):
    """Late fusion: rank neighbours by the sum of three normalised similarities.

    For every batch of rows the similarity to all rows is computed separately
    on ``d1``, ``d2`` and ``d3``; each similarity row is scaled to unit L2
    norm, the three are added, and the ids of the ``top`` best-scoring rows
    are kept.

    Parameters
    ----------
    d1, d2, d3 : np.ndarray
        2-D feature matrices.  Row ``r`` of each is treated as the same
        document, so they are expected to have the same number of rows in
        the same order.
    simfunction : callable
        Called as ``simfunction(d, batch)``.  Its transposed result is used
        as a ``(len(batch), len(d))`` array in which a larger value means
        "more similar".
    idx_values : array-like
        Id of each row, in row order.
    batches : int
        Number of chunks the rows are split into.
    top : int
        How many neighbours to keep per row.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_rows, min(top, n_rows))`` holding, per row, the
        ids of its best-scoring neighbours, best first.
    """
    n_rows = d1.shape[0]
    idx_values = np.asarray(idx_values)

    batches_1 = np.array_split(d1, batches, axis=0)
    batches_2 = np.array_split(d2, batches, axis=0)
    batches_3 = np.array_split(d3, batches, axis=0)
    batch_rows = np.array_split(np.arange(n_rows), batches, axis=0)

    top_values = []
    for b1, b2, b3, rows in tqdm(list(zip(batches_1, batches_2, batches_3, batch_rows))):
        # Accumulate one modality at a time so that three full
        # (batch, n_rows) matrices never have to be stacked together.
        combined = _normalize_similarity_rows(simfunction(d1, b1).T)
        combined += _normalize_similarity_rows(simfunction(d2, b2).T)
        combined += _normalize_similarity_rows(simfunction(d3, b3).T)

        # A document must not be returned as its own neighbour.
        combined[(np.arange(b1.shape[0]), rows)] = -1

        # Slice to `top` before mapping positions to ids; indexing first
        # would build a full (batch, n_rows) id matrix.
        ranking = np.argsort(-combined, axis=1)[:, :top]
        top_values.append(idx_values[ranking])

    return np.concatenate(top_values, axis=0)

In [15]:
def generate_ids_late_fusion(df1, df2, df3, simFunction, file_name, b, top=100):
    """Compute late-fusion neighbour ids for three feature frames and write them to csv.

    Parameters
    ----------
    df1, df2, df3 : pandas.DataFrame
        Feature frames.  ``df2`` and ``df3`` are re-ordered to the index of
        ``df1`` so that row ``r`` refers to the same id in all three.
    simFunction : callable
        Similarity function handed to ``compute_top_late``.
    file_name : str
        Path of the csv to write.  Its parent directory is created if it
        does not exist yet.
    b : int
        Number of batches handed to ``compute_top_late``.
    top : int, default 100
        Number of neighbour ids kept per row.

    Returns
    -------
    None
        The result is only written to ``file_name``: the first column is the
        id of the query row, the remaining columns are its neighbour ids.
    """
    import os

    indexes = df1.index.values
    top_ids = compute_top_late(
        df1.loc[indexes].to_numpy(dtype=np.float32),
        df2.loc[indexes].to_numpy(dtype=np.float32),
        df3.loc[indexes].to_numpy(dtype=np.float32),
        simFunction,
        indexes,
        batches=b,
        top=top,
    )

    # Make sure the target folder exists before writing into it.
    out_dir = os.path.dirname(file_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    dt.Frame(pd.DataFrame(top_ids, index=indexes).reset_index()).to_csv(file_name)

In [16]:
# Feature sets, per modality, combined for the jaccard late fusion.  These
# repeat the lists already defined for the jaccard early fusion.
audio_jaccard = ["essentia", "blf_logfluc", "mfcc_stats"]
lyrics_jaccard = ["bert", "tfidf"]
video_jaccard = ["vgg19", "resnet"]

In [17]:
# Late fusion with jaccard similarity: for every lyrics x audio x video
# combination compute the fused neighbour ids and write one csv per
# combination.  The last argument (200) is the number of batches.
# NOTE(review): results go to ./TopIdsFusion/ while the other cells of this
# notebook use ./data/ - confirm which location downstream code expects.
for lyrics in lyrics_jaccard:
    for audio in audio_jaccard:
        for video in video_jaccard:
            file_name = f'./TopIdsFusion/top_ids_jaccard_latefusion_{lyrics}_{audio}_{video}_complete.csv'
            print(file_name)
            generate_ids_late_fusion(
                datasets[lyrics],
                datasets[audio],
                datasets[video],
                get_jaccard_similarity,
                file_name,
                200
            )

./TopIdsFusion/top_ids_jaccard_latefusion_bert_essentia_vgg19_complete.csv


100%|█████████████████████████████████████████| 200/200 [18:33<00:00,  5.57s/it]


./TopIdsFusion/top_ids_jaccard_latefusion_bert_essentia_resnet_complete.csv


100%|█████████████████████████████████████████| 200/200 [13:52<00:00,  4.16s/it]


./TopIdsFusion/top_ids_jaccard_latefusion_bert_blf_logfluc_vgg19_complete.csv


  r = dotp / (divisor-dotp)
100%|█████████████████████████████████████████| 200/200 [20:53<00:00,  6.27s/it]


./TopIdsFusion/top_ids_jaccard_latefusion_bert_blf_logfluc_resnet_complete.csv


  r = dotp / (divisor-dotp)
100%|█████████████████████████████████████████| 200/200 [16:49<00:00,  5.05s/it]


./TopIdsFusion/top_ids_jaccard_latefusion_bert_mfcc_stats_vgg19_complete.csv


100%|█████████████████████████████████████████| 200/200 [17:30<00:00,  5.25s/it]


./TopIdsFusion/top_ids_jaccard_latefusion_bert_mfcc_stats_resnet_complete.csv


100%|█████████████████████████████████████████| 200/200 [13:20<00:00,  4.00s/it]


./TopIdsFusion/top_ids_jaccard_latefusion_tfidf_essentia_vgg19_complete.csv


100%|█████████████████████████████████████████| 200/200 [18:28<00:00,  5.54s/it]


./TopIdsFusion/top_ids_jaccard_latefusion_tfidf_essentia_resnet_complete.csv


100%|█████████████████████████████████████████| 200/200 [14:12<00:00,  4.26s/it]


./TopIdsFusion/top_ids_jaccard_latefusion_tfidf_blf_logfluc_vgg19_complete.csv


  r = dotp / (divisor-dotp)
100%|█████████████████████████████████████████| 200/200 [21:09<00:00,  6.35s/it]


./TopIdsFusion/top_ids_jaccard_latefusion_tfidf_blf_logfluc_resnet_complete.csv


  r = dotp / (divisor-dotp)
100%|█████████████████████████████████████████| 200/200 [18:04<00:00,  5.42s/it]


./TopIdsFusion/top_ids_jaccard_latefusion_tfidf_mfcc_stats_vgg19_complete.csv


100%|█████████████████████████████████████████| 200/200 [17:56<00:00,  5.38s/it]


./TopIdsFusion/top_ids_jaccard_latefusion_tfidf_mfcc_stats_resnet_complete.csv


100%|█████████████████████████████████████████| 200/200 [13:24<00:00,  4.02s/it]


In [18]:
# Feature sets, per modality, combined for the cosine late fusion.  Same
# names as for the cosine early fusion; only the order of the audio list
# differs, which changes the order in which combinations are processed.
audio_cosine = [ "blf_logfluc", "blf_spectral","mfcc_bow"]
lyrics_cosine = ["bert", "tfidf"]
video_cosine = ["incp", "resnet"]

In [19]:
# Late fusion with cosine similarity: for every lyrics x audio x video
# combination compute the fused neighbour ids and write one csv per
# combination.  The last argument (130) is the number of batches.
# NOTE(review): results go to ./TopIdsFusion/ while the other cells of this
# notebook use ./data/ - confirm which location downstream code expects.
for lyrics in lyrics_cosine:
    for audio in audio_cosine:
        for video in video_cosine:
            file_name = f'./TopIdsFusion/top_ids_cosine_latefusion_{lyrics}_{audio}_{video}_complete.csv'
            print(file_name)
            generate_ids_late_fusion(
                datasets[lyrics],
                datasets[audio],
                datasets[video],
                get_cosine_similarity,
                file_name,
                130
            )

./TopIdsFusion/top_ids_cosine_latefusion_bert_blf_logfluc_incp_complete.csv


100%|█████████████████████████████████████████| 130/130 [16:55<00:00,  7.81s/it]


./TopIdsFusion/top_ids_cosine_latefusion_bert_blf_logfluc_resnet_complete.csv


100%|█████████████████████████████████████████| 130/130 [16:27<00:00,  7.59s/it]


./TopIdsFusion/top_ids_cosine_latefusion_bert_blf_spectral_incp_complete.csv


100%|█████████████████████████████████████████| 130/130 [14:29<00:00,  6.69s/it]


./TopIdsFusion/top_ids_cosine_latefusion_bert_blf_spectral_resnet_complete.csv


100%|█████████████████████████████████████████| 130/130 [13:58<00:00,  6.45s/it]


./TopIdsFusion/top_ids_cosine_latefusion_bert_mfcc_bow_incp_complete.csv


100%|█████████████████████████████████████████| 130/130 [14:10<00:00,  6.54s/it]


./TopIdsFusion/top_ids_cosine_latefusion_bert_mfcc_bow_resnet_complete.csv


100%|█████████████████████████████████████████| 130/130 [13:44<00:00,  6.35s/it]


./TopIdsFusion/top_ids_cosine_latefusion_tfidf_blf_logfluc_incp_complete.csv


100%|█████████████████████████████████████████| 130/130 [16:41<00:00,  7.71s/it]


./TopIdsFusion/top_ids_cosine_latefusion_tfidf_blf_logfluc_resnet_complete.csv


100%|█████████████████████████████████████████| 130/130 [16:00<00:00,  7.39s/it]


./TopIdsFusion/top_ids_cosine_latefusion_tfidf_blf_spectral_incp_complete.csv


100%|█████████████████████████████████████████| 130/130 [13:23<00:00,  6.18s/it]


./TopIdsFusion/top_ids_cosine_latefusion_tfidf_blf_spectral_resnet_complete.csv


100%|█████████████████████████████████████████| 130/130 [12:39<00:00,  5.84s/it]


./TopIdsFusion/top_ids_cosine_latefusion_tfidf_mfcc_bow_incp_complete.csv


100%|█████████████████████████████████████████| 130/130 [13:20<00:00,  6.16s/it]


./TopIdsFusion/top_ids_cosine_latefusion_tfidf_mfcc_bow_resnet_complete.csv


100%|█████████████████████████████████████████| 130/130 [12:45<00:00,  5.89s/it]
