# Duplicate Detection in Million Song Dataset

In [1]:
import numpy as np
import pandas as pd
import tarfile
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from pathlib import Path
from sklearn import preprocessing

In [2]:
# HELPER FUNCTIONS
def findangle(v1, v2):
    """Return the angle between two vectors, in degrees.

    Parameters
    ----------
    v1, v2 : array_like
        1-D vectors of equal length.

    Returns
    -------
    float
        Angle between `v1` and `v2` in the range [0, 180]. The result is NaN
        when either vector has zero magnitude, since the cosine is then 0/0.
    """
    dot = np.dot(v1, v2)
    magnitude_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    cos_angle = dot / magnitude_product
    # Rounding can push the cosine of (anti-)parallel vectors marginally
    # outside [-1, 1], where arccos yields NaN; clamp it back into the domain
    # so exact duplicates report an angle of ~0 instead of NaN.
    cos_angle = np.clip(cos_angle, -1.0, 1.0)
    return np.degrees(np.arccos(cos_angle))

def standardize(data):
    """Return `data` after passing it through `preprocessing.scale` with default arguments."""
    return preprocessing.scale(data)
    
def listify_tuples(tuple_list):
    """Convert every element of `tuple_list` to a list, in place.

    The container itself is modified; nothing is returned.
    """
    for position, item in enumerate(tuple_list):
        tuple_list[position] = list(item)


In [1]:
# STEP 1: Extract data, select features and songs
def _extract_member_if_missing(archive_path, target_path, member_index):
    """Extract the archive member at `member_index` unless `target_path` already exists."""
    if Path(target_path).is_file():
        return
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(archive_path, 'r') as tar:
        tar.extract(member=tar.getmembers()[member_index])


def extract_and_read(path, songs, fields):
    """Load the selected songs and features from the Million Song Subset.

    The summary HDF5 file and the unique-tracks text file are extracted from
    the tar archive at `path` into the current working directory the first
    time they are needed; later calls reuse the extracted files.

    Parameters
    ----------
    path : str
        Location of the dataset tar archive.
    songs : array_like
        Row labels of the songs to select from the '/analysis/songs' table.
    fields : list of str
        Column names of the features to select.

    Returns
    -------
    tuple
        `(data, ids, names)` where `data` is the selected features transposed
        to one row per field and one column per song, `ids` is the matching
        'track_id' column transposed to a single row, and `names` holds the
        rows of the unique-tracks file split on '<SEP>'.
    """
    file_h5_path = 'MillionSongSubset/AdditionalFiles/subset_msd_summary_file.h5'
    file_txt_path = 'MillionSongSubset/AdditionalFiles/subset_unique_tracks.txt'

    # The two files are addressed by their position in the archive's member
    # list (5 and 2), so this relies on the archive's member order.
    _extract_member_if_missing(path, file_h5_path, 5)
    _extract_member_if_missing(path, file_txt_path, 2)

    # Close the store even if the table lookup fails.
    with pd.HDFStore(file_h5_path) as summary:
        data = summary['/analysis/songs']

    # A multi-character separator requires the python parsing engine.
    names = pd.read_csv(file_txt_path, sep='<SEP>', engine='python', header=None).values

    ids = data.loc[songs, ['track_id']].T.values
    data = data.loc[songs, fields].T.values
    return (data, ids, names)



In [4]:
# STEP 2: Standardize M_raw and convert it to signature matrix (sketching)
def standardize_data(data):
    """Standardize every row of `data` and return it.

    Each row is passed through `standardize` independently and the result is
    written back into `data`, so the caller's array is modified in place and
    the returned object is that same array.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one row per feature.

    Returns
    -------
    numpy.ndarray
        The same array object, with each row standardized.
    """
    for row_index in range(data.shape[0]):
        data[row_index, :] = standardize(data[row_index, :])
    return data

def signatureMatrix(M_raw, numberOfPlanes):
    """Sketch each column of `M_raw` into a 0/1 signature using random hyperplanes.

    Parameters
    ----------
    M_raw : numpy.ndarray
        2-D array with one row per feature and one column per item.
    numberOfPlanes : int
        Number of random hyperplanes, i.e. number of signature rows.

    Returns
    -------
    numpy.ndarray
        Float array of shape `(numberOfPlanes, M_raw.shape[1])`. Entry
        `[i, e]` is 0.0 when the dot product of column `e` with random vector
        `i` is strictly negative, and 1.0 otherwise.
    """
    n_features = M_raw.shape[0]
    # One random vector per row, components uniform in [-1, 1). Drawing the
    # whole block at once consumes the global NumPy random stream in the same
    # order as drawing one vector per plane, so a fixed seed gives the same
    # planes as a per-plane loop.
    # NOTE(review): uniform components do not give uniformly distributed
    # directions; Gaussian components would - confirm whether that matters here.
    randomVectors = np.random.rand(numberOfPlanes, n_features) * 2 - 1
    # All plane/column dot products in a single matrix product.
    products = randomVectors @ M_raw
    return np.where(products < 0, 0.0, 1.0)


In [5]:
# STEP 3: Generate candidate pairs with LSH, banding the approximate-cosine-similarity signatures
def LSH(M, b, r):
    """Group songs whose signatures agree on at least one whole band.

    The first `b * r` rows of the signature matrix are split into `b` bands
    of `r` consecutive rows. Two songs land in the same bucket when their
    values are identical across all rows of the same band.

    Parameters
    ----------
    M : numpy.ndarray
        Signature matrix with one column per song.
    b : int
        Number of bands.
    r : int
        Number of rows per band.

    Returns
    -------
    dict
        Maps a bucket key (band index plus the band's values as text) to the
        list of column indices that share it. Buckets holding a single song
        are dropped.

    Raises
    ------
    ValueError
        If `M` has fewer than `b * r` rows, which would leave some bands
        truncated or empty.
    """
    if b * r > M.shape[0]:
        raise ValueError(
            'signature matrix has ' + str(M.shape[0]) + ' rows, but b*r = '
            + str(b * r) + ' rows are required'
        )

    candidates = dict()
    N_song = M.shape[1]
    for current_column in range(N_song):  # for each song
        for current_band in range(b):  # for each band of that song
            start = current_band * r
            end = start + r
            band_values = M[start:end, current_column].tolist()
            # The band index is part of the key so that equal bit patterns in
            # different bands do not share a bucket; only agreement within
            # the same band makes two songs a candidate pair.
            bucket_key = str(current_band) + ':' + ''.join(map(str, band_values))
            candidates.setdefault(bucket_key, set()).add(current_column)

    # Keep only buckets that actually pair up two or more songs.
    distilled_candidates = dict()
    for bucket_key, members in candidates.items():
        if len(members) > 1:
            distilled_candidates[bucket_key] = list(members)

    return distilled_candidates


In [6]:
# STEP 4: Refine candidate pairs according to cosine similarity
def refine(M_raw_std, candidates, epsilon):
    """Keep the candidate pairs whose feature vectors are within `epsilon` degrees.

    Parameters
    ----------
    M_raw_std : numpy.ndarray
        Feature matrix with one column per song.
    candidates : dict
        Maps a bucket key to a list of column indices, as returned by `LSH`.
    epsilon : float
        Angle threshold in degrees; a pair is kept when its angle is strictly
        smaller.

    Returns
    -------
    tuple of tuple
        `(first, second)` column-index pairs. A pair that shares several
        buckets is reported once per bucket, so callers that need unique
        pairs must de-duplicate the result.
    """
    # Accumulate in a list; growing a tuple with += copies it on every hit.
    confirmed = []
    for candidate in candidates.values():
        for first, second in itertools.combinations(candidate, 2):
            angle = findangle(M_raw_std[:, first], M_raw_std[:, second])
            if angle < epsilon:
                confirmed.append((first, second))
    return tuple(confirmed)
            

In [7]:
# TEST (step 1)
# Load the raw feature matrix, the track ids and the track listing.
songs = np.arange(0,10000) # all songs
# Columns of the '/analysis/songs' table used as the feature vector of a song.
fields = ['duration', 'end_of_fade_in', 'key', 'loudness', 'mode', 'start_of_fade_out', 'tempo', 'time_signature']
# Relative location of the dataset archive: five levels up, then Downloads/.
path_list = ['../', '../','../','../','../','Downloads/','millionsongsubset_full.tar.gz']
path = ''.join(path_list)

M_raw, ids, names = extract_and_read(path, songs, fields)
# Display the raw matrix: one row per field, one column per song.
pd.DataFrame(data=M_raw,index=fields)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
duration,280.21506,191.68608,222.92853,278.38649,89.15546,255.73832,239.59465,156.96934,197.19791,262.5824,...,190.74567,430.94159,208.16934,271.64689,247.30077,288.57424,177.50159,340.79302,61.83138,256.73098
end_of_fade_in,0.238,0.0,0.0,0.496,4.847,0.156,0.403,0.322,0.276,2.328,...,0.276,0.0,0.0,0.0,0.135,0.346,0.0,0.287,0.191,0.0
key,5.0,0.0,1.0,7.0,2.0,9.0,2.0,4.0,4.0,9.0,...,4.0,5.0,4.0,7.0,1.0,5.0,0.0,6.0,5.0,11.0
loudness,-3.306,-10.764,-9.035,-23.095,-20.359,-5.724,-10.653,-20.816,-29.75,-5.644,...,-6.617,-10.743,-2.14,-7.537,-4.931,-10.656,-9.054,-2.871,-21.439,-7.479
mode,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
start_of_fade_out,275.528,184.128,216.3,278.386,79.203,252.012,231.805,142.286,187.582,257.155,...,179.74,419.161,205.816,267.865,231.091,279.65,174.237,324.795,56.697,245.412
tempo,173.205,150.955,93.056,127.113,90.66,101.167,173.841,127.547,127.782,90.013,...,133.6,56.886,197.722,192.417,88.264,87.784,126.202,110.058,76.354,120.103
time_signature,5.0,4.0,4.0,1.0,3.0,1.0,3.0,1.0,4.0,5.0,...,1.0,1.0,4.0,4.0,4.0,4.0,4.0,1.0,4.0,4.0


In [8]:
# Preview the first five rows of the track listing read from the unique-tracks file.
pd.DataFrame(names).head(5)

Unnamed: 0,0,1,2,3
0,TRAAAAW128F429D538,SOMZWCG12A8C13C480,Casual,I Didn't Mean To
1,TRAAABD128F429CF47,SOCIWDW12A8C13D406,The Box Tops,Soul Deep
2,TRAAADZ128F9348C2E,SOXVLOJ12AB0189215,Sonora Santanera,Amor De Cabaret
3,TRAAAEF128F4273421,SONHOTT12A8C13493C,Adam Ant,Something Girls
4,TRAAAFD128F92F423A,SOFSOCN12A8C143F5D,Gob,Face the Ashes


In [9]:
# TEST (step 2)
b = 3   # number of bands passed to LSH
r = 64  # number of signature rows per band
N_sample = b * r # number of random planes
# standardize_data writes into and returns its argument, so M_raw_std and
# M_raw refer to the same, now standardized, array.
M_raw_std = standardize_data(M_raw)
df = pd.DataFrame(M_raw_std, index=fields)
df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
duration,0.365433,-0.41024,-0.1365,0.349412,-1.308593,0.150973,0.009525,-0.714421,-0.361947,0.210939,...,-0.41848,1.686069,-0.265817,0.290361,0.077045,0.438675,-0.534522,0.896205,-1.548001,0.15967
end_of_fade_in,-0.278724,-0.406142,-0.406142,-0.140597,2.188808,-0.322624,-0.190387,-0.233752,-0.258379,0.840205,...,-0.258379,-0.406142,-0.406142,-0.406142,-0.333867,-0.220903,-0.406142,-0.25249,-0.303886,-0.406142
key,-0.077689,-1.484591,-1.20321,0.485072,-0.92183,1.047832,-0.92183,-0.359069,-0.359069,1.047832,...,-0.359069,-0.077689,-0.359069,0.485072,-1.20321,-0.077689,-1.484591,0.203691,-0.077689,1.610593
loudness,1.329687,-0.051547,0.268666,-2.33527,-1.828558,0.881869,-0.03099,-1.913195,-3.567787,0.896685,...,0.716484,-0.047658,1.545632,0.546098,1.028734,-0.031546,0.265147,1.41025,-2.028576,0.55684
mode,0.668557,0.668557,0.668557,0.668557,0.668557,0.668557,0.668557,-1.495759,0.668557,0.668557,...,0.668557,0.668557,-1.495759,-1.495759,0.668557,0.668557,0.668557,0.668557,0.668557,-1.495759
start_of_fade_out,0.40603,-0.408659,-0.121895,0.431504,-1.343901,0.196421,0.016307,-0.781615,-0.377872,0.242263,...,-0.447771,1.686294,-0.215344,0.337726,0.009943,0.442771,-0.496821,0.845168,-1.544507,0.137593
tempo,1.429385,0.796971,-0.848698,0.119307,-0.9168,-0.618158,1.447462,0.131643,0.138322,-0.93519,...,0.303688,-1.876762,2.126234,1.975449,-0.984902,-0.998545,0.093414,-0.365449,-1.323421,-0.079939
time_signature,1.133492,0.343712,0.343712,-2.025627,-0.446068,-2.025627,-0.446068,-2.025627,0.343712,1.133492,...,-2.025627,-2.025627,0.343712,0.343712,0.343712,0.343712,0.343712,-2.025627,0.343712,0.343712


In [10]:
# TEST (step 2 contd.)
# Sketch the standardized features into an N_sample-row 0/1 signature matrix.
M  = signatureMatrix(M_raw_std , N_sample)
pd.DataFrame(M).head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0


In [11]:
# TEST (step 3)
# Bucket songs by band; each value is a list of song columns sharing a bucket.
candidates = LSH(M, b, r)
candidates.values()

dict_values([[4890, 5234], [385, 213], [3041, 8170], [5206, 3239], [9696, 4924, 598], [7296, 6588], [9579, 4989], [6664, 8269], [5612, 7518], [42, 8547], [6048, 5511], [6303, 3479], [3372, 7614, 7926], [6657, 3470, 8470], [6249, 3853], [4264, 8326], [1440, 4232, 3562, 6096, 1652, 9749, 2008, 9565, 4093], [3284, 8078], [8432, 7513], [8107, 7469], [3664, 7630], [2986, 7382, 4639], [9379, 1420], [8267, 2875, 4476, 3471], [3386, 4748, 5269, 9516], [8867, 5701], [2976, 9678], [4002, 1111], [1451, 5876], [4883, 5700, 6619, 2631], [9689, 8874, 899, 7622], [5673, 5654], [488, 7336], [3312, 3563], [89, 6325], [2568, 9788], [3322, 1439], [6402, 3211, 6031], [7930, 9654], [8232, 6860], [7136, 9505, 9543, 4271, 1722, 2493], [7497, 6263], [6869, 6454], [572, 4652], [5473, 7140, 108], [2930, 659, 9037], [3698, 8559], [5332, 9228, 5031], [6064, 4626], [4428, 9647], [355, 5694], [5748, 7429], [5633, 1565], [6728, 6878], [8664, 1149, 4965, 9279], [8576, 4329], [5221, 9255], [2432, 2411, 5646], [96, 733

In [41]:
# TEST (step 4)
epsilon = 1.5  # angle threshold passed to refine; findangle reports degrees
buckets = refine(M_raw_std, candidates, epsilon)
# Drop pairs that were reported more than once.
duplicates = list(set(buckets))
duplicates

[(6427, 70), (9651, 99), (2569, 612), (908, 3742), (3088, 5529), (4961, 8609)]

In [42]:
# Resolve every duplicate pair (column indices into `ids`) to the track id,
# artist and title of both songs, and show everything side by side.
listify_tuples(duplicates)
duplicates = np.asarray(duplicates)

# Index the track listing by track id once instead of scanning all of `names`
# for every looked-up song. setdefault keeps the first row seen for an id,
# which is the row a top-to-bottom scan for the first match would pick.
track_rows = {}
for track_row in names:
    track_rows.setdefault(track_row[0], track_row)

duplicates_ids = []
duplicates_artists = []
duplicates_names = []
for pair in duplicates:
    pair_rows = [track_rows[ids[0, song_index]] for song_index in pair]
    duplicates_ids.append([row[0] for row in pair_rows])      # column 0: track id
    duplicates_artists.append([row[2] for row in pair_rows])  # column 2: artist
    duplicates_names.append([row[3] for row in pair_rows])    # column 3: title


df1 = pd.DataFrame(duplicates)
df2 = pd.DataFrame(duplicates_ids)
df3 = pd.DataFrame(duplicates_artists)
df4 = pd.DataFrame(duplicates_names)

# Column-wise concatenation: indices, track ids, artists, titles.
frames = [df1, df2, df3, df4]
result = pd.concat(frames, axis=1)
result

Unnamed: 0,0,1,0.1,1.1,0.2,1.2,0.3,1.3
0,6427,70,TRATGQO128F421B850,TRACOHQ128F424C5EF,Elvenking,Chimaira,Pathfinders,The Flame
1,9651,99,TRBEGHL128F92F76E8,TRACRJZ128F4244626,Alice In Chains,Theodis Ealey,Would?,This Time I Know
2,2569,612,TRAYTDZ128F93146E3,TRAIJLI128F92FC94A,Stevie Ray Vaughan And Double Trouble,Stevie Ray Vaughan And Double Trouble,Mary Had A Little Lamb,Mary Had A Little Lamb
3,908,3742,TRAHCVT128F4282806,TRASWSO12903CBE857,L.A. Guns,Head Horny's,Magdalaine,Pray (Instrumental Version)
4,3088,5529,TRABFQC128F4265349,TRAWGDB128F42AD39D,Regina Belle,Regina Belle,Quiet Time,Quiet Time
5,4961,8609,TRALVCW128F4288926,TRBBCCM128F932D71A,Snowgoons,Leon Lai,Real World,Yi Ba Xiang Shui Yin
