In [1]:
import os
import shutil
import csv
import random
import deepdish as dd
import fnmatch, numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.neighbors import DistanceMetric
import tqdm
import string
import glob
import pandas as pd
import sqlite3
%matplotlib inline

In [2]:
# Base folder under which the functions in this notebook save and load
# their derived .h5 artefacts.
HOME_DIR = '/home/jvidyala/final_data/'
# Root of the per-track .h5 files; a track is looked up as
# DATA_DIR/<id[2]>/<id[3]>/<id[4]>/<id>.h5.
DATA_DIR = '/home/jvidyala/data/original_data/'

# Sub-folders of HOME_DIR (trailing slash kept so names can be appended directly).
GENRE_DIR = os.path.join(HOME_DIR, 'file_genres/')
AFV_DIR = os.path.join(HOME_DIR, 'avg_feat_vectors/')
TRAIN_DATA = os.path.join(HOME_DIR, 'mf_train_data/')
TEST_DATA = os.path.join(HOME_DIR, 'mf_test_data/')
WEIGHTS_DIR = os.path.join(HOME_DIR, 'model_weights/')

Data extraction using the MSD Genre Dataset (MGD)

In [3]:
def data_extraction():
    """Compute and cache the average timbre vector of every track in the genre dataset.

    Track ids are read from the ``track_id`` column of the genre dataset file
    (its first 9 lines are skipped).  For each id the track's .h5 file is
    loaded from ``DATA_DIR`` and ``analysis/segments_timbre`` is averaged over
    all segments, giving one value per timbre coefficient (12 values).

    The resulting ``{track_id: [12 floats]}`` mapping is written to
    ``HOME_DIR + 'afv_mgd.h5'``.  Tracks whose timbre matrix is empty or has
    fewer than 12 coefficients are printed and left out of the mapping.

    Returns
    -------
    None
    """
    dataset = pd.read_csv("/home/jvidyala/msd-genre-dataset.txt", sep=",", skiprows=9)
    file_choices = dataset['track_id'].values

    avg_feature_vector = {}
    for file in tqdm.tqdm(file_choices):
        file_load = dd.io.load(DATA_DIR + file[2] + '/' + file[3] + '/' + file[4] + '/' + file + '.h5')
        # assumes segments_timbre is (n_segments, 12), one row per segment — TODO confirm
        timbre = np.asarray(file_load['analysis']['segments_timbre'])

        # Nothing to average: report the track id and skip it.
        if timbre.ndim != 2 or timbre.shape[0] == 0 or timbre.shape[1] < 12:
            print(file)
            continue

        # Average along the segment axis.  Indexing the matrix with the
        # coefficient number (timbre[j]) would select segment j instead of
        # coefficient j, so the mean is taken column-wise here.  Only the
        # per-track result is kept; the full matrix is not retained.
        avg_feature_vector[file] = timbre[:, :12].mean(axis=0).tolist()

    dd.io.save(HOME_DIR + 'afv_mgd.h5', avg_feature_vector)

In [83]:
#Train GMM using defined balanced train set
def genre_GMM(train_set, n_components=10, covariance_type='full', random_state=None):
    """Fit a Gaussian mixture model on ``train_set`` and save it.

    Parameters
    ----------
    train_set : array-like
        Feature vectors, one row per track, passed straight to
        ``GaussianMixture.fit``.
    n_components : int, optional
        Number of mixture components.  Defaults to 10, the value that was
        previously hard-coded.
    covariance_type : str, optional
        Forwarded to ``GaussianMixture``; defaults to ``'full'``.
    random_state : int or None, optional
        Seed forwarded to ``GaussianMixture``.  ``None`` (the default) keeps
        the previous unseeded behaviour; pass an int to make the fit, and
        therefore the component numbering, repeatable between runs.

    Returns
    -------
    The fitted model, which is also written to ``HOME_DIR + 'gmm.h5'``.
    """
    gmm = GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=random_state,
    ).fit(train_set)
    dd.io.save(HOME_DIR + 'gmm.h5', gmm)
    return gmm

#Song to song mhl distance
def mahalanobis_point(x, y, cov):
    """Mahalanobis distance between two feature vectors.

    Parameters
    ----------
    x, y : array-like
        The two vectors, either 1-D of length ``d`` or ``(1, d)`` row vectors.
        Both are flattened before use, so the two layouts give the same result.
    cov : array-like of shape (d, d)
        Covariance matrix; it is inverted with ``np.linalg.inv``.

    Returns
    -------
    scalar
        ``sqrt((x - y) . cov^-1 . (x - y))``.
    """
    # Flatten so the quadratic form below is a scalar for 1-D and (1, d)
    # inputs alike; indexing the result with [0][0] only works for the latter.
    diff = np.ravel(np.asarray(x) - np.asarray(y))
    cov_inverse = np.linalg.inv(cov)
    return np.matmul(np.matmul(diff, cov_inverse), diff) ** 0.5

In [5]:
def calc_track_avg(file):
    """Return the average timbre vector of a single track.

    Parameters
    ----------
    file : str
        Track id.  Characters 2, 3 and 4 of the id select the sub-folders of
        ``DATA_DIR`` that hold ``<file>.h5``.

    Returns
    -------
    list of float
        12 values: the mean of each timbre coefficient over all segments of
        ``analysis/segments_timbre``.

    Raises
    ------
    IndexError
        If the timbre matrix has no segments or fewer than 12 coefficients.
        (IndexError is kept because that is what the previous implementation
        raised for tracks it could not average.)
    """
    path = DATA_DIR + file[2] + '/' + file[3] + '/' + file[4] + '/' + file + '.h5'
    # assumes segments_timbre is (n_segments, 12), one row per segment — TODO confirm
    timbre = np.asarray(dd.io.load(path)['analysis']['segments_timbre'])

    if timbre.ndim != 2 or timbre.shape[0] == 0 or timbre.shape[1] < 12:
        raise IndexError('track %s has no usable timbre segments' % file)

    # Mean along the segment axis.  Indexing the matrix with the coefficient
    # number (timbre[j]) would pick segment j rather than coefficient j.
    return timbre[:, :12].mean(axis=0).tolist()

In [31]:
def closest_clusters(gmm, track_id, k=3):
    """Return the indices of the ``k`` most probable GMM components for a track.

    Parameters
    ----------
    gmm : fitted mixture model
        Must provide ``predict_proba``.
    track_id : str
        Track whose average timbre vector (from ``calc_track_avg``) is scored.
    k : int, optional
        Number of component indices to return; defaults to 3.

    Returns
    -------
    list of int
        Distinct component indices ordered by decreasing posterior
        probability.  Components with equal probability are ordered by
        ascending index.
    """
    x = calc_track_avg(track_id)
    probs = gmm.predict_proba(np.array(x).reshape(1, -1)).tolist()[0]

    # Rank the indices themselves rather than looking each top value up with
    # list.index(): index() always returns the first match, so tied
    # probabilities would yield the same component more than once.
    ranked = sorted(range(len(probs)), key=probs.__getitem__, reverse=True)
    return ranked[:k]

In [7]:
def track_cluster_assignment(gmm, k=3):
    """Assign every cached track to its ``k`` most probable GMM components.

    Loads the ``{track_id: feature_vector}`` mapping from
    ``HOME_DIR + 'afv_mgd.h5'``, scores each vector with
    ``gmm.predict_proba`` and lists the track under each of its ``k`` most
    probable components.  The ``{component_index: [track_id, ...]}`` mapping
    is saved to ``HOME_DIR + 'track_clusters.h5'``.

    Parameters
    ----------
    gmm : fitted mixture model
        Must provide ``predict_proba``.  Its ``n_components`` attribute, when
        present, fixes the number of cluster lists; otherwise 10 is used.
    k : int, optional
        Number of components each track is assigned to; defaults to 3.

    Returns
    -------
    None
    """
    n_clusters = getattr(gmm, 'n_components', 10)
    track_clusters = {i: [] for i in range(n_clusters)}
    afv = dd.io.load(HOME_DIR + 'afv_mgd.h5')

    for track, x in tqdm.tqdm(afv.items()):
        probs = gmm.predict_proba(np.array(x).reshape(1, -1)).tolist()[0]
        # Rank indices directly: looking the top values up with list.index()
        # returns the first match only, so tied probabilities would add the
        # same track to one cluster twice.
        ranked = sorted(range(len(probs)), key=probs.__getitem__, reverse=True)
        for cluster in ranked[:k]:
            track_clusters.setdefault(cluster, []).append(track)

    dd.io.save(HOME_DIR + 'track_clusters.h5', track_clusters)

In [20]:
#Equal representation from each genre
def gmm_train_set(samples_per_genre=434):
    """Build a genre-balanced training set of average feature vectors.

    The genre dataset file is read (first 9 lines skipped) and treated as a
    sequence of contiguous runs of equal ``%genre`` values.  From each run the
    first ``samples_per_genre`` track ids are taken, never reading past the
    end of the run, and their vectors are looked up in
    ``HOME_DIR + 'afv_mgd.h5'``.

    Parameters
    ----------
    samples_per_genre : int, optional
        Maximum number of tracks taken from each genre run; defaults to 434,
        the value that was previously hard-coded.

    Returns
    -------
    list
        One feature vector per selected track, in dataset order.  Track ids
        missing from the cached mapping are printed and skipped.
    """
    dataset = pd.read_csv("/home/jvidyala/msd-genre-dataset.txt", sep=",", skiprows=9)
    genres = dataset['%genre'].values
    track_ids = dataset['track_id'].values

    # Index at which each contiguous genre run starts, and where it ends.
    starts = [0] + [idx for idx in range(1, len(genres)) if genres[idx - 1] != genres[idx]]
    ends = starts[1:] + [len(genres)]

    file_choices = []
    for start, end in zip(starts, ends):
        # Clamp to the run's end so a short genre does not pull in tracks
        # that belong to the following genre.
        stop = min(start + samples_per_genre, end)
        file_choices += track_ids[start:stop].tolist()

    afv_mgd = dd.io.load(HOME_DIR + 'afv_mgd.h5')

    train_set = []
    for file in file_choices:
        try:
            train_set.append(afv_mgd[file])
        except KeyError:
            # No cached vector for this track: report it and carry on.
            print(file)

    return train_set

In [119]:
def return_predictions(track_id):
    """Rank acoustically similar tracks for ``track_id``.

    Any cached artefact missing from ``HOME_DIR`` is built first, in
    dependency order: the average feature vectors (``afv_mgd.h5``), the GMM
    (``gmm.h5``) and the per-cluster track lists (``track_clusters.h5``).
    The track lists are also rebuilt whenever the GMM had to be retrained,
    because they refer to that model's component indices.

    The candidate set is the union of the tracks listed under the query's
    most probable GMM components; candidates are ordered by the distance
    computed by ``mahalanobis_dist_set``.

    Parameters
    ----------
    track_id : str
        Id of the query track.

    Returns
    -------
    list
        Candidate track ids sorted by increasing distance to the query.
    """
    afv_path = HOME_DIR + 'afv_mgd.h5'
    gmm_path = HOME_DIR + 'gmm.h5'
    clusters_path = HOME_DIR + 'track_clusters.h5'

    # A non-empty path string is always truthy, so the file's existence has
    # to be checked explicitly.
    if not os.path.exists(afv_path):
        data_extraction()

    gmm_rebuilt = not os.path.exists(gmm_path)
    if gmm_rebuilt:
        train_set = gmm_train_set()
        genre_GMM(train_set)
    gmm = dd.io.load(gmm_path)

    # Cluster assignment needs the fitted model, so it runs after the GMM
    # is available.
    if gmm_rebuilt or not os.path.exists(clusters_path):
        track_cluster_assignment(gmm)
    track_clusters = dd.io.load(clusters_path)

    #kclusters = indices of highest prob gmm components
    kclusters = closest_clusters(gmm, track_id)

    #song_set contains all acoustically similar songs
    song_set = {i: track_clusters[i] for i in kclusters}

    distances = mahalanobis_dist_set(gmm, song_set, track_id)

    return sorted(distances, key=distances.get)

In [120]:
def mahalanobis_dist_set(gmm, song_set, track_id):
    """Mahalanobis distance from the query track to every candidate track.

    Parameters
    ----------
    gmm : fitted mixture model
        ``gmm.covariances_[cluster_id]`` supplies the covariance used for the
        candidates listed under ``cluster_id``.
    song_set : dict
        ``{cluster_id: [track_id, ...]}`` of candidate tracks.
    track_id : str
        Query track; its vector comes from ``calc_track_avg``.

    Returns
    -------
    dict
        ``{candidate_track_id: distance}``.  Candidate vectors are read from
        ``HOME_DIR + 'afv_mgd.h5'``.  A track listed under several clusters
        keeps the distance from the cluster iterated last.
    """
    x = np.array(calc_track_avg(track_id))
    afv_mgd = dd.io.load(HOME_DIR + 'afv_mgd.h5')

    distances = {}
    for cluster_id, tracks in song_set.items():
        if not tracks:
            # No candidates: nothing to measure, so skip the inversion too.
            continue

        # The inverse depends only on the cluster, so compute it once per
        # cluster instead of once per track.
        cov_inverse = np.linalg.inv(gmm.covariances_[cluster_id])

        for track in tracks:
            #Song to song Mhl dist calc
            diff = x - np.array(afv_mgd[track])
            distances[track] = np.matmul(np.matmul(diff, cov_inverse), diff) ** 0.5

    return distances