In [1]:
import os, deepdish as dd
import shutil, csv
import random, deepdish as dd, math
import fnmatch, numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.neighbors import DistanceMetric
import fileinput
%matplotlib inline

Global variables

In [2]:
# Names of the HDF5 files that initial_setup() writes with dd.io.save.
# They are bare file names, so they resolve against the current working directory.
AVG_VECS_FNAME = 'avg_vecs.h5'  # {track file: averaged feature vector}
GENRE_LOOKUP_FNAME = 'genre_lookup.h5'  # {track file: genre}
TRAIN_SETS_FNAME = 'train_sets.h5'  # {genre: list of training vectors}
GMMS_FNAME = 'gmms.h5'  # {genre with '_' for spaces: [fitted model, 0, 0]}

In [3]:
# Directory layout. Every path is derived from HOME_DIR and keeps its trailing
# '/', because other cells build file paths by plain string concatenation.
HOME_DIR = '/home/jvidyala/'
DATA_DIR = HOME_DIR + 'data/original_data/'
CLUSTER_DIR = HOME_DIR + 'data/cluster_test/'
CLUSTER2_DIR = HOME_DIR + 'data/cluster_test2/'
SAMPLE_TEST_DIR = CLUSTER2_DIR + 'test_data/'

# Several helpers open files by bare relative name, so work from CLUSTER2_DIR.
os.chdir(CLUSTER2_DIR)

Global dataset creation functions

In [4]:
def calc_avg_coeffs(dir, n_coeffs=12):
    """Build a ``{file name: averaged timbre vector}`` dictionary for a directory.

    Every ``TR*.h5`` file in ``dir`` is loaded with deepdish and its
    ``['analysis']['segments_timbre']`` array is averaged over its first axis,
    giving one mean per coefficient column.

    Parameters
    ----------
    dir : str
        Directory to scan. Files are opened through ``os.path.join(dir, name)``,
        so the result does not depend on the current working directory.
    n_coeffs : int, optional
        Number of leading coefficient columns to average. Defaults to 12, the
        value that used to be hard-coded.

    Returns
    -------
    dict
        Maps each matching file name to a list of column means. A file that
        raises ``IOError`` while loading, or whose array has no usable rows,
        keeps an empty list (the error is printed, not raised).
    """
    track_files = fnmatch.filter(os.listdir(dir), 'TR*.h5')
    avg_feature_vector = {filename: [] for filename in track_files}
    for filename in track_files:
        try:
            loaded = dd.io.load(os.path.join(dir, filename))
        except IOError as e:
            print(e)
            continue
        timbre = np.asarray(loaded['analysis']['segments_timbre'])
        if timbre.ndim != 2 or timbre.shape[0] == 0:
            print('%s: segments_timbre has unusable shape %s' % (filename, timbre.shape))
            continue
        # Average down the rows so that entry j is the mean of column j.
        # Assumes rows are segments and columns are coefficients -- TODO confirm
        # against the dataset's field description.
        avg_feature_vector[filename] = list(timbre[:, :n_coeffs].mean(axis=0))
    return avg_feature_vector

def genre_lookup(dir):
    """Return a ``{track file: genre}`` dictionary for the tracks in ``dir``.

    The genre of a track is the first entry of its
    ``['musicbrainz']['artist_mbtags']`` field. The mapping is cached in
    ``file_genres.h5`` inside ``dir``: if that file exists it is loaded and
    returned, otherwise the mapping is built from every ``TR*.h5`` file and
    saved there.

    Parameters
    ----------
    dir : str
        Directory holding the track files and the cache file. All paths are
        built with ``os.path.join(dir, ...)``, so the lookup, the read and the
        write all refer to the same location.

    Returns
    -------
    dict
        Track file name -> genre.

    Raises
    ------
    IndexError
        If a track has an empty ``artist_mbtags`` field (no first entry).
    """
    cache_path = os.path.join(dir, 'file_genres.h5')
    if os.path.isfile(cache_path):
        return dd.io.load(cache_path)

    file_genres = {}
    for track in fnmatch.filter(os.listdir(dir), 'TR*.h5'):
        tags = dd.io.load(os.path.join(dir, track))['musicbrainz']['artist_mbtags']
        file_genres[track] = tags[0]
    dd.io.save(cache_path, file_genres)
    return file_genres



GMM-related functions

In [5]:
def genre_GMM(avg_feature_vector_set,genre_mappings_set):
    """Fit one GaussianMixture per genre and return the models with their data.

    For every distinct genre in ``genre_mappings_set`` the vectors of all
    tracks mapped to that genre are collected and used to fit a
    single-component mixture with ``covariance_type='tied'``.

    Parameters
    ----------
    avg_feature_vector_set : dict
        Track -> feature vector.
    genre_mappings_set : dict
        Track -> genre. Every track in ``avg_feature_vector_set`` is looked up
        here, so a missing track raises ``KeyError``.

    Returns
    -------
    tuple(dict, dict)
        ``(gmms, train_sets)``, both keyed by genre: the fitted model and the
        list of vectors it was fitted on.
    """
    models, samples_by_genre = {}, {}
    for label in set(genre_mappings_set.values()):
        members = []
        for track, vector in avg_feature_vector_set.items():
            if genre_mappings_set[track] == label:
                members.append(vector)
        estimator = GaussianMixture(n_components=1, covariance_type='tied')
        models[label] = estimator.fit(members)
        samples_by_genre[label] = members
    return models, samples_by_genre

def mahalanobis(x,mean,cov):
    """Return the Mahalanobis distance of ``x`` from ``mean`` under ``cov``.

    ``x - mean`` must come out two-dimensional with a single row (for example
    ``x`` of shape ``(1, d)``), because the result is read from element
    ``[0][0]`` of the product ``delta * inv(cov) * delta^T``.
    """
    delta = x - mean
    precision = np.linalg.inv(cov)
    squared = np.matmul(np.matmul(delta, precision), np.transpose(delta))
    return (squared ** 0.5)[0][0]

def mahalanobis_point(x,y,cov,track):
    """Return the Mahalanobis distance between a track's vector and a point.

    Parameters
    ----------
    x : mapping
        Track -> feature vector; only ``x[track]`` is used.
    y : array-like
        The point to measure against (same length as ``x[track]``).
    cov : array-like
        Square covariance matrix; it is inverted with ``np.linalg.inv``.
    track : hashable
        Key selecting the vector in ``x``.

    Returns
    -------
    float
        ``sqrt((x[track] - y) * inv(cov) * (x[track] - y)^T)``. The value is
        returned rather than printed, so callers can store and sort on it.
    """
    # Convert first: the vectors may be plain lists, which do not support '-'.
    diff = np.asarray(x[track], dtype=float).ravel() - np.asarray(y, dtype=float).ravel()
    cov_inverse = np.linalg.inv(cov)
    return float(np.matmul(np.matmul(diff, cov_inverse), diff) ** 0.5)
    
def proximity_neighbors(gmms,avg_feature_vector,seed='TRQYNRE128F147C605.h5'):
    """Return ``{genre: distance}`` from the seed track to every genre model.

    The seed's vector is reshaped to a single row and passed to
    ``mahalanobis`` with the model's first mean (``means_[0]``) and its
    ``covariances_`` attribute.
    """
    return {
        genre: mahalanobis(
            np.array(avg_feature_vector[seed]).reshape(1,-1),
            model.means_[0],
            model.covariances_,
        )
        for genre, model in gmms.items()
    }

def file_move(fname):
    """Copy one track file from the data tree into the ``test_data/`` folder.

    ``fname`` is the track name without extension; ``.h5`` is appended. The
    source path is ``DATA_DIR/<c2>/<c3>/<c4>/<name>.h5`` where ``c2``..``c4``
    are the characters at index 2, 3 and 4 of the file name. The file is
    copied (not moved) into ``CLUSTER2_DIR + 'test_data/'``.
    """
    track_file = fname + '.h5'
    source_parts = [DATA_DIR, track_file[2], track_file[3], track_file[4], track_file]
    shutil.copy('/'.join(source_parts), CLUSTER2_DIR + 'test_data/')


def find_track_from_mfcc(gmms,vector,avg_feature_vector,file_genres=None):
    """Find the track whose stored vector equals ``vector`` and return its genre.

    Parameters
    ----------
    gmms : any
        Unused; kept so existing calls keep working.
    vector : array-like
        Vector to look up; compared with ``np.array_equal``.
    avg_feature_vector : dict
        Track -> vector. The first track whose vector matches wins.
    file_genres : dict, optional
        Track -> genre. When omitted it is loaded from ``file_genres.h5`` in
        the current working directory; pass it in to avoid re-reading the file
        when calling this function in a loop.

    Returns
    -------
    tuple
        ``(genre, track)``.

    Raises
    ------
    ValueError
        If no stored vector equals ``vector``. Without this check the last
        track iterated would be reported as a match.
    """
    for key, val in avg_feature_vector.items():
        if np.array_equal(val, vector):
            break
    else:
        raise ValueError('no track in avg_feature_vector matches the given vector')
    if file_genres is None:
        file_genres = dd.io.load('file_genres.h5')
    return file_genres[key], key

def calc_track_avg(file,dir,n_coeffs=12):
    """Return ``{file: averaged timbre vector}`` for a single track file.

    The ``['analysis']['segments_timbre']`` array of ``dir/file`` is averaged
    over its first axis, giving one mean per coefficient column.

    Parameters
    ----------
    file : str
        Track file name.
    dir : str
        Directory containing the file; joined with ``os.path.join``, so a
        trailing slash is optional.
    n_coeffs : int, optional
        Number of leading coefficient columns to average. Defaults to 12, the
        value that used to be hard-coded.

    Returns
    -------
    dict
        A one-entry dictionary ``{file: [mean_0, ..., mean_{n_coeffs-1}]}``.

    Raises
    ------
    ValueError
        If the array is not two-dimensional or has no rows.
    """
    timbre = np.asarray(dd.io.load(os.path.join(dir, file))['analysis']['segments_timbre'])
    if timbre.ndim != 2 or timbre.shape[0] == 0:
        raise ValueError('%s: segments_timbre has unusable shape %s' % (file, timbre.shape))
    # Average down the rows so that entry j is the mean of column j.
    # Assumes rows are segments and columns are coefficients -- TODO confirm.
    return {file: list(timbre[:, :n_coeffs].mean(axis=0))}

In [6]:
def priority_calc(play_counts,alpha=0.1,offset=10):
    """Map a play count to a priority in [0, 1] with a logistic curve.

    Computes ``1 / (1 + exp(-alpha * (play_counts - offset)))``: 0.5 when
    ``play_counts == offset``, rising towards 1 for larger counts (with
    positive ``alpha``).

    Parameters
    ----------
    play_counts : number
        Play count of the track.
    alpha : float, optional
        Steepness of the curve.
    offset : number, optional
        Play count at which the priority is 0.5.

    Returns
    -------
    float
        The priority. When the exponential is too large for a float the limit
        value 0.0 is returned instead of raising ``OverflowError``.
    """
    try:
        y = math.exp(-1*alpha*(play_counts-offset))
    except OverflowError:
        # exp() overflowed, so 1/(1+y) is indistinguishable from 0.
        return 0.0
    return 1/(1+y)
    
def weighted_randomizer(priorities):
    """Pick a key at random with probability proportional to its weight.

    Parameters
    ----------
    priorities : dict
        Key -> non-negative weight.

    Returns
    -------
    key or None
        The selected key. ``None`` is returned only when there is no key with
        a positive weight (for example an empty dictionary).

    Notes
    -----
    ``random.uniform(0, total)`` may return ``total`` itself, and repeated
    subtraction can leave a remainder of exactly 0; in both cases the loop
    finishes without a pick. The last positively weighted key is then
    returned, instead of ``None``.
    """
    total = sum(priorities.values())
    remaining = random.uniform(0, total)
    fallback = None
    for key, weight in priorities.items():
        remaining -= weight
        if remaining < 0:
            return key
        if weight > 0:
            fallback = key
    return fallback

Initial calculations & base file storage

In [7]:
def initial_setup(dir):
    """Compute the base artefacts for ``dir`` and store them as HDF5 files.

    Runs ``calc_avg_coeffs`` and ``genre_lookup`` on ``dir``, fits the
    per-genre models with ``genre_GMM``, and saves four files with
    ``dd.io.save``: the averaged vectors, the genre lookup, the training sets
    and the models. The file names are bare, so they are written to the
    current working directory. Returns nothing.
    """
    avg_vectors = calc_avg_coeffs(dir)
    file_genres = genre_lookup(dir)
    gmms, train_sets = genre_GMM(avg_vectors, file_genres)

    # Stored model entries are [model, 0, 0], keyed by the genre name with
    # spaces replaced by underscores.
    # NOTE(review): the meaning of the two zeros is not visible here; the key
    # rewrite is presumably needed for storage -- confirm.
    modified_gmms = {
        genre.replace(' ', '_'): [model, 0, 0]
        for genre, model in gmms.items()
    }

    save = dd.io.save
    save(AVG_VECS_FNAME, avg_vectors)
    save(GENRE_LOOKUP_FNAME, file_genres)
    save(TRAIN_SETS_FNAME, train_sets)
    save(GMMS_FNAME, modified_gmms)

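Recommendation generation: builds a playlist (currently 10 randomly chosen tracks; reading it from a text file is not implemented yet) and computes recommendations for each song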

In [39]:
def recommendations(playlist=None, n_recs=10):
    """Compute the nearest training tracks for every song of a playlist.

    The stored artefacts (averaged vectors, training sets, models) are loaded
    with ``dd.io.load``. For each song the closest cluster's training set is
    obtained from ``closest_clusters``; every vector in it is mapped back to
    its track and genre with ``find_track_from_mfcc`` and ranked by its
    Mahalanobis distance to the song's averaged vector, using the covariance
    of that track's genre model.

    Parameters
    ----------
    playlist : iterable of str, optional
        Track file names (including ``.h5``) located in ``CLUSTER2_DIR``. When
        omitted, 10 tracks are drawn at random (with replacement) from the
        ``TR*.h5`` files in ``CLUSTER2_DIR``.
    n_recs : int, optional
        Number of recommendations to keep per song (default 10).

    Returns
    -------
    dict
        Song -> list of up to ``n_recs`` track names, nearest first.

    Notes
    -----
    NOTE(review): ``closest_clusters`` is not defined in the visible cells of
    this notebook; it is assumed to take ``(track, gmms, avg_vectors,
    train_sets)`` and return ``(neighbor_clusters, train_set)`` -- confirm.
    """
    avg_vectors = dd.io.load(AVG_VECS_FNAME)
    train_sets = dd.io.load(TRAIN_SETS_FNAME)
    gmms = dd.io.load(GMMS_FNAME)

    if playlist is None:
        candidates = fnmatch.filter(os.listdir(CLUSTER2_DIR),'TR*.h5')
        playlist = [random.choice(candidates) for _ in range(10)]

    results = {}
    for song in playlist:
        if song in results:
            # A random playlist may repeat a song; its answer is already known.
            continue
        _neighbor_clusters, train_set = closest_clusters(song, gmms, avg_vectors, train_sets)
        # calc_track_avg returns {song: vector}; mahalanobis() wants a single row.
        song_row = np.array(calc_track_avg(song, CLUSTER2_DIR)[song]).reshape(1, -1)

        dist = {}
        for train_track in train_set:
            train_track_genre, train_track_name = find_track_from_mfcc(gmms, train_track, avg_vectors)
            # initial_setup stores models as [model, 0, 0] under the genre
            # name with spaces replaced by underscores; accept both layouts.
            stored_key = train_track_genre.replace(' ', '_')
            entry = gmms[stored_key] if stored_key in gmms else gmms[train_track_genre]
            model = entry[0] if isinstance(entry, (list, tuple)) else entry
            dist[train_track_name] = mahalanobis(song_row, np.array(train_track), model.covariances_)

        results[song] = sorted(dist, key=dist.get)[:n_recs]
    return results
        
        

In [40]:
# Rebuild the stored artefacts from the tracks in CLUSTER2_DIR. The working
# directory is set first because initial_setup() saves its output files under
# bare file names, which resolve against the current working directory.
os.chdir(CLUSTER2_DIR)
initial_setup(CLUSTER2_DIR)

error: (25, 'Inappropriate ioctl for device')