In [1]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem; force_remount=True asks for a fresh mount on every run.
drive.mount('/content/drive/', force_remount=True)
# Project root on Drive. Note the trailing slash: later cells append '/data/...', giving a
# doubled '//' in the resulting paths.
DRIVE_PATH='/content/drive/MyDrive/CS_DS541/'

Mounted at /content/drive/


Cleaning the songs CSV into a set of features

In [2]:

import dask
import dask.dataframe as dd
import dask.array as da
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from tqdm.keras import TqdmCallback

In [3]:
import tensorflow as tf
from tensorflow.keras import layers
# Shared early-stopping callback: watches 'val_accuracy' (higher is better) and stops a fit
# after 3 consecutive epochs without an improvement of at least 0.0005.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',min_delta=0.0005,patience=3,mode='max')
# Enable XLA JIT compilation in TensorFlow's graph optimizer for the whole session.
tf.config.optimizer.set_jit(True)

In [4]:
import scipy.cluster.hierarchy as shc
from scipy.spatial.distance import pdist
import seaborn as sns

In [5]:

# Define Features
# Raw per-track columns: nine audio descriptors plus the track duration.
audioFeatures = ['danceability','energy','loudness','speechiness','acousticness','instrumentalness','liveness','valence','tempo']
features = [ 'duration_ms' ] + audioFeatures
# Every raw feature has a standardized companion column named '<feature>_tf'.
featTF = [feature+'_tf' for feature in features]
try:
  # Fast path: reuse the cached parquet that already contains the '_tf' columns.
  embed = dd.read_parquet(DRIVE_PATH+'/data/embed.pq')[featTF]
except Exception as err:
  # `except Exception` (not a bare `except:`) so KeyboardInterrupt / SystemExit still
  # propagate; the failure is reported instead of being silently swallowed.
  print(f'Could not load cached embed.pq ({err!r}); rebuilding it from feat_parquet.pq')
  feat = dd.read_parquet(DRIVE_PATH+'/data/feat_parquet.pq')
  # Queue Transform
  # Lazily z-score each feature: (value - column mean) / column std.
  for feature in features:
    feat[feature+'_tf'] = (feat[feature] - feat[feature].mean())/feat[feature].std()
  # Persist the full frame (raw + '_tf' columns) so the next run takes the fast path.
  feat.to_parquet(DRIVE_PATH+'/data/embed.pq')
  embed = feat[featTF]

In [6]:
# Load the raw track table; it supplies the 'popularity' and 'genre' columns used below.
raw = dd.read_parquet(DRIVE_PATH+'/data/raw_parquet.pq')

# (Disabled experiment: encode the tuple of artist names as an integer 'art_code' feature.)
# raw['art_tup'] = raw.artists.map(lambda art: tuple(a['name'] for a in art) if art is not None else None,meta=(object,object))
# arts = raw.art_tup.unique().compute()
# art_map = dict(zip(arts.values,np.arange(arts.shape[0])))
# embed['art_code'] = raw.art_tup.map(lambda art: art_map[art],meta=(object,'uint8'))
# NOTE(review): assigning columns of `raw` onto `embed` assumes both frames share the same
# index / partitioning so rows line up — confirm.
embed['popularity'] = raw.popularity
# Standardize popularity the same way as the '_tf' features: (value - mean) / std.
embed['popularity'] = (embed.popularity-embed.popularity.mean())/embed.popularity.std()

# Extend the model feature list with 'popularity'. This rebinds featTF, so re-running the
# cell without re-running the cell that defines featTF appends 'popularity' a second time.
featTF = featTF + ['popularity'] #+ ['art_code']

embed['genre'] = raw.genre
# Distinct genres, taken from the index of a per-genre mean (only the index is used here).
gen_idx = embed.groupby('genre').mean().index
# genre -> integer code (position of the genre within gen_idx).
gen_map = dict(zip(gen_idx,np.arange(len(gen_idx))))
# id2gen = dict(zip(np.arange(len(gen_idx),gen_idx)))
# Attach the integer genre code to every row and keep the result materialized via persist().
# NOTE(review): meta declares an object dtype although gen_map values come from np.arange — confirm intended.
embed['gen_code'] = embed.genre.map(lambda gen: gen_map[gen],meta=(object,object)).persist()

In [7]:
def getSplit(df,x_col,y_col):
  """Materialize a lazy frame and extract input / target arrays.

  Args:
    df: object exposing ``.compute()`` that returns a pandas-style frame.
    x_col: column selector for the model inputs.
    y_col: column selector for the targets.

  Returns:
    Tuple ``(x, y)`` of NumPy arrays obtained via ``.to_numpy()``.
  """
  materialized = df.compute()
  inputs, targets = (materialized[selector].to_numpy() for selector in (x_col, y_col))
  return inputs, targets
  
def stepTransform(arr,steps):
  """Repeat every row of a 2-D array ``steps`` times along a new middle axis.

  Args:
    arr: array with at least two dimensions; ``arr.shape[1]`` is the feature width.
    steps: number of copies of each row.

  Returns:
    Array reshaped to ``(-1, steps, arr.shape[1])`` where each slice along the
    first axis holds ``steps`` identical copies of one input row.
  """
  width = arr.shape[1]
  # Each row is duplicated `steps` times consecutively before regrouping.
  repeated_rows = arr.repeat(steps, 0)
  return repeated_rows.reshape(-1, steps, width)

In [8]:
# PCA Reduce
# One row per genre: the mean of each column of `embed` that groupby().mean() aggregates.
arr = embed.groupby('genre').mean().compute().to_numpy()

# Covariance between columns (np.cov treats rows as variables, hence the transpose).
cov_gen = np.cov(arr.T)
# A covariance matrix is symmetric, so use eigh: it guarantees real eigenvalues and
# orthonormal eigenvectors, whereas the general-purpose eig can return complex values
# from round-off.
v,w = np.linalg.eigh(cov_gen)
# Order components by decreasing eigenvalue (explained variance).
idx = np.argsort(v)[::-1]
v=v[idx]
w=w[:,idx]
expl = v/v.sum()
# 0-based index of the first component at which cumulative explained variance exceeds 95%.
# NOTE(review): the number of components needed is k+1, and k is not used to truncate `w` below.
k = (np.cumsum(expl)>0.95).argmax()

# Project the genre means onto all principal axes (no components are dropped here).
arr_tf = arr @ w

In [9]:
# Number of items being clustered: one row of arr_tf per genre. Derived from the data
# instead of the previously hard-coded 5238.
n_genres = arr_tf.shape[0]
k = 2
# Largest exponent c such that k**c does not exceed the number of genres.
c = int(np.emath.logn(k,n_genres))
# Even numbers in [2, n_genres), descending.
evens = [i for i in np.arange(2,n_genres) if i%2 == 0][::-1]
# Powers of k (k**c ... k**1), descending; despite the name these are not squares.
squares = [k**i for i in range(1,c+1)][::-1]
# Successive halvings of the genre count (n/2, n/4, ...), rounded up; float array.
halves = np.ceil([n_genres*((0.5)**(i)) for i in range(1,c+1)])

# Cluster reduced genres
# Condensed pairwise cosine distances between genre vectors.
dist = pdist(arr_tf,'cosine')
# Hierarchical linkages under four merge criteria: single, complete, average, weighted.
Zs,Zc,Za,Zw = shc.single(dist),shc.complete(dist),shc.average(dist),shc.weighted(dist)

In [10]:
# Cluster-count schedules, keyed by display label.
groupLabels = ['Squares','Halves']
groups = {label: schedule for label, schedule in zip(groupLabels, [squares,halves])}
# Linkage matrices, keyed by display label.
functionLabels = ['Complete','Average','Weighted','single']
functions = [Zc,Za,Zw,Zs]
funcs = {label: linkage for label, linkage in zip(functionLabels, functions)}
# Genre labels taken from a per-genre row count.
# NOTE(review): assumed to be in the same order as the rows that were clustered — confirm.
idx = embed.groupby('genre').size().compute().index
# trees[linkage label][schedule label] -> delayed cut_tree call; nothing is computed
# until .compute() is invoked on an entry.
trees = {
  link_label: {
    sched_label: dask.delayed(shc.cut_tree)(link_matrix, sched)
    for sched_label, sched in groups.items()
  }
  for link_label, link_matrix in funcs.items()
}

In [30]:
# Linkage method / cluster-count schedule selected for the label hierarchy.
f,g = 'Complete','Squares'
# Evaluate the delayed cut_tree and reverse its column order; rows are indexed by genre.
tree = pd.DataFrame(trees[f][g].compute()[:,::-1],index=idx)
tiers = len(tree.columns)
# Extra final tier that gives every genre its own id. Sized from the frame itself
# rather than the previously hard-coded 5238.
tree[tiers] = np.arange(len(tree))
# maxs[0]   = largest label in tier 0, plus 1.
# maxs[i+1] = largest number of distinct tier-(i+1) labels found under any single tier-i label, plus 1.
maxs = np.array([tree[0].max()]+[tree.groupby(i)[i+1].nunique().max() for i in range(tiers)])+1
max_class = np.max(maxs)
print(maxs,max_class)
# Per-row target vector: the genre's label in every tier, reduced element-wise modulo maxs.
# NOTE(review): the modulo can map distinct sibling clusters to the same class id — confirm intended.
embed['y']=embed.genre.map(lambda gen: np.mod(tree.loc[gen].to_numpy(),maxs),meta=(object,object)).persist()
# 80/10/10 train/val/test split. random_state pins the split so reruns are reproducible.
# Each split is materialized into (feature matrix, stacked target matrix).
dTrain,dVal,dTest = [(part[featTF].to_numpy(),np.vstack(part.y.to_numpy())) for part in [lazy.compute() for lazy in embed.random_split([0.8,0.1,0.1],random_state=0)]]

[ 2  4  4  4  5  6  5  6  6  6  7 10 13] 13


In [27]:
def makeLayer(input,name,units=10,frac=0.1):
  """Apply a ReLU Dense layer followed by batch normalization.

  Args:
    input: tensor the block is applied to.
    name: suffix for the layer names ('D_<name>' and 'BN_<name>').
    units: width of the Dense layer.
    frac: accepted but not used by this function.

  Returns:
    The batch-normalized output tensor.
  """
  dense = layers.Dense(units,activation='relu',name=f'D_{name}')
  batch_norm = layers.BatchNormalization(name=f'BN_{name}')
  return batch_norm(dense(input))


def addStage(m_in,input,output,layer,size):
  """Stack one more classification stage on top of the previous stage.

  The new stage sees the previous stage's input concatenated with the previous
  stage's class probabilities, passes that through two 256-unit makeLayer
  blocks, and ends in a `size`-way softmax head.

  Args:
    m_in: the model's original input tensor (used as the new Model's input).
    input: tensor that was fed to the previous stage.
    output: probability tensor produced by the previous stage.
    layer: stage identifier used to build unique layer names.
    size: number of output classes for this stage.

  Returns:
    Tuple (model, input, output): the compiled model ending at this stage, the
    concatenated tensor this stage consumed, and this stage's probabilities.
  """
  # Concatenate along the feature axis so the stage also sees the earlier prediction.
  input = tf.concat([input,output],1)

  x = makeLayer(input,f'{layer}_1',256)
  x = makeLayer(x,f'{layer}_2',256)

  # NOTE(review): relu is applied to the head before the softmax, clamping logits at 0 — confirm intended.
  head = layers.Dense(size,activation='relu',name=f'out_{layer}')(x)
  # Softmax is computed in float32 explicitly.
  output = layers.Activation('softmax',dtype='float32',name=f'p_{layer}')(head)

  model = tf.keras.Model(m_in,output)
  # metrics is given as a list: that is the documented form, and newer Keras
  # releases reject a bare string.
  model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
  return model, input, output

In [None]:
# Stage counter (used for layer names and maxs lookup) and target-column counter.
layer = 0
col = 0
n=0


# 0 Design
# Input width comes from the training matrix instead of a hard-coded 11, and is passed
# as a real tuple: `(11)` is just the int 11, which newer Keras rejects as a shape.
n_features = dTrain[0].shape[1]
in_0 = layers.Input(shape=(n_features,),name='Model_Input')

# First hidden block keeps its original width of 11 units.
x = makeLayer(in_0,f'{layer}_0',11)
x = makeLayer(x,f'{layer}_1',256)
x = makeLayer(x,f'{layer}_2',256)

# NOTE(review): relu is applied to the head before the softmax, clamping logits at 0 — confirm intended.
output = layers.Activation('softmax',dtype='float32',name=f'p_{layer}')(layers.Dense(maxs[layer],activation='relu',name=f'out_{layer}')(x))

# 0 Compile
model = tf.keras.Model(in_0,output)
# metrics as a list: the documented form, required by newer Keras releases.
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

# 0 Fit
# Targets for stage 0 are column 0 of the stacked label matrix.
tx,ty = dTrain[0], dTrain[1][:,col]
vx,vy = dVal[0], dVal[1][:,col]
model.fit(tx,ty,batch_size=256,epochs=5,validation_data=(vx,vy),callbacks=[early_stopping,TqdmCallback(verbose=1)], verbose=0)
layer += 1
col += 1

# 1+ Generate, Compile, and Fit additional stages
# Each new model extends the same graph, so it contains all earlier stages' layers;
# nothing in this cell freezes them.
# NOTE(review): the loop trains target columns 1..tiers-1; the final label column
# (index `tiers`) is never used as a target — confirm intended.
i,o = in_0, output
for n in range(1,tiers):
  model, i, o = addStage(in_0,i,o,layer,maxs[layer])
  ty = dTrain[1][:,col]
  vy = dVal[1][:,col]
  model.fit(tx,ty,batch_size=512,epochs=10,validation_data=(vx,vy),callbacks=[early_stopping,TqdmCallback(verbose=1)], verbose=0)
  layer +=1
  col += 1

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]