In [1]:
from google.colab import drive
# Mount Google Drive under /content/drive/, forcing a remount if it is already mounted.
drive.mount('/content/drive/', force_remount=True)
# Project root on the mounted drive. NOTE: this value ends with '/', and later cells
# append paths that start with '/data/...', so the joined paths contain '//'.
DRIVE_PATH='/content/drive/MyDrive/CS_DS541/'

Mounted at /content/drive/


Cleaning the songs CSV into a set of features

In [2]:

import dask
import dask.dataframe as dd
import dask.array as da
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from tqdm.keras import TqdmCallback

In [3]:
import tensorflow as tf
from tensorflow.keras import layers
# Shared early-stopping callback: watches validation accuracy (higher is better) and
# stops after 3 epochs without an improvement of at least 0.0005.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    min_delta=0.0005,
    patience=3,
)
# Turn on the TensorFlow optimizer's JIT setting for the whole session.
tf.config.optimizer.set_jit(True)

In [4]:
import scipy.cluster.hierarchy as shc
from scipy.spatial.distance import pdist
import seaborn as sns

In [5]:

# Define Features
audioFeatures = ['danceability','energy','loudness','speechiness','acousticness','instrumentalness','liveness','valence','tempo']
features = [ 'duration_ms' ] + audioFeatures
# Names of the standardised ("_tf") version of every feature column.
featTF = [feature+'_tf' for feature in features]

# Use the cached standardised features when they exist; otherwise rebuild them
# from the raw feature table and write the cache for the next run.
try:
  embed = dd.read_parquet(DRIVE_PATH+'/data/embed.pq')[featTF]
except Exception as err:
  # `except Exception` (not a bare `except`) so KeyboardInterrupt/SystemExit still
  # propagate; the reason for the rebuild is reported instead of silently swallowed.
  print(f'Cached embedding unavailable ({err!r}); rebuilding from feat_parquet.pq')
  feat = dd.read_parquet(DRIVE_PATH+'/data/feat_parquet.pq')
  # Queue Transform: z-score every feature (lazy until written/computed)
  for feature in features:
    feat[feature+'_tf'] = (feat[feature] - feat[feature].mean())/feat[feature].std()
  feat.to_parquet(DRIVE_PATH+'/data/embed.pq')
  embed = feat[featTF]

In [6]:
raw = dd.read_parquet(DRIVE_PATH+'/data/raw_parquet.pq')

# raw['art_tup'] = raw.artists.map(lambda art: tuple(a['name'] for a in art) if art is not None else None,meta=(object,object))
# arts = raw.art_tup.unique().compute()
# art_map = dict(zip(arts.values,np.arange(arts.shape[0])))
# embed['art_code'] = raw.art_tup.map(lambda art: art_map[art],meta=(object,'uint8'))

# Attach popularity from the raw table and z-score it like the other features.
embed['popularity'] = raw.popularity
embed['popularity'] = (embed.popularity-embed.popularity.mean())/embed.popularity.std()

# Guarded append: re-running this cell must not add 'popularity' to the feature list twice.
if 'popularity' not in featTF:
  featTF = featTF + ['popularity'] #+ ['art_code']

embed['genre'] = raw.genre
# Materialise the distinct genres once. `size()` only needs the grouping key, so it
# does not depend on which other columns `embed` currently holds.
gen_idx = embed.groupby('genre').size().compute().index
# genre name -> integer class id (position in the grouped genre index)
gen_map = dict(zip(gen_idx,np.arange(len(gen_idx))))
# id2gen = dict(zip(np.arange(len(gen_idx)),gen_idx))
# NOTE(review): meta=(object,object) declares an object-dtype series although the
# lambda returns integer ids; left unchanged here because later cells group `embed`
# and may depend on how this column is typed -- confirm before tightening.
embed['gen_code'] = embed.genre.map(lambda gen: gen_map[gen],meta=(object,object)).persist()

In [7]:
def getSplit(df,x_col,y_col):
  """Materialise a lazy frame and return its inputs and targets as NumPy arrays.

  Args:
    df: object exposing ``.compute()`` that returns a frame indexable by column.
    x_col: column selector for the model inputs.
    y_col: column selector for the targets.

  Returns:
    Tuple ``(x, y)`` holding ``to_numpy()`` of the selected columns.
  """
  materialized = df.compute()
  return materialized[x_col].to_numpy(), materialized[y_col].to_numpy()
  
def stepTransform(arr,steps):
  """Repeat every row of a 2-D array ``steps`` times along a new middle axis.

  Args:
    arr: 2-D array exposing ``repeat``, ``reshape`` and ``shape``.
    steps: number of copies of each row.

  Returns:
    Array reshaped to ``(-1, steps, arr.shape[1])`` in which each original row
    appears ``steps`` times consecutively.
  """
  width = arr.shape[1]
  tiled = arr.repeat(steps,0)
  return tiled.reshape(-1,steps,width)

In [8]:
hidden_units = 256

# Layer sizes come from the data instead of the hard-coded 11 features / 5238 genres,
# so the baseline keeps working if the feature list or genre set changes.
n_features = len(featTF)
n_classes = len(gen_map)

# Baseline: three ReLU hidden layers with light dropout, softmax over every genre.
dense = tf.keras.Sequential([
    layers.Input(shape=(n_features,)),
    layers.Dense(hidden_units,'relu'),
    layers.Dropout(0.1),
    layers.Dense(hidden_units,'relu'),
    layers.Dropout(0.1),
    layers.Dense(hidden_units,'relu'),
    layers.Dropout(0.1),
    layers.Dense(n_classes,'softmax')
])

# `metrics` is passed as a list, the form accepted by every Keras version.
dense.compile('adam','sparse_categorical_crossentropy',metrics=['accuracy'])

# 80/10/10 train/validation/test split. Each split is materialised exactly once
# through getSplit (the original computed features and labels separately).
split_cols = featTF + ['gen_code']
train_df,val_df,test_df = embed.random_split([0.8,0.1,0.1])
x_train,y_train = getSplit(train_df[split_cols],featTF,'gen_code')
x_val,y_val = getSplit(val_df[split_cols],featTF,'gen_code')
x_test,y_test = getSplit(test_df[split_cols],featTF,'gen_code')

dense.fit(x_train,y_train,
          batch_size=512,epochs=30,
          validation_data=(x_val,y_val),
          callbacks=[early_stopping,TqdmCallback(verbose=1)],
          verbose=0)
# Last expression: [loss, accuracy] on the held-out split.
dense.evaluate(x_test,y_test)

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]



[5.386788368225098, 0.13312360644340515]

In [9]:
def makeModel(input_dim=11,seq=1,maxClass=5238,unit_size=64,mode='GRU'):
  i = layers.Input(shape=(seq,input_dim))

  match mode:
    case 'GRU':
      x = layers.Bidirectional(layers.GRU(unit_size*2,return_sequences=True))(i)
    case 'LSTM':
      x = layers.Bidirectional(layers.LSTM(unit_size*2,return_sequences=True))(i)
    case 'RNN':
      x = layers.Bidirectional(layers.RNN(unit_size*2,return_sequences=True))(i)

  x = layers.Dense(unit_size,'relu')(x)
  x = layers.Dropout(0.1)(x)
  x = layers.Dense(unit_size,'relu')(x)
  x = layers.Dropout(0.1)(x)


  o = layers.Dense(maxClass,'softmax')(x)

  m = tf.keras.Model(i,o)
  pm = tf.keras.Model(i,tf.argmax(o,-1))
  m.compile('adam','sparse_categorical_crossentropy',metrics='accuracy')
  pm.compile('adam','sparse_categorical_crossentropy',metrics='accuracy')
  return m, pm

In [10]:
# PCA Reduce
# One row per genre: the mean of each model feature. The selection is explicit so
# label/helper columns that other cells add to `embed` never enter the covariance.
arr = embed.groupby('genre')[featTF].mean().compute().to_numpy()

cov_gen = np.cov(arr.T)
# The covariance matrix is symmetric, so use eigh: it guarantees real eigenvalues
# and orthonormal eigenvectors (eig can return complex values from round-off).
v,w = np.linalg.eigh(cov_gen)
# eigh returns eigenvalues in ascending order; reorder to descending variance.
order = np.argsort(v)[::-1]
v=v[order]
w=w[:,order]
expl = v/v.sum()
# Index of the first component at which cumulative explained variance exceeds 95%.
# NOTE: the projection below keeps every component (no truncation to k), and the
# next cell reassigns `k`.
k = (np.cumsum(expl)>0.95).argmax()

# Rotate the genre means onto the principal axes.
arr_tf = arr @ w

In [11]:
k = 2
# Number of genres being clustered, taken from the data instead of the literal 5238.
n_genres = arr_tf.shape[0]
# Largest power of k that does not exceed the number of genres.
c = int(np.emath.logn(k,n_genres))
evens = list(np.arange(2,n_genres,2)[::-1])
# Candidate cluster counts per tier, largest first:
#   squares: k**c, ..., k**2, k
#   halves : ceil(n/2), ceil(n/4), ..., ceil(n/2**c)
squares = [k**i for i in range(1,c+1)][::-1]
halves = np.ceil([n_genres*((0.5)**(i)) for i in range(1,c+1)])

# Cluster reduced genres
dist = pdist(arr_tf,'cosine')
# Hierarchical linkages over the same condensed cosine-distance matrix.
Zs,Zc,Za,Zw = shc.single(dist),shc.complete(dist),shc.average(dist),shc.weighted(dist)

In [12]:
# Tier-size schedules, keyed by display label.
groupLabels = ['Squares','Halves']
groups = {groupLabels[0]: squares, groupLabels[1]: halves}

# Linkage matrices, keyed by display label (same order as `functions`).
functionLabels = ['Complete','Average','Weighted','single']
functions = [Zc,Za,Zw,Zs]
funcs = {label: linkage for label, linkage in zip(functionLabels,functions)}

# Genre index used to label the rows of every cut tree.
idx = embed.groupby('genre').size().compute().index

# trees[linkage][spacing] is a lazy cut_tree call; nothing is computed until
# `.compute()` is invoked on an entry.
trees = {
    linkage_name: {
        spacing_name: dask.delayed(shc.cut_tree)(linkage_matrix,cluster_counts)
        for spacing_name, cluster_counts in groups.items()
    }
    for linkage_name, linkage_matrix in funcs.items()
}

In [13]:
# Train one sequence model per (linkage, spacing) pair and report per-tier accuracy.
for f in functionLabels:
  for g in groupLabels:
    print(f'Beginning {g} spacing w/ {f} linkage')
    # Cluster counts were supplied largest-first; reversing the columns makes the
    # tiers run from the coarsest cut to the finest.
    tree = pd.DataFrame(trees[f][g].compute()[:,::-1],index=idx)
    tiers = len(tree.columns)
    # Final tier: every genre is its own class (row count instead of the literal 5238).
    tree[tiers] = np.arange(len(tree))
    # Per-tier modulus: clusters at the top tier, then the largest number of distinct
    # children under any parent at each following tier, plus one.
    maxs = np.array([tree[0].max()]+[tree.groupby(i)[i+1].nunique().max() for i in range(tiers)])+1
    max_class = np.max(maxs)
    print(maxs,max_class)

    # Pre-compute every genre's label vector once instead of running `tree.loc[gen]`
    # for each row of the dataset; the resulting values are identical.
    label_lookup = dict(zip(tree.index,np.mod(tree.to_numpy(),maxs)))
    embed['y']=embed.genre.map(lambda gen: label_lookup[gen],meta=(object,object)).persist()

    # 80/10/10 split; inputs are repeated once per tier so each step sees the same features.
    (tx,ty),(vx,vy),(ex,ey) = [(stepTransform(df[featTF].to_dask_array(True),tiers+1),df.y.to_dask_array(True)) for df in embed.random_split([0.8,0.1,0.1])]

    # Input width follows the feature list instead of the literal 11.
    m,pm = makeModel(len(featTF),tiers+1,max_class,hidden_units)

    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
      filepath=DRIVE_PATH+'/models/RNN/GRU/'+f'{f}_{g}',
      save_weights_only=True,
      monitor='val_accuracy',
      mode='max',
      save_best_only=True)

    fit = m.fit(tx.compute(),np.vstack(ty.compute()),
                batch_size=512,epochs=10,
                validation_data=(vx.compute(),np.vstack(vy.compute())),
                callbacks=[model_checkpoint_callback,early_stopping,TqdmCallback(verbose=1)],
                verbose=0)

    # Accuracy per tier, then overall, on the held-out split.
    p = pm.predict(ex.compute())
    acc = (p==np.vstack(ey.compute()))
    print(acc.mean(0),acc.mean())


Beginning Squares spacing w/ Complete linkage
[ 2  4  4  4  5  6  5  6  6  6  7 10 13] 13


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

[0.81479433 0.69807683 0.69390496 0.6161249  0.52442281 0.50064
 0.45460723 0.39986726 0.36592342 0.33286452 0.2627803  0.18425752
 0.13550671] 0.4602900615447825
Beginning Halves spacing w/ Complete linkage
[ 2  3  3  4  4  4  5  5  6  7  6  7 41] 41


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

[0.81529957 0.72500473 0.75839476 0.67399042 0.60955711 0.55701506
 0.46723997 0.44106344 0.4008379  0.32931708 0.31093681 0.249055
 0.08338058] 0.49393018623787854
Beginning Squares spacing w/ Average linkage
[ 2  3  4  4  5  7  6  7 11 11 12 29 34] 34


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

[0.85593207 0.74850224 0.73894174 0.67319758 0.65803915 0.6024373
 0.54048274 0.49667427 0.40220143 0.3315984  0.25547606 0.13161412
 0.08573001] 0.5016020853114964
Beginning Halves spacing w/ Average linkage
[  2   3   5   3   4   5   6   7   9   8  12  17 102] 102


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

[0.8565713  0.77953831 0.73957705 0.76829772 0.68969295 0.64063443
 0.58678119 0.52676785 0.45591084 0.40877338 0.30316261 0.20931636
 0.075874  ] 0.5416075367175611
Beginning Squares spacing w/ Weighted linkage
[ 2  4  5  5  4  5  5  6  7  7 11 23 26] 26


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

[0.85876879 0.70178583 0.62970823 0.65116645 0.62566811 0.5402597
 0.44798151 0.41564485 0.37257121 0.33809659 0.25086462 0.14066528
 0.10226058] 0.46734167235016133
Beginning Halves spacing w/ Weighted linkage
[ 2  3  3  4  4  4  5  6  6  7  7 12 93] 93


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

[0.85945937 0.72018189 0.74629741 0.62164082 0.59446743 0.58444122
 0.49310007 0.44217956 0.39956422 0.34196482 0.30774939 0.20849781
 0.07102031] 0.4915818699884131
Beginning Squares spacing w/ single linkage
[   2    4    6   10   18   34   65  129  250  481  934 1369  372] 1369


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

[0.99971493 0.99938234 0.99935067 0.99835292 0.99692756 0.99228723
 0.98428938 0.96511039 0.91804187 0.81912198 0.56445789 0.23676792
 0.0796142 ] 0.811801482860366
Beginning Halves spacing w/ single linkage
[   2    3    4    7   12   22   43   84  161  313  601 1179 1384] 1384


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

[0.99974809 0.99944895 0.99941746 0.99883492 0.99812643 0.99436354
 0.98923089 0.97753287 0.9545934  0.89422971 0.76618122 0.45587656
 0.09276549] 0.8554115018257347


In [None]:
def makeLayer(input,name,units=10,frac=0.1):
  """Apply a named ReLU Dense layer followed by a named BatchNormalization layer.

  Args:
    input: tensor fed to the Dense layer.
    name: suffix for the layer names (``D_<name>`` and ``BN_<name>``).
    units: width of the Dense layer.
    frac: accepted but not used by this function.

  Returns:
    The batch-normalised output tensor.
  """
  dense_layer = layers.Dense(units,activation='relu',name=f'D_{name}')
  batch_norm = layers.BatchNormalization(name=f'BN_{name}')
  return batch_norm(dense_layer(input))


def addStage(m_in,input,output,layer,size):
  """Append one classification stage that sees the previous stage's input and output.

  Args:
    m_in: the model's root input tensor (used as the new model's input).
    input: tensor that fed the previous stage.
    output: probability tensor produced by the previous stage.
    layer: stage number, used in the layer names.
    size: number of classes predicted by this stage.

  Returns:
    Tuple ``(model, input, output)``: the compiled model ending at this stage, the
    concatenated tensor that fed this stage, and this stage's probability tensor.
  """
  # Keras layer instead of a raw tf.concat so the merge is a proper graph layer.
  input = layers.Concatenate(axis=1,name=f'C_{layer}')([input,output])

  x = makeLayer(input,f'{layer}_1',256)
  x = makeLayer(x,f'{layer}_2',256)

  # Logits stay linear: a ReLU in front of the softmax would clip every negative
  # logit to zero and distort the predicted distribution.
  logits = layers.Dense(size,name=f'out_{layer}')(x)
  output = layers.Activation('softmax',dtype='float32',name=f'p_{layer}')(logits)

  model = tf.keras.Model(m_in,output)
  # `metrics` as a list is accepted by every Keras version.
  model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
  return model, input, output

In [None]:
# NOTE(review): `sizes`, `dTrain`, `dVal` and `stepRange` are not defined in any
# visible cell of this notebook; they must exist in the kernel before this cell runs.
layer = 0
col = 0
n=0


# 0 Design
# `shape` must be a tuple: (11) is just the integer 11.
in_0 = layers.Input(shape=(11,),name='Model_Input')

x = makeLayer(in_0,f'{layer}_0',11)
x = makeLayer(x,f'{layer}_1',256)
x = makeLayer(x,f'{layer}_2',256)

# Logits stay linear: a ReLU in front of the softmax would clip negative logits to zero.
logits = layers.Dense(sizes[layer],name=f'out_{layer}')(x)
output = layers.Activation('softmax',dtype='float32',name=f'p_{layer}')(logits)

# 0 Compile
model = tf.keras.Model(in_0,output)
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

# 0 Fit
# Inputs are shared by every stage; only the label column changes per stage.
tx,ty = dTrain[0], dTrain[1][:,col]
vx,vy = dVal[0], dVal[1][:,col]
model.fit(tx,ty,batch_size=256,epochs=5,validation_data=(vx,vy),callbacks=[early_stopping,TqdmCallback(verbose=1)], verbose=0)
layer += 1
col += 1

# 1+ Generate, Compile, and Fit additional stages
# Each new stage is fed the previous stage's input concatenated with its output.
i,o = in_0, output
for n in range(1,stepRange):
  model, i, o = addStage(in_0,i,o,layer,sizes[layer])
  ty = dTrain[1][:,col]
  vy = dVal[1][:,col]
  model.fit(tx,ty,batch_size=512,epochs=10,validation_data=(vx,vy),callbacks=[early_stopping,TqdmCallback(verbose=1)], verbose=0)
  layer +=1
  col += 1

In [None]:
# Print the layer-by-layer summary of `model`, i.e. the most recently built stage.
model.summary()