This repository has been archived by the owner on Feb 23, 2023. It is now read-only.
/
03_autoencoding_and_tsne.py
375 lines (322 loc) · 16 KB
/
03_autoencoding_and_tsne.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
'''
This script trains an autoencoder on every csv file containing the Mel
spectrogram data for a list of songs. The learned latent features are then
used to cluster the songs with t-SNE
Artist information will be used to color the clusters to see how accurate
clustering is.
'''
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input
from keras.layers import Conv2D, MaxPooling2D, UpSampling2D
from keras.optimizers import Adadelta
from keras.callbacks import Callback, ModelCheckpoint
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
#Set seed for repeatability
seed = 14
np.random.seed(seed)
'''
-------------------------------------------------------------------------------
--------------------Convolutional Autoencoder Indv Model-----------------------
-------------------------------------------------------------------------------
'''
#Specify the file directory for the spectrogram csv files
filedir = '/home/nick/Documents/Python Scripts/Music Recommendation with Deep Learning/Audio Files/Spectrograms/'
os.chdir(filedir)
def autoencode(fileNbr):
'''
This function loads a csv file with the Mel spectrogram image data,
then creates a convolutional autoencoder to learn latent fatures about the image.
The output is the encoded image, reshaped to be 1 row.
'''
#Load data as array, noting that the log amplitude must be taken to scale the values
spec = librosa.logamplitude(np.loadtxt(str(i) + '.csv', delimiter=','), ref_power=np.max)
#Inspect the spectrogram. x-axis is time in frames and y-axis is frequency (Hz) upside down
plt.imshow(spec)
x_train = spec.astype('float32') / 255.
x_train = np.reshape(x_train, (1, 512, 2584, 1))
#Test data will be the same as training data
input_img = Input(shape=(512, 2584, 1))
#Build the endoder piece
encoded = Conv2D(nb_filter=16, nb_row=3, nb_col=3, input_shape=(512, 2584, 1), activation='relu', border_mode='same')(input_img)
encoded = MaxPooling2D((2, 2), border_mode='same')(encoded)
encoded = Conv2D(nb_filter=8, nb_row=3, nb_col=3, activation='relu', border_mode='same')(encoded)
encoded = MaxPooling2D((2, 2), border_mode='same')(encoded)
encoded = Conv2D(nb_filter=8, nb_row=3, nb_col=3, activation='relu', border_mode='same')(encoded)
encoded = MaxPooling2D((2, 2), border_mode='same')(encoded)
encoder = Model(input=input_img, output=encoded)
#encoded_imgs = encoder.predict(x_train)
#plt.imshow(encoded_imgs[0].reshape(encoded_imgs[0].shape[0], encoded_imgs[0].shape[1]*encoded_imgs[0].shape[2]).T)
#Build the decoder piece
decoded = Conv2D(nb_filter=8, nb_row=3, nb_col=3, input_shape=(4, 4, 8), activation='relu', border_mode='same')(encoded)
decoded = UpSampling2D((2, 2))(decoded)
decoded = Conv2D(nb_filter=8, nb_row=3, nb_col=3, activation='relu', border_mode='same')(decoded)
decoded = UpSampling2D((2, 2))(decoded)
decoded = Conv2D(nb_filter=16, nb_row=3, nb_col=3, activation='relu', border_mode='same')(decoded)
decoded = UpSampling2D((2, 2))(decoded)
decoded = Conv2D(nb_filter=1, nb_row=3, nb_col=3, activation='sigmoid', border_mode='same')(decoded)
#decoder = Model(input=encoder.input, output=decoded)
#decoded_imgs = decoder.predict(x_train)
#plt.imshow(decoded_imgs[0].reshape(decoded_imgs[0].shape[0], decoded_imgs[0].shape[1]))
#Create learning rate schedule and add it to the optimizer of choice
learning_rate = 1.0
epochs = 50
decay_rate = learning_rate / epochs
#rho and epsilon left at defaults
adadelta = Adadelta(lr=learning_rate, rho=0.95, epsilon=1e-08, decay=decay_rate)
#Create a custom callback to stop training the autoencoder when a val_loss of 0.1 is reached
class EarlyStoppingByLossVal(Callback):
def __init__(self, monitor='val_loss', value=0.1, verbose=0):
super(Callback, self).__init__()
self.monitor = monitor
self.value = value
self.verbose = verbose
def on_epoch_end(self, epoch, logs={}):
current = logs.get(self.monitor)
if current < self.value:
if self.verbose > 0:
print("Epoch %05d: early stopping THR" % epoch)
self.model.stop_training = True
#modelfilepath = "/home/nick/Documents/Python Scripts/Music Recommendation with Deep Learning/Models/" + str(i) + "autoencoder-{epoch:02d}-{val_loss:.2f}.hdf5"
modelfilepath = "/home/nick/Documents/Python Scripts/Music Recommendation with Deep Learning/Models/" + str(i) + "autoencoder-best_fit.hdf5"
callbacks = [
EarlyStoppingByLossVal(monitor='val_loss', value=0.1, verbose=1),
#Save the best model to a hdf5 file located at modelfilepath
ModelCheckpoint(modelfilepath, monitor='val_loss', save_best_only=True, verbose=0),
]
#Build the full autoencoder
autoencoder = Model(input_img, decoded)
autoencoder.compile(optimizer=adadelta, loss='binary_crossentropy')
autoencoder.summary()
autoencoder.fit(x_train, x_train, nb_epoch=epochs, batch_size=128, shuffle=True,
validation_data=(x_train, x_train), callbacks=callbacks)
#Reset the decoded images to be the result of the trained autoencoder
autoencoded_imgs = librosa.logamplitude(autoencoder.predict(x_train), ref_power=np.max)
plt.imshow(autoencoded_imgs[0].reshape(512, 2584))
plt.imshow(x_train.reshape(512, 2584))
#Store endoded image data for comparison to others. (Encoder wieghts have been tuned now)
encoded_imgs = encoder.predict(x_train)
plt.imshow(encoded_imgs[0].reshape(encoded_imgs[0].shape[0], encoded_imgs[0].shape[1]*encoded_imgs[0].shape[2]).T)
#Reshape each encoded image to be 1 row by 165,376 columns
print(encoded_imgs.shape)
print(encoded_imgs.reshape(encoded_imgs.shape[1], encoded_imgs.shape[2]*encoded_imgs.shape[3]).reshape(1, encoded_imgs.shape[1]*encoded_imgs.shape[2]*encoded_imgs.shape[3]).shape)
enc_rs = encoded_imgs.reshape(encoded_imgs.shape[1], encoded_imgs.shape[2]*encoded_imgs.shape[3]).reshape(1, encoded_imgs.shape[1]*encoded_imgs.shape[2]*encoded_imgs.shape[3])
return enc_rs[0]
#Build an autoencoder for every file in the directory (there are 28 files)
enc_list = []
for i in range(1, 29):
enc_list.append(autoencode(i))
x = np.matrix(enc_list)
print(x.shape)
#Specify the song ID as the label y
y = np.array(range(1, 29))
'''
-------------------------------------------------------------------------------
------------------------Plotting Clusters Indv Model---------------------------
-------------------------------------------------------------------------------
'''
#Build TSNE model
tsne_model = TSNE(n_components=2, perplexity=2.0, learning_rate=100.0,
n_iter=5000, n_iter_without_progress=60,
random_state=seed, method='barnes_hut')
tsne_dim = tsne_model.fit_transform(x)
#Plot first 2 extracted features and the observation class
plt.figure(figsize=(10, 5))
plt.xlabel('Latent Variable x')
plt.ylabel('Latent Variable y')
plt.title('t-SNE 2-Dimension Plot with Observation Class')
#plt.axis([-0.0003, 0.0003, -0.0003, 0.0003])
plt.scatter(tsne_dim[:, 0], tsne_dim[:, 1], c=y, s=50)
plt.colorbar()
labels = ['{0}'.format(i) for i in y]
for label, xc, yc in zip(labels, tsne_dim[:, 0], tsne_dim[:, 1]):
plt.annotate(
label,
xy=(xc, yc), xytext=(-0, 0),
textcoords='offset points', ha='right', va='bottom')
plt.show()
#Build K-Means clusters with k=8, based on looking at t-SNE results
km_model = KMeans(n_clusters=8)
km_results = km_model.fit(tsne_dim)
#Plot the clusters
plt.figure(figsize=(10,5))
plt.scatter(tsne_dim[:,0], tsne_dim[:,1], c=km_model.labels_, s=50)
plt.title('K Mean Clustering of t-SNE Reduced Dimensions')
for label, xc, yc in zip(labels, tsne_dim[:, 0], tsne_dim[:, 1]):
plt.annotate(
label,
xy=(xc, yc), xytext=(-0, 0),
textcoords='offset points', ha='right', va='bottom')
plt.show()
'''
-------------------------------------------------------------------------------
-----------------------Convolutional Autoencoder-------------------------------
-------------------------------------------------------------------------------
'''
#Specify the file directory for the spectrogram csv files
filedir = '/home/nick/Documents/Python Scripts/Music Recommendation with Deep Learning/Audio Files/Spectrograms/'
os.chdir(filedir)
def readFile(filenbr):
#Load data as array, noting that the log amplitude must be taken to scale the values
spec = librosa.logamplitude(np.loadtxt(str(filenbr) + '.csv', delimiter=','), ref_power=np.max)
x_train = spec.astype('float32') / 255.
x_train = np.reshape(x_train, (512, 2584, 1))
#Test data will be the same as training data
return x_train
x_train = []
for i in range(1, 19):
x_train.append(readFile(i))
x_train = np.array(x_train)
print(x_train.shape)
#Inspect a couple spectrograms (x-axis is time in frames and y-axis is freq upside down)
plt.title("Mel Spectrogram")
plt.xlabel("Time in Frames")
plt.ylabel("Mel-scaled Frequency (Hz)")
plt.imshow(x_train[0].reshape(512, 2584))
plt.title("Mel Spectrogram")
plt.xlabel("Time in Frames")
plt.ylabel("Mel-scaled Frequency (Hz)")
plt.imshow(x_train[1].reshape(512, 2584))
input_img = Input(shape=(512, 2584, 1))
#Build the endoder piece
encoded = Conv2D(nb_filter=16, nb_row=3, nb_col=3, input_shape=(512, 2584, 1), activation='relu', border_mode='same')(input_img)
encoded = MaxPooling2D((2, 2), border_mode='same')(encoded)
encoded = Conv2D(nb_filter=8, nb_row=3, nb_col=3, activation='relu', border_mode='same')(encoded)
encoded = MaxPooling2D((2, 2), border_mode='same')(encoded)
encoded = Conv2D(nb_filter=8, nb_row=3, nb_col=3, activation='relu', border_mode='same')(encoded)
encoded = MaxPooling2D((2, 2), border_mode='same')(encoded)
encoder = Model(input=input_img, output=encoded)
#encoded_imgs = encoder.predict(x_train)
#plt.imshow(encoded_imgs[0].reshape(encoded_imgs[0].shape[0], encoded_imgs[0].shape[1]*encoded_imgs[0].shape[2]).T)
#Build the decoder piece
decoded = Conv2D(nb_filter=8, nb_row=3, nb_col=3, input_shape=(4, 4, 8), activation='relu', border_mode='same')(encoded)
decoded = UpSampling2D((2, 2))(decoded)
decoded = Conv2D(nb_filter=8, nb_row=3, nb_col=3, activation='relu', border_mode='same')(decoded)
decoded = UpSampling2D((2, 2))(decoded)
decoded = Conv2D(nb_filter=16, nb_row=3, nb_col=3, activation='relu', border_mode='same')(decoded)
decoded = UpSampling2D((2, 2))(decoded)
decoded = Conv2D(nb_filter=1, nb_row=3, nb_col=3, activation='sigmoid', border_mode='same')(decoded)
#decoder = Model(input=encoder.input, output=decoded)
#decoded_imgs = decoder.predict(x_train)
#plt.imshow(decoded_imgs[0].reshape(decoded_imgs[0].shape[0], decoded_imgs[0].shape[1]))
#Create learning rate schedule and add it to the optimizer of choice
learning_rate = 1.0
epochs = 50
decay_rate = learning_rate / epochs
#rho and epsilon left at defaults
adadelta = Adadelta(lr=learning_rate, rho=0.95, epsilon=1e-08, decay=decay_rate)
#Create a custom callback to stop training the autoencoder when a val_loss of 0.1 is reached
class EarlyStoppingByLossVal(Callback):
def __init__(self, monitor='val_loss', value=0.1, verbose=0):
super(Callback, self).__init__()
self.monitor = monitor
self.value = value
self.verbose = verbose
def on_epoch_end(self, epoch, logs={}):
current = logs.get(self.monitor)
if current < self.value:
if self.verbose > 0:
print("Epoch %05d: early stopping THR" % epoch)
self.model.stop_training = True
modelfilepath = "/home/nick/Documents/Python Scripts/Music Recommendation with Deep Learning/Models/combined-data-autoencoder-best_fit.hdf5"
callbacks = [
EarlyStoppingByLossVal(monitor='val_loss', value=0.1, verbose=1),
#Save the best model to a hdf5 file located at modelfilepath
ModelCheckpoint(modelfilepath, monitor='val_loss', save_best_only=True, verbose=0),
]
#Build the full autoencoder
autoencoder = Model(input_img, decoded)
autoencoder.compile(optimizer=adadelta, loss='binary_crossentropy')
autoencoder.summary()
autoencoder.fit(x_train, x_train, nb_epoch=epochs, batch_size=128, shuffle=True,
validation_data=(x_train, x_train), callbacks=callbacks)
#Reset the decoded images to be the result of the trained autoencoder
autoencoded_imgs = librosa.logamplitude(autoencoder.predict(x_train), ref_power=np.max)
plt.imshow(autoencoded_imgs[0].reshape(512, 2584))
plt.imshow(x_train[0].reshape(512, 2584))
#Store endoded image data for comparison to others. (Encoder wieghts have been tuned now)
encoded_imgs = encoder.predict(x_train)
plt.imshow(encoded_imgs[0].reshape(encoded_imgs[0].shape[0], encoded_imgs[0].shape[1]*encoded_imgs[0].shape[2]).T)
#Reshape each encoded images to be 18 rows by 64*323*8 columns
print(encoded_imgs.shape)
enc_rs = []
for ei in encoded_imgs:
enc_rs.append(ei.reshape(1, ei.shape[0]*ei.shape[1]*ei.shape[2]))
x = np.array(enc_rs)
#Specify the song ID as the label y
y = np.array(range(1, 19))
'''
-------------------------------------------------------------------------------
----------------------------Plotting Clusters---------------------------------
-------------------------------------------------------------------------------
'''
#Build TSNE model
tsne_model = TSNE(n_components=2, perplexity=2.0, learning_rate=100.0,
n_iter=5000, n_iter_without_progress=60,
random_state=seed, method='barnes_hut')
tsne_dim = tsne_model.fit_transform(np.matrix(x))
#Plot first 2 extracted features and the observation class
plt.figure(figsize=(10, 5))
plt.xlabel('Latent Variable x')
plt.ylabel('Latent Variable y')
plt.title('t-SNE 2-Dimension Plot with Observation Class')
#plt.axis([-0.0003, 0.0003, -0.0003, 0.0003])
plt.scatter(tsne_dim[:, 0], tsne_dim[:, 1], c=y, s=50)
plt.colorbar()
labels = ['{0}'.format(i) for i in y]
for label, xc, yc in zip(labels, tsne_dim[:, 0], tsne_dim[:, 1]):
plt.annotate(
label,
xy=(xc, yc), xytext=(-0, 0),
textcoords='offset points', ha='right', va='bottom')
plt.show()
#Build K-Means clusters with k=8, based on looking at t-SNE results
km_model = KMeans(n_clusters=8)
km_results = km_model.fit(tsne_dim)
#Plot the clusters
plt.figure(figsize=(10,5))
plt.scatter(tsne_dim[:,0], tsne_dim[:,1], c=km_model.labels_, s=50)
plt.title('K Mean Clustering of t-SNE Reduced Dimensions')
for label, xc, yc in zip(labels, tsne_dim[:, 0], tsne_dim[:, 1]):
plt.annotate(
label,
xy=(xc, yc), xytext=(-0, 0),
textcoords='offset points', ha='right', va='bottom')
plt.show()
#Perform hierarchical agglomerative clustering
hc_model = linkage(tsne_dim, method='ward', metric='euclidean')
#Manually adding these since there aren't many
dendrogram_labels = ["Ain't No Grave - Johnny Cash",
"Danse Macabre - Camille Saint-Saens",
"Gimme Something Good - Ryan Adams",
"Mine - Taylor Swift",
"Sympohny No.3, Op.78 - Camille Saint-Saens",
"Roar - Katy Perry",
"Somewhat Damaged - Nine Inch Nails",
"Yesterday - The Beatles",
"Hurt - Johnny Cash",
"Shakermaker - Oasis",
"All My Loving - The Beatles",
"Back To December - Taylor Swift",
"Live Forever - Oasis",
"Wonderwall - Oasis",
"Don't Look Back In Anger - Oasis",
"Firework - Katy Perry",
"Little James - Oasis",
"Every Day Is Exactly The Same - Nine Inch Nails"]
#Plot the full dendrogram
plt.figure(figsize=(12, 5))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Song')
plt.ylabel('Distance')
dendrogram(hc_model,
leaf_rotation=90., #Rotate the x-axis labels for ease of reading
leaf_font_size=10., #Specify font size of x-axis labels
labels=dendrogram_labels
)
plt.show()