In [221]:
import os
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import matplotlib.pylab as plt

from nltk.tokenize import word_tokenize

# Neural network librairies 
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Embedding, SpatialDropout1D, LSTM

In [64]:
artist_names = ["Drake", "Adele", "Lana Del Rey", "Bruno Mars", "Drake"]
nb_artists = len(artist_names)

# Préparation data

In [222]:
# get basic X, Y sets for training + testing
# X is tokenized song lyrics
# Y is the one-hot encoding of the artists
X = []
Y = []

def tokenized_lyrics(lyrics):
    # à voir comment améliorer en supprimant les stopwords + ponctuation
    return word_tokenize(str(lyrics))

for filename in os.listdir('./completed_data'):
    data = pd.read_csv(f'./completed_data/{filename}')
    # ignore macOS files
    if filename == '.DS_Store': continue        
    artist_name = filename.replace('.csv', '')[18:]
    data['CTL2'] = data['Clean Lyrics'].apply(tokenized_lyrics)
    # ensure we have 100 songs for the artist
    # add the raw song lyrics to the X data
    X += (data["CTL2"].values.tolist())
    # one hot encode artist
    for i in range(len(data)):
        one_h = np.zeros(nb_artists) 
        artist_index = artist_names.index(artist_name)
        one_h[artist_index] = 1
        # add the one hot encoding to the Y array
        Y.append(one_h)


# Tokenize the songs in the X vector
# so we are mapping each word to it's key in the big_word_list
word_tokens = {}
for song in X:
    for word in song: 
        if word not in word_tokens.keys():
            if (len(word_tokens.keys()) == 0):
                word_tokens[word] = 1
            else:
                word_tokens[word] = len(word_tokens) + 1
                
token_x = []
for song in X:
    new_song = []
    for word in song:
        new_song.append(word_tokens[word])
    token_x.append(np.array(new_song))
X = token_x

# On enlève les chansons qui ont moins de 100 mots 
cutoff = 100
new_x = []
new_y = []
for i in range(len(X)):
    if (len(X[i]) >= cutoff):
        new_x.append(X[i])
        new_y.append(Y[i])
X = new_x
Y = new_y

# On garde 100 mots par chanson
trimmed = []
for song in X:
    song = song[:100]
    trimmed.append(song)
X = trimmed

# Conversion en numpy array
X = np.array(X)
Y = np.array(Y)

# On vérifie les shapes
print(X.shape)
print(Y.shape)

(543, 100)
(543, 5)


In [223]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(488, 100)
(488, 5)
(55, 100)
(55, 5)


# Modèle

In [224]:
old_model = Sequential([
    Dense(128, input_shape=(100,)),
    Activation('softmax'),
    Dense(1500),
    Activation('relu'),
    Dense(64),
    Activation('softmax'),
    Dense(nb_artists)
])

old_model.compile(optimizer='rmsprop',
              loss='mse',
              metrics=['accuracy'])

old_model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_45 (Dense)            (None, 128)               12928     
                                                                 
 activation_37 (Activation)  (None, 128)               0         
                                                                 
 dense_46 (Dense)            (None, 1500)              193500    
                                                                 
 activation_38 (Activation)  (None, 1500)              0         
                                                                 
 dense_47 (Dense)            (None, 64)                96064     
                                                                 
 activation_39 (Activation)  (None, 64)                0         
                                                                 
 dense_48 (Dense)            (None, 5)               

In [230]:
epochs = 1000
batch_size = 24
old_history = old_model.fit(X_train,
                    y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_split=0.1,
                    verbose=0)

In [231]:
loss, accuracy = old_model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = old_model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.5533
Testing Accuracy:  0.4364
