<a href="https://colab.research.google.com/github/oanders6/AIT-Budapest-Deep-Learning/blob/main/DeepLearningProjectMilestone2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate Lyrics

## **PART 1: Load Data**

Get data from Kaggle Datasource

In [1]:
! pip install kaggle



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
! mkdir ~/.kaggle

In [4]:
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json

In [5]:
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
!kaggle datasets download -d juicobowley/drake-lyrics

Downloading drake-lyrics.zip to /content
  0% 0.00/764k [00:00<?, ?B/s]
100% 764k/764k [00:00<00:00, 121MB/s]


In [7]:
!kaggle datasets download -d suraj520/music-dataset-song-information-and-lyrics

Downloading music-dataset-song-information-and-lyrics.zip to /content
  0% 0.00/1.90M [00:00<?, ?B/s]
100% 1.90M/1.90M [00:00<00:00, 126MB/s]


In [8]:
! unzip drake-lyrics.zip

Archive:  drake-lyrics.zip
  inflating: drake_data.csv          
  inflating: drake_data.json         
  inflating: drake_lyrics.txt        


In [9]:
! unzip music-dataset-song-information-and-lyrics

Archive:  music-dataset-song-information-and-lyrics.zip
  inflating: songs.csv               


Parse the lyrics to get a list of words

In [10]:
import re
import nltk
import pandas as pd

# NOTE:
# need to download the 'punkt' and 'stopwords' depndencies from nltk, used by
# tokenize and stopworks respectively:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
other_dataset = pd.read_csv('/content/songs.csv')
drake_dataset = pd.read_csv('/content/drake_data.csv')

In [12]:
drake_lyrics = drake_dataset.lyrics

Other artists

In [13]:
def filter_lyrics_with_title_format(title, lyrics2):
    # Extract the first line of lyrics
    first_line = lyrics2.split('\n')[0]

    # Apply the regex pattern to the first line of lyrics
    if re.search(fr'\b{re.escape(title)}\b', lyrics2, flags=re.IGNORECASE):
        return True
    else:
        return False

def remove_first_line(text):
    lines = text.split('\n')

    # Join the lines, excluding the first one
    new_text = '\n'.join(lines[1:])

    return new_text
def filter_dataset_with_title_format(dataset):
    # Initialize a list to store filtered rows
    filtered_rows = []

    # Iterate over each row of the dataset
    for index, row in dataset.iterrows():
        # Extract title and lyrics from the current row
        title = row['Name']
        lyrics2 = row['Lyrics']

        # Check if the lyrics match the title format
        if filter_lyrics_with_title_format(title, lyrics2):
            # If match found, add the row to filtered rows
            filtered_rows.append(row)

    # Create a new DataFrame with the filtered rows
    filtered_dataset = pd.DataFrame(filtered_rows)

    return filtered_dataset

Drake Lyrics

In [14]:
def preprocess_text(text):
  if isinstance(text, str):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove anything in brackets or parenthesis
    text = re.sub("[\(\[].*?[\)\]]", "", text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize the text
    tokens = nltk.tokenize.word_tokenize(text)

    # Remove stop words
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    return tokens

  else:
    return ''

In [15]:
lyrics_only = filter_dataset_with_title_format(other_dataset)

lyrics_only['Lyrics'] = lyrics_only['Lyrics'].apply(remove_first_line)

lyrics_only.head()

lyrics_only['Lyrics'] = lyrics_only['Lyrics'].apply(preprocess_text)

lyrics2 = []
for i in lyrics_only['Lyrics']:
  lyrics2 += [i]

print(lyrics2[1])

['sweet', 'lord', 'mmm', 'lord', 'mmm', 'lord', 'really', 'wan', 'na', 'see', 'really', 'wan', 'na', 'really', 'wan', 'na', 'see', 'lord', 'takes', 'long', 'lord', 'sweet', 'lord', 'mmm', 'lord', 'mmm', 'lord', 'really', 'wan', 'na', 'know', 'id', 'really', 'wan', 'na', 'go', 'really', 'wan', 'na', 'show', 'lord', 'wont', 'take', 'long', 'lord', 'sweet', 'lord', 'mmm', 'lord', 'sweet', 'lord', 'might', 'also', 'like', 'really', 'wan', 'na', 'see', 'really', 'wan', 'na', 'see', 'really', 'wan', 'na', 'see', 'lord', 'really', 'wan', 'na', 'see', 'lord', 'takes', 'long', 'lord', 'sweet', 'lord', 'mmm', 'lord', 'lord', 'really', 'wan', 'na', 'know', 'really', 'wan', 'na', 'go', 'really', 'wan', 'na', 'show', 'lord', 'wont', 'take', 'long', 'lord', 'mmmmm', 'sweet', 'lord', 'lord', 'mmm', 'lord', 'lord', 'oh', 'oh', 'sweet', 'lord', 'ooh', 'really', 'wan', 'na', 'see', 'really', 'wan', 'na', 'really', 'wan', 'na', 'see', 'lord', 'takes', 'long', 'lord', 'mmm', 'lord', 'lord', 'sweet', 'lord

In [16]:
# Example usage

processed_lyrics = preprocess_text(drake_lyrics[0])
processed_lyrics

['put',
 'feelings',
 'ice',
 'always',
 'gem',
 'certified',
 'lover',
 'boy',
 'somehow',
 'still',
 'heartless',
 'heart',
 'gettin',
 'colder']

The first song's processed lyrics without stop words

Now we need to create a list of all of the lyrics and tokenize them

In [17]:
end_of_song_marker = ['ENDOFSONG']
tokens = []
for i in range(0, len(drake_lyrics)):
  if i != 213:
    tokens += preprocess_text(drake_lyrics[i])
    tokens += end_of_song_marker

tokens2 = []
for i in range(0, len(lyrics2)):
  tokens2 += lyrics2[i]
  tokens2 += end_of_song_marker

In [18]:
from keras.preprocessing.text import Tokenizer

# Join the tokens back into a single string
text = ' '.join(tokens)
text2 = ' '.join(tokens2)

# Initialize tokenizer
tokenizer = Tokenizer()
tokenizer2 = Tokenizer()

# Fit tokenizer on the text
tokenizer.fit_on_texts([text])
tokenizer2.fit_on_texts([text2])

# Tokenize the list of tokens
tokenized_sequence = tokenizer.texts_to_sequences([tokens])[0]
tokenized_sequence2 = tokenizer2.texts_to_sequences([tokens2])[0]

In [19]:
print(len(tokens), len(tokenized_sequence), len(tokens2), len(tokenized_sequence2))

76128 76128 102142 102142


Now we will create sequences from each lyric to the next, stopped by the end of the song

In [20]:
import numpy as np
from sklearn.model_selection import train_test_split

def split_data(tokens, tokenized_sequence):
  sequences = []

  for i in range (0, len(tokenized_sequence) - 3):
      curr  = tokens[i]
      next1 = tokens[i+1]
      next2 = tokens[i+2]
      next3 = tokens[i+3]
      # If neither token is the end-of-song marker, add it to the current sequence
      if curr != 'ENDOFSONG' and next1 != 'ENDOFSONG' and next2 != 'ENDOFSONG' and next3 != 'ENDOFSONG':
          sequences.append([tokenized_sequence[i], tokenized_sequence[i+1], tokenized_sequence[i+2], tokenized_sequence[i+3]])


  # Convert sequences to numpy array
  sequences = np.array(sequences)

  # Split sequences into input (X) and output (Y)
  X = sequences[:, :-1]
  Y = sequences[:, -1]

  # Split the dataset into training (80%), validation (10%), and test (10%) sets
  X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.2)
  X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5)

  return X_train, X_val, X_test, Y_train, Y_val, Y_test

In [21]:
X_train_drake, X_val_drake, X_test_drake, Y_train_drake, Y_val_drake, Y_test_drake = split_data(tokens, tokenized_sequence)

In [22]:
X_train_other, X_val_other, X_test_other, Y_train_other, Y_val_other, Y_test_other = split_data(tokens2, tokenized_sequence2)

In [23]:
print(len(X_train_other), len(X_val_other), len(X_test_other))
print(len(Y_train_other), len(Y_val_other), len(Y_test_other))

80174 10022 10022
80174 10022 10022


In [24]:
print(len(X_train_drake), len(X_val_drake), len(X_test_drake))
print(len(Y_train_drake), len(Y_val_drake), len(Y_test_drake))

59982 7498 7498
59982 7498 7498


In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding

# Define the LSTM model
def create_model(vocabulary_size, seq_length):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 50, input_length=seq_length))
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(vocabulary_size, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

# Train the LSTM model
def train_model(X_train, Y_train, X_val, Y_val, vocab_size):
    model = create_model(vocab_size, X_train.shape[1])
    model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=10, batch_size=300)
    return model

# Evaluate the trained model on the test set
def evaluate_model(model, X_test, Y_test):
    loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
    print('Test Accuracy: %.2f%%' % (accuracy*100))

# For Drake's lyrics
model_drake = train_model(X_train_drake, Y_train_drake, X_val_drake, Y_val_drake, len(tokenizer.word_index)+1)
evaluate_model(model_drake, X_test_drake, Y_test_drake)

# For other dataset lyrics
model_other = train_model(X_train_other, Y_train_other, X_val_other, Y_val_other, len(tokenizer2.word_index)+1)
evaluate_model(model_other, X_test_other, Y_test_other)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 50)             424150    
                                                                 
 lstm (LSTM)                 (None, 3, 100)            60400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 8483)              856783    
                                                                 
Total params: 1431833 (5.46 MB)
Trainable params: 1431833 (5.46 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2