In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import torchvision
from torchvision import models
from torchvision import datasets, transforms
from torchvision.models import resnet18, ResNet18_Weights

import matplotlib.pyplot as plt
import numpy as np
import random
import pandas as pd
import re
from collections import Counter

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords

> Retrieve the song data from the file provided

In [None]:
# Read the song table; the first CSV row supplies the column names.
songs_df = pd.read_csv("Songs.csv", header=0)

# Preview the first five rows (five is the default for head()).
songs_df.head()

> Each lyrics entry ends with a scraping artifact such as '1EmbedShare URLCopyEmbedCopy', so we strip that trailing text from the lyrics.

In [None]:
# Remove the trailing "<count>EmbedShare URLCopyEmbedCopy" footer from each lyric.
# The counter is matched with `\d*` and trailing whitespace with `\s*` so that a
# footer without a leading number, or one followed by stray spaces/newlines, is
# stripped as well instead of leaking into the vocabulary.
# NOTE(review): presumably some rows carry the footer without a counter — confirm against the data.
songs_df['Lyrics'] = songs_df['Lyrics'].str.replace(
    r'\d*EmbedShare URLCopyEmbedCopy\s*$', "", regex=True
)

> Show all unique artists in the dataset

In [None]:
# Print how many songs each artist has, in order of first appearance.
# The count is computed outside the f-string: reusing the same quote character
# inside an f-string replacement field is a SyntaxError before Python 3.12.
artist_column = songs_df["Artist"]
for artist in artist_column.unique():
    song_count = int((artist_column == artist).sum())
    print(f"{artist}: {song_count} Songs")

> The dataset's size.

In [None]:
# Report the number of rows currently held in the song table.
entry_count = len(songs_df)
print(f'The dataset contains {entry_count} entries for songs.')

> The dataset contains 3 duplicate song titles, so we need to remove the repeated entries.

In [None]:
# Count distinct titles. The value is computed outside the f-string because
# nesting the same quote character inside a replacement field is a SyntaxError
# before Python 3.12. dropna=False keeps a missing title counted as one value,
# matching what len(Series.unique()) reports.
unique_title_count = songs_df["Title"].nunique(dropna=False)
print(f"The dataset contains {unique_title_count} unique songs.")

> Cleanup of duplicates

In [None]:
# Keep the first row for every title and discard any later row with the same title.
# NOTE(review): de-duplication looks at 'Title' only, so same-titled songs by
# different artists would collapse into one row — confirm that is intended.
songs_df = songs_df.drop_duplicates(subset=['Title'], keep='first')

In [None]:
# Print how many songs each artist has, in order of first appearance.
# The count is computed outside the f-string: reusing the same quote character
# inside an f-string replacement field is a SyntaxError before Python 3.12.
artist_column = songs_df["Artist"]
for artist in artist_column.unique():
    song_count = int((artist_column == artist).sum())
    print(f"{artist}: {song_count} Songs")

> Average song length

In [None]:
# Words per song (whitespace-separated). The .str accessors yield NaN for a
# missing lyric instead of raising, and .mean() skips those NaN rows.
lengths = songs_df['Lyrics'].str.split().str.len()
# Average over the songs that have lyrics. Dividing the total by the number of
# *distinct* lyric strings (nunique) mis-scales the mean whenever two rows share
# the same lyrics.
avg_words = lengths.mean()

# Characters per song, averaged the same way.
char_lengths = songs_df['Lyrics'].str.len()
avg_chars = char_lengths.mean()

print(f'The average number of words in a song is {avg_words:.2f}')
print(f'The average number of characters in a song is {avg_chars:.2f}')

> Word Cloud

In [None]:
# Merge every available lyric into one lower-cased string.
all_lyrics = " ".join(songs_df["Lyrics"].dropna()).lower()

# Delete every character that is neither a word character nor whitespace.
all_lyrics = re.sub(r"[^\w\s]", "", all_lyrics)

# Whitespace tokenization followed by per-word frequency counting.
words = all_lyrics.split()
word_counts = Counter(words)

# Report the three most frequent words.
print("The 3 most common words are:")
top_three = word_counts.most_common(3)
for word, count in top_three:
    print(f"{word}: {count} appearances")

# Build the cloud from the frequency table, limited to the 50 most frequent words.
cloud_options = dict(
    width=800,
    height=400,
    background_color="white",
    colormap='plasma',
    max_words=50,
)
wordcloud = WordCloud(**cloud_options).generate_from_frequencies(word_counts)

# Render the cloud without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Lower-case every available lyric before tokenizing.
lyrics = songs_df["Lyrics"].dropna().str.lower()

# Tokenize: each run of word characters is one token and every newline is kept
# as its own token; punctuation is dropped.
token_pattern = re.compile(r"\b\w+\b|\n")
tokenized_lyrics = [token_pattern.findall(lyric) for lyric in lyrics]

# Count tokens song by song; insertion order (first appearance) defines the indices below.
word_counts = Counter()
for song in tokenized_lyrics:
    word_counts.update(song)

# Index 0 is left unused by the vocabulary so it can serve as padding.
word_to_index = {word: position for position, word in enumerate(word_counts, start=1)}
index_to_word = {position: word for word, position in word_to_index.items()}

# Encode each song as a list of vocabulary indices (every token is in the vocabulary by construction).
encoded_sequences = [[word_to_index[word] for word in song] for song in tokenized_lyrics]

# One extra slot accounts for the padding index 0.
vocab_size = 1 + len(word_to_index)

print(f"Vocabulary Size: {vocab_size}")

In [None]:
# Number of context tokens the model sees for each prediction.
SEQ_LENGTH = 10

# Create input-target pairs: for every position i >= 1 in a song, the target is
# token i and the input is the (up to) SEQ_LENGTH tokens that precede it.
input_sequences = []
targets = []

for song in encoded_sequences:
    for i in range(1, len(song)):
        # Slice only the window that survives padding/truncation. Storing the
        # whole prefix song[:i] for every i costs memory quadratic in the song
        # length, although only its last SEQ_LENGTH tokens are ever used.
        input_sequences.append(song[max(0, i - SEQ_LENGTH):i])
        targets.append(song[i])

# Left-pad short windows with the padding index 0 so every row has the same length.
max_seq_length = SEQ_LENGTH  # Fixed sequence length
padded_sequences = [[0] * (max_seq_length - len(seq)) + seq for seq in input_sequences]

# Convert to tensors
X = torch.tensor(padded_sequences, dtype=torch.long)
y = torch.tensor(targets, dtype=torch.long)

# PyTorch Dataset
class SongLyricsDataset(Dataset):
    """Pairs each input sequence in ``X`` with the target at the same position in ``y``."""

    def __init__(self, X, y):
        # Stored as given; no copy or conversion is performed.
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from the length of ``X``."""
        sample_count = len(self.X)
        return sample_count

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label

# Wrap the tensors in the dataset and serve them in shuffled mini-batches.
BATCH_SIZE = 64
dataset = SongLyricsDataset(X, y)
data_loader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)

sample_count = len(dataset)
print(f"Dataset Size: {sample_count}")

In [None]:
class LyricsLSTM(nn.Module):
    """Embedding -> stacked LSTM -> linear layer scoring every vocabulary entry.

    Args:
        vocab_size: Number of rows in the embedding table and number of output scores.
        embedding_dim: Size of each word embedding.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim=256, hidden_dim=256, num_layers=4):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Map a batch of index sequences to one score vector per sequence."""
        embedded = self.embedding(x)
        sequence_out, _state = self.lstm(embedded)
        # Only the output at the final time step feeds the classifier.
        final_step = sequence_out[:, -1]
        return self.fc(final_step)

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Network, loss and optimizer.
model = LyricsLSTM(vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Move the parameters to the chosen device; as the last expression this also
# displays the module summary in the notebook.
model.to(device)

In [None]:
EPOCHS = 25

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0

    for features, labels in data_loader:
        features = features.to(device)
        labels = labels.to(device)

        # Standard step: clear gradients, forward, backward, update.
        optimizer.zero_grad()
        loss = criterion(model(features), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    mean_loss = running_loss / len(data_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {mean_loss:.4f}")


In [None]:
def LyricsGenerator(starting_string, model, word_to_index, index_to_word, max_words=None, seq_length=None):
    """Greedily extend a seed string with words predicted by ``model``.

    Args:
        starting_string: Seed text; it is lower-cased before use. When it is
            empty or whitespace-only, a random vocabulary word is used instead.
        model: Module called with a ``(1, seq_length)`` long tensor of word
            indices; the arg-max over dim 1 of its output is the next word index.
        word_to_index: Mapping from word to vocabulary index. Unknown words map to 0.
        index_to_word: Mapping from vocabulary index to word. Unknown indices
            are rendered as ``"<UNK>"``.
        max_words: Number of words to generate (truncated to ``int``). ``None``
            resolves to the notebook-level ``avg_words`` at call time, so a
            recomputed average is picked up without redefining this function.
        seq_length: Length of the model's input window. ``None`` resolves to
            the notebook-level ``SEQ_LENGTH``.

    Returns:
        The lower-cased seed words followed by the generated words, joined by spaces.

    Raises:
        ValueError: If the resolved ``seq_length`` is smaller than 1.
    """
    model.eval()

    if max_words is None:
        max_words = avg_words
    if seq_length is None:
        seq_length = SEQ_LENGTH
    max_words = int(max_words)
    seq_length = int(seq_length)
    if seq_length < 1:
        raise ValueError("seq_length must be at least 1")

    # Run on whatever device the model's parameters live on instead of relying
    # on a global; parameter-free modules fall back to the CPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    if not starting_string.strip():
        starting_word = random.choice(list(word_to_index.keys()))
        words = [starting_word]
        context_tokens = [starting_word]
    else:
        lowered = starting_string.lower()
        # Words echoed back in the result keep the caller's spelling.
        words = lowered.split()
        # The model context uses the same tokenization as the training data,
        # so e.g. "it's" becomes "it", "s" rather than one unknown token.
        context_tokens = re.findall(r"\b\w+\b|\n", lowered)

    context_ids = [word_to_index.get(token, 0) for token in context_tokens]

    for _ in range(max_words):
        # Last seq_length indices, left-padded with 0 exactly like the training inputs.
        window = context_ids[-seq_length:]
        window = [0] * (seq_length - len(window)) + window
        input_tensor = torch.tensor([window], dtype=torch.long, device=model_device)

        # Predict next word (greedy arg-max).
        with torch.no_grad():
            output = model(input_tensor)
        predicted_index = torch.argmax(output, dim=1).item()

        # Convert index to word and feed it back into the context.
        next_word = index_to_word.get(predicted_index, "<UNK>")
        words.append(next_word)
        context_ids.append(word_to_index.get(next_word, 0))

    return " ".join(words)


In [None]:
# Generate lyrics from a multi-word seed phrase.
seed = "it's not a silly little moment"
generated_text = LyricsGenerator(
    starting_string=seed,
    model=model,
    word_to_index=word_to_index,
    index_to_word=index_to_word,
)
print("\nGenerated Lyrics:\n", generated_text)

In [None]:
# Generate lyrics from a seed that contains upper-case letters.
seed = "There is a house"
generated_text = LyricsGenerator(
    starting_string=seed,
    model=model,
    word_to_index=word_to_index,
    index_to_word=index_to_word,
)
print("\nGenerated Lyrics:\n", generated_text)

In [None]:
# Generate lyrics from an empty seed; the generator then starts from a random vocabulary word.
seed = ""
generated_text = LyricsGenerator(
    starting_string=seed,
    model=model,
    word_to_index=word_to_index,
    index_to_word=index_to_word,
)
print("\nGenerated Lyrics:\n", generated_text)