In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import os
from sklearn.model_selection import train_test_split

In [2]:
torch.manual_seed(42)
np.random.seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [6]:
df = pd.read_csv('combined_artists_tokenized.csv')

In [7]:
df.head(2)

Unnamed: 0,Artist,Title,Album,Year,Date,Lyric,Tokenized_Lyrics
0,Dua Lipa,New Rules,Dua Lipa,2017.0,2017-06-02,one one one one one talkin' in my sleep at n...,"[505, 530, 530, 530, 530, 220, 220, 1561, 259,..."
1,Dua Lipa,Don’t Start Now,Future Nostalgia,2019.0,2019-11-01,if you don't wanna see me did a full 80 craz...,"[361, 345, 836, 470, 18869, 766, 502, 220, 220..."


In [8]:
def convert_tokens_str_to_list(tokens_str):
    if pd.isna(tokens_str) or tokens_str == '[]':
        return []
    try:
        tokens_str = tokens_str.strip('[]')
        if not tokens_str:
            return []
        return [int(token) for token in tokens_str.split(',')]
    except:
        return []

In [9]:
print("Converting tokenized strings to token lists...")
df['Tokenized_Lyrics'] = df['Tokenized_Lyrics'].apply(convert_tokens_str_to_list)

Converting tokenized strings to token lists...


In [10]:
type(df['Tokenized_Lyrics'].iloc[1])

list

In [35]:
class LyricsDataset(Dataset):
    def __init__(self, token_lists, max_length=1024):
        self.token_lists = token_lists
        self.max_length = max_length

    def __len__(self):
        return len(self.token_lists)

    def __getitem__(self, idx):
        tokens = self.token_lists[idx]

        if len(tokens) > self.max_length:
            tokens = tokens[:self.max_length]
        attention_mask = [1] * len(tokens)

        padding_length = self.max_length - len(tokens)
        if padding_length > 0:
            tokens = tokens + [0] * padding_length  # 0 is usually the pad token ID
            attention_mask = attention_mask + [0] * padding_length

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        mask_tensor = torch.tensor(attention_mask, dtype=torch.long)

        return {"input_ids": tokens_tensor, "attention_mask": mask_tensor}

In [36]:
train_data, val_data = train_test_split(df['Tokenized_Lyrics'].tolist(), test_size=0.1, random_state=42)
print(f"Training set: {len(train_data)} songs")
print(f"Validation set: {len(val_data)} songs")

Training set: 5424 songs
Validation set: 603 songs


In [37]:
for i in range(50):
  print(len(train_data[i]))

124
558
440
263
529
984
21
119
532
759
401
19
1024
294
406
47
448
120
398
303
411
577
549
605
190
394
538
646
46
69
811
924
3
85
644
3
472
760
220
288
496
19
780
413
340
19
648
203
374
525


In [38]:
train_dataset = LyricsDataset(train_data)
val_dataset = LyricsDataset(val_data)

In [39]:
batch_size = 4 
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [40]:
model_name = "gpt2" 
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [41]:
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_dataloader) * 3  # 3 epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [43]:
def train_epoch(model, dataloader, optimizer, scheduler):
    model.train()
    total_loss = 0

    progress_bar = tqdm(dataloader, desc="Training")
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix({"loss": loss.item()})

    return total_loss / len(dataloader)

In [44]:
def evaluate(model, dataloader):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss
            total_loss += loss.item()

    return total_loss / len(dataloader)

In [45]:
output_dir = "fine_tuned_gpt2_lyrics"
os.makedirs(output_dir, exist_ok=True)

In [46]:
num_epochs = 3
best_val_loss = float('inf')

In [47]:
print("Starting training...")
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # Train
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler)
    print(f"Train loss: {train_loss:.4f}")

    # Evaluate
    val_loss = evaluate(model, val_dataloader)
    print(f"Validation loss: {val_loss:.4f}")

    # Save model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Save model
        model_path = os.path.join(output_dir, f"model_epoch_{epoch+1}")
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
        print(f"Model saved to {model_path}")

print("Training complete!")

Starting training...
Epoch 1/3


Training:   0%|          | 0/1356 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Training: 100%|██████████| 1356/1356 [31:40<00:00,  1.40s/it, loss=1.23]


Train loss: 1.2754


Evaluating: 100%|██████████| 151/151 [00:58<00:00,  2.58it/s]


Validation loss: 1.1244
Model saved to fine_tuned_gpt2_lyrics/model_epoch_1
Epoch 2/3


Training: 100%|██████████| 1356/1356 [31:50<00:00,  1.41s/it, loss=1.8]


Train loss: 1.1652


Evaluating: 100%|██████████| 151/151 [00:58<00:00,  2.58it/s]


Validation loss: 1.0970
Model saved to fine_tuned_gpt2_lyrics/model_epoch_2
Epoch 3/3


Training: 100%|██████████| 1356/1356 [31:48<00:00,  1.41s/it, loss=0.697]


Train loss: 1.1289


Evaluating: 100%|██████████| 151/151 [00:58<00:00,  2.59it/s]


Validation loss: 1.0880
Model saved to fine_tuned_gpt2_lyrics/model_epoch_3
Training complete!


In [39]:
def generate_lyrics(model, tokenizer, prompt="", max_length=100, temperature=0.78):
    model.eval()

    full_prompt = f"{prompt}"

    input_ids = tokenizer.encode(full_prompt, return_tensors="pt").to("cpu")

    output = model.generate(
        input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=50,
        top_p=0.90,
        repetition_penalty=1.2,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# !zip -r /content/fine_tuned_gpt2_lyrics.zip /content/fine_tuned_gpt2_lyrics

In [3]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import os
from sklearn.model_selection import train_test_split

In [47]:
import time

def type_words(sentence, delay=0.3):
    
    words = sentence.split()
    for i, word in enumerate(words):
        print(word, end=' ', flush=True)
        if i < len(words) - 1:
            time.sleep(delay)
    print()


In [72]:
model_path = f"./content/fine_tuned_gpt2_lyrics/model_epoch_3"
if os.path.exists(model_path):
    fine_tuned_model = GPT2LMHeadModel.from_pretrained(model_path).to("cpu")
    fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained(model_path)

    prompt = input("Enter your song lyrics : ")
    
    print("\nGenerating sample lyrics:")
    generated = generate_lyrics(fine_tuned_model, fine_tuned_tokenizer, prompt)
    type_words(generated)

    print("-" * 50)

Enter your song lyrics :  this is crazy and amazing feeling being with you



Generating sample lyrics:
this is crazy and amazing feeling being with you in the light of your own life i can't help but wonder if we could do this pre a girl like me would never leave us alone so let's just be ourselves baby keep loving yourself until it comes around to waking up at night oh yeah well don tell that story all through 'til she wakes from her dream then wake him out again take one look inside his head now he wants more too much no matter what happens after they get together boy why 
--------------------------------------------------
