In [1]:
from tqdm import tqdm

Reference tutorial: https://towardsdatascience.com/how-to-fine-tune-gpt-2-for-text-generation-ae2ea53bc272

Dataset from

https://www.kaggle.com/datasets/neisse/scrapped-lyrics-from-6-genres

https://github.com/aparrish/gutenberg-poetry-corpus/blob/master/quick-experiments.ipynb

In [7]:
import csv
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

### Prepare data
# Load the scraped lyrics and keep only the English ones.
lyrics = pd.read_csv('songlyrics/lyrics-data.csv')
lyrics = lyrics[lyrics['language']=='en']

# Only keep popular artists: genre exactly 'Rock' and popularity above 5.
artists = pd.read_csv('songlyrics/artists-data.csv')
artists = artists[(artists['Genres'].isin(['Rock'])) & (artists['Popularity']>5)]
df = lyrics.merge(artists[['Artist', 'Genres', 'Link']], left_on='ALink', right_on='Link', how='inner')
df = df.drop(columns=['ALink','SLink','language','Link'])

# Drop the songs with lyrics too long (after more than 1024 tokens, does not work).
# 350 space-separated words is the cheap proxy used for that limit. The
# vectorised .str form yields NaN (-> False) for missing lyrics, so such rows
# are dropped here instead of raising inside a Python lambda.
df = df[df['Lyric'].str.split(' ').str.len() < 350]

# Create a very small test set to compare generated text with the reality.
# A fixed random_state makes the train/test split identical on every re-run.
test_set = df.sample(n=200, random_state=42)
df = df.loc[~df.index.isin(test_set.index)]

# Reset the indexes (the previous index is kept as an 'index' column).
test_set = test_set.reset_index()
df = df.reset_index()

# For the test set only, keep last 20 words in a new column, then remove them
# from the original column. The lyric is tokenised once and reused for both.
test_words = test_set['Lyric'].str.split()
test_set['True_end_lyrics'] = test_words.str[-20:].apply(' '.join)
test_set['Lyric'] = test_words.str[:-20].apply(' '.join)

In [11]:
df.head()

Unnamed: 0,index,SName,Lyric,Artist,Genres
0,0,What's Up,Twenty-five years and my life is still\nTrying...,4 Non Blondes,Rock
1,1,Spaceman,Starry night bring me down\nTill I realize the...,4 Non Blondes,Rock
2,2,Pleasantly Blue,Every time you wake in the mornin'\nAnd you st...,4 Non Blondes,Rock
3,3,I'm The One,Ah-hah!\nWoo!\nAh-ha-ha-ha-ha-ha!\nWe came her...,4 Non Blondes,Rock
4,4,Dear Mr. President,I'm looking outside of my windows\nThe view th...,4 Non Blondes,Rock


In [12]:
class SongLyrics(Dataset):
    """Dataset of GPT-2 token-id tensors, one 1-D tensor per song lyric.

    Each lyric is wrapped as ``<|{control_code}|>{lyric}<|endoftext|>`` before
    being encoded with the GPT-2 tokenizer.

    Args:
        control_code: Value interpolated into the ``<|...|>`` prefix via an
            f-string. It should be a short string; any other object is
            inserted as its ``str()`` representation.
        truncate: When true, only the first ``MAX_SONGS`` lyrics are used.
        gpt2_type: Name passed to ``GPT2Tokenizer.from_pretrained``.
        max_length: Maximum number of *characters* (not tokens) kept from
            each lyric before encoding.
        lyrics: Optional iterable of lyric strings. When omitted, the
            notebook-level ``df['Lyric']`` column is used, which is the
            original behaviour.
    """

    # Cap applied when ``truncate`` is true.
    MAX_SONGS = 20000

    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024, lyrics=None):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)

        # Fall back to the notebook global only when no explicit source is given.
        source = df['Lyric'] if lyrics is None else lyrics
        rows = list(source)
        # Cap the rows *before* tokenising so discarded lyrics are never encoded.
        if truncate:
            rows = rows[:self.MAX_SONGS]

        self.lyrics = [
            torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            )
            for row in rows
        ]
        self.lyrics_count = len(self.lyrics)

    def __len__(self):
        """Return the number of encoded lyrics."""
        return self.lyrics_count

    def __getitem__(self, item):
        """Return the token-id tensor stored at position ``item``."""
        return self.lyrics[item]
    
# The first argument is the control code that SongLyrics interpolates into the
# "<|...|>" prefix of every sample. Passing the df['Lyric'] Series here would
# embed the Series' printed representation in each training example, so a short
# string tag is used instead; the lyrics themselves are read inside the class.
dataset = SongLyrics("lyrics", truncate=True, gpt2_type="gpt2")

In [14]:
dataset

<__main__.SongLyrics at 0x7fbe2c690af0>

In [15]:
# Get the tokenizer and the LM-head model from the 'gpt2' pretrained checkpoint.
# These module-level names are what train() is later called with and what
# text_generation() reads as globals.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Sequence packing: several short samples are concatenated into one longer
# input so that fewer forward passes are needed.
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into ``packed_tensor`` along dimension 1.

    Returns a 3-tuple ``(tensor, fits, leftover)``:

    * nothing packed yet (``packed_tensor is None``):
      ``(new_tensor, True, None)``;
    * combined length along dim 1 is at most ``max_seq_len``:
      ``(merged, True, None)`` where ``merged`` is ``new_tensor`` followed by
      ``packed_tensor`` without its first column;
    * otherwise: ``(packed_tensor, False, new_tensor)`` - the pack is full and
      the new tensor is handed back untouched.
    """
    # First sample of a fresh pack: it simply becomes the pack.
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.size()[1] + packed_tensor.size()[1]
    if combined_len <= max_seq_len:
        merged = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return merged, True, None

    # Too long to merge: report the existing pack as complete.
    return packed_tensor, False, new_tensor

In [18]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=5, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` with packed inputs and gradient accumulation.

    Samples are drawn one at a time in shuffled order and concatenated with
    ``pack_tensor`` until the pack would exceed 768 tokens; each full pack is
    one forward/backward pass. The optimizer and scheduler step once every
    ``batch_size`` packs.

    Args:
        dataset: Dataset yielding 1-D tensors of token ids.
        model: Model called as ``model(input_ids, labels=input_ids)`` whose
            first output is the loss. It is trained in place, on whatever
            device its parameters already live on.
        tokenizer: Accepted for interface compatibility; not used here.
        batch_size: Number of packs accumulated per optimizer step.
        epochs: Number of passes over ``dataset``.
        lr: Peak learning rate for AdamW.
        max_seq_len: Accepted for interface compatibility; packing uses the
            fixed 768-token limit below, as it always has.
        warmup_steps: Number of linear warmup steps for the scheduler.
        gpt2_type: Accepted for interface compatibility; not used here.
        output_dir: Directory for per-epoch checkpoints.
        output_prefix: File-name prefix for per-epoch checkpoints.
        test_mode: Accepted for interface compatibility; not used here.
        save_model_on_epoch: When true, save ``model.state_dict()`` to
            ``{output_dir}/{output_prefix}-{epoch}.pt`` after every epoch.

    Returns:
        The same ``model`` object, after training.
    """
    pack_len = 768
    # Inputs follow the model instead of assuming CPU, so a model the caller
    # moved to another device keeps working.
    device = next(model.parameters()).device
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

    # The linear schedule decays towards zero at ``num_training_steps``, so it
    # needs a real step count: a placeholder such as -1 leaves no positive
    # learning rate once warmup is over. Packing makes the exact number of
    # optimizer steps data-dependent, so an upper bound is used (one pack per
    # sample, ``batch_size`` packs per step).
    total_steps = max(1, (epochs * len(train_dataloader) + batch_size - 1) // batch_size)

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    loss=0
    accumulating_batch_count = 0
    input_tensor = None
    last_idx = len(train_dataloader) - 1

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        # tqdm wraps the dataloader (not the enumerate) so it knows the total.
        for idx, entry in enumerate(tqdm(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, pack_len)

            # Keep packing until the pack is full or the epoch's data runs out.
            if carry_on and idx != last_idx:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            # Count the pack first so a step happens after ``batch_size``
            # accumulated packs rather than right after the very first one.
            accumulating_batch_count += 1
            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            # Start the next pack with the sample that did not fit (None when
            # everything was consumed) instead of silently dropping it.
            input_tensor = remainder
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

Train the model

In [20]:
model = train(dataset, model, tokenizer)

Training epoch 0
0


111it [08:57,  4.84s/it]


KeyboardInterrupt: 

In [22]:
# Persist the whole model object (not just its state_dict) to checkpoint.pth
# in the current working directory.
torch.save(model,"checkpoint.pth")

In [24]:
# Reload the model object from checkpoint.pth and rebind `model` to it.
# NOTE(review): torch.load unpickles Python objects, so only load checkpoints
# you created yourself. Recent PyTorch releases presumably need an explicit
# weights_only=False to load a full-model pickle like this one - TODO confirm
# against the installed version.
model = torch.load("checkpoint.pth")

In [28]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30, #maximum number of generated tokens per entry
    top_p=0.8,
    temperature=1.,
):
    """Sample ``entry_count`` continuations of ``prompt`` with nucleus (top-p) sampling.

    Args:
        model: Causal LM whose first output, when called with token ids only,
            is the logits tensor of shape (batch, sequence, vocabulary).
        tokenizer: Object providing ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str``.
        prompt: Text that every generated entry starts from.
        entry_count: Number of independent entries to generate.
        entry_length: Maximum number of tokens sampled per entry.
        top_p: Cumulative-probability threshold for the nucleus filter.
        temperature: Logit divisor; non-positive values are treated as 1.0.

    Returns:
        A list of ``entry_count`` strings. Each is the decoded prompt plus the
        sampled tokens. Entries that reached ``entry_length`` without sampling
        the end-of-text token get a literal ``<|endoftext|>`` appended.
    """
    model.eval()
    generated_list = []

    filter_value = -float("Inf")

    # Loop-invariant work: encode the prompt and the stop token only once, and
    # create the inputs on the model's own device.
    device = next(model.parameters()).device
    prompt_ids = tokenizer.encode(prompt)
    eos_ids = tokenizer.encode("<|endoftext|>")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(prompt_ids, device=device).unsqueeze(0)

            for i in range(entry_length):
                # No labels are passed: only the logits are needed, so the
                # model does not have to compute a loss on every step.
                logits = model(generated)[0]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Mark tokens outside the nucleus. The mask is shifted right by
                # one so the first token crossing the threshold is still kept,
                # and the most likely token is never removed.
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token.item() in eos_ids:
                    entry_finished = True
                    break

            # Index the single batch row and use tolist(), which works on any
            # device and regardless of sequence length.
            output_text = tokenizer.decode(generated[0].tolist())
            if not entry_finished:
                output_text = f"{output_text}<|endoftext|>"
            generated_list.append(output_text)

    return generated_list

#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data):
  """Generate one continuation for every row of ``test_data['Lyric']``.

  Uses the notebook-level ``model`` and ``tokenizer``. The model is moved to
  the CPU once before generation starts.

  Args:
    test_data: DataFrame with a ``'Lyric'`` column of prompt strings.

  Returns:
    A list with one element per row, in row order; each element is the list
    returned by ``generate(..., entry_count=1)``.
  """
  cpu_model = model.to('cpu')
  # Iterate over the column's values rather than looking rows up by the labels
  # 0..n-1, so frames with any index (filtered, sampled, sliced) work too.
  return [
    generate(cpu_model, tokenizer, prompt, entry_count=1)
    for prompt in test_data['Lyric']
  ]

#Run the functions to generate the lyrics
generated_lyrics = text_generation(test_set)

100%|██████████| 1/1 [00:49<00:00, 49.21s/it]
100%|██████████| 1/1 [00:46<00:00, 46.11s/it]
100%|██████████| 1/1 [00:16<00:00, 16.65s/it]
100%|██████████| 1/1 [00:34<00:00, 34.09s/it]
100%|██████████| 1/1 [00:27<00:00, 27.32s/it]
100%|██████████| 1/1 [00:21<00:00, 21.10s/it]
100%|██████████| 1/1 [00:37<00:00, 37.35s/it]
100%|██████████| 1/1 [00:41<00:00, 41.55s/it]
100%|██████████| 1/1 [00:37<00:00, 37.46s/it]
100%|██████████| 1/1 [00:34<00:00, 34.62s/it]
100%|██████████| 1/1 [00:21<00:00, 21.24s/it]
100%|██████████| 1/1 [00:30<00:00, 30.05s/it]
100%|██████████| 1/1 [00:25<00:00, 25.89s/it]
100%|██████████| 1/1 [00:10<00:00, 10.93s/it]
100%|██████████| 1/1 [00:17<00:00, 17.43s/it]
  0%|          | 0/1 [00:15<?, ?it/s]


KeyboardInterrupt: 

In [31]:
generated_lyrics = text_generation(test_set.head())

100%|██████████| 1/1 [00:38<00:00, 38.85s/it]
100%|██████████| 1/1 [00:44<00:00, 44.60s/it]
100%|██████████| 1/1 [00:21<00:00, 21.06s/it]
100%|██████████| 1/1 [00:37<00:00, 37.05s/it]
100%|██████████| 1/1 [00:25<00:00, 25.63s/it]


In [33]:
print (generated_lyrics[0])

["Solemn faced, the village settles down, undetected by the stars And the hangman plays the mandolin before he goes to sleep And the last thing on his mind Is the Wild Eyed Boy imprisoned neath the covered wooden shaft Folds the rope into its bag Blows his pipe of smolders, blankets smoke into the room And the day will end for some as the night begins for one Staring through the message in his eyes, lies a solitary son From the mountain called Freecloud where the eagle dare not fly And the patience in his sigh gives no indication for the townsmen to decide So the village Dreadful yawns pronouncing gross diversion as the label for the dog oh it's the madness in his eyes as he breaks the night to cry It's really me Really you and really me It's so hard for us to really be Really you and really me You'll lose me though I'm always really free And the mountain moved its eyes To the world of realize Where the snow had saved a place For the Wild Eyed Boy from Freecloud And the village Dreadfu

In [34]:
!ls -lh

total 1023344
-rw-r--r--@   1 jbt694  staff    12K Feb 23 16:34 Copy_of_text_similarity.ipynb
-rw-r--r--    1 jbt694  staff   7.4K Jun  1 14:05 GPT2 language scoring.ipynb
-rw-r--r--    1 jbt694  staff    22K Jun  1 14:04 GPT2-language generation.ipynb
drwxr-xr-x    3 jbt694  staff    96B Feb 18 13:57 [34m__pycache__[m[m
-rw-r--r--    1 jbt694  staff   487M Jun  1 15:16 checkpoint.pth
-rw-r--r--    1 jbt694  staff   1.7K Jun  1 14:52 finetune_poetry.ipynb
-rw-r--r--    1 jbt694  staff    51K Jun  1 15:30 finetune_songs.ipynb
-rw-r--r--@   1 jbt694  staff   556B Mar 23 13:18 format_poemsdataset.py
drwxr-xr-x   13 jbt694  staff   416B May 13 11:31 [34mhuman_kaggle_scansion[m[m
-rw-r--r--    1 jbt694  staff   1.4K May 13 12:49 humankaggleenglishrhymes.txt
drwxr-xr-x   13 jbt694  staff   416B Jun  1 13:58 [34mlm-scorer[m[m
drwxr-xr-x    6 jbt694  staff   192B Jun  1 13:58 [34mlm_scorer[m[m
drwxr-xr-x   16 jbt694  staff   512B May 13 12:52 [34mopenai_scansion[m[m