In [1]:
from pathlib import Path
import json

In [2]:
lyrics_path_cleaned = Path('Lyrics_TheBeatles_cleaned.json')

In [3]:
with open(lyrics_path_cleaned, 'r') as lyrics_file_cleaned:
    lyrics = json.load(lyrics_file_cleaned)

In [4]:
project_path = Path('beatles_models')

In [5]:
def pre_process_song(song):
  song_title = song['title']
  song_lyrics = song['lyrics']
  return f'<s_song>\n{song_title}\n[Lyrics]\n{song_lyrics}\n<e_song>\n'

In [6]:
from sklearn.model_selection import train_test_split

all_data = [pre_process_song(song) for song in lyrics]
train_data_str, test_data_str = train_test_split(all_data, test_size=0.2)

print(f'Train size: {len(train_data_str)}')
print(f'Test size: {len(test_data_str)}')

Train size: 212
Test size: 54


In [7]:
train_path = project_path / 'train_data.txt'
with open(train_path, 'w') as train_data_file:
  train_data_file.writelines(f'{line}\n' for line in train_data_str)

test_path = project_path / 'test_data.txt'
with open(test_path, 'w') as test_data_file:
  test_data_file.writelines(f'{line}\n' for line in test_data_str)

In [8]:
!pip install transformers

distutils: /opt/conda/include/python3.8/UNKNOWN
sysconfig: /opt/conda/include/python3.8[0m
user = False
home = None
root = None
prefix = None[0m
distutils: /opt/conda/include/python3.8/UNKNOWN
sysconfig: /opt/conda/include/python3.8[0m
user = False
home = None
root = None
prefix = None[0m
You should consider upgrading via the '/opt/conda/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [9]:
from transformers import AutoModelWithLMHead
from transformers import AutoTokenizer

MODEL_NAME = 'gpt2'
model = AutoModelWithLMHead.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [10]:
import torch
torch.cuda.is_available()

True

In [11]:
model = model.to('cuda:0')

In [12]:
from transformers import DataCollatorForLanguageModeling
from transformers import TextDataset

train_data = TextDataset(
    tokenizer=tokenizer,
    file_path=str(train_path),
    block_size=128,
)

test_data = TextDataset(
    tokenizer=tokenizer,
    file_path=str(test_path),
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)



In [13]:
from transformers import Trainer
from transformers import TrainingArguments

model_path = Path('./gpt2-beatles')
logs_path = Path('./logs')

training_args = TrainingArguments(
  output_dir=str(model_path),     # output directory
  logging_dir=str(logs_path),     # logs directory
  overwrite_output_dir=True,      # overwrite the content of the output directory
  num_train_epochs=70,            # number of training epochs
  per_device_train_batch_size=16, # batch size for training
  per_device_eval_batch_size=32,  # batch size for evaluation
  eval_steps=400,                 # number of update steps between two evaluations
  save_steps=800,                 # after how many steps model is saved
  warmup_steps=500,               # number of warmup steps for learning rate scheduler
)

trainer = Trainer(
  model=model,
  args=training_args,
  data_collator=data_collator,
  train_dataset=train_data,
  eval_dataset=test_data,
#   prediction_loss_only=True,
)

In [14]:
trainer.train()

***** Running training *****
  Num examples = 562
  Num Epochs = 70
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2520


Step,Training Loss
500,2.1444
1000,0.9428
1500,0.3493
2000,0.1838
2500,0.1336


Saving model checkpoint to gpt2-beatles/checkpoint-800
Configuration saved in gpt2-beatles/checkpoint-800/config.json
Model weights saved in gpt2-beatles/checkpoint-800/pytorch_model.bin
Saving model checkpoint to gpt2-beatles/checkpoint-1600
Configuration saved in gpt2-beatles/checkpoint-1600/config.json
Model weights saved in gpt2-beatles/checkpoint-1600/pytorch_model.bin
Saving model checkpoint to gpt2-beatles/checkpoint-2400
Configuration saved in gpt2-beatles/checkpoint-2400/config.json
Model weights saved in gpt2-beatles/checkpoint-2400/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2520, training_loss=0.745812770109328, metrics={'train_runtime': 858.4908, 'train_samples_per_second': 45.825, 'train_steps_per_second': 2.935, 'total_flos': 2569807134720000.0, 'train_loss': 0.745812770109328, 'epoch': 70.0})

In [15]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 134
  Batch size = 32


{'eval_loss': 4.135647296905518,
 'eval_runtime': 0.9036,
 'eval_samples_per_second': 148.291,
 'eval_steps_per_second': 5.533,
 'epoch': 70.0}

In [16]:
trainer.save_model()

Saving model checkpoint to gpt2-beatles
Configuration saved in gpt2-beatles/config.json
Model weights saved in gpt2-beatles/pytorch_model.bin


In [17]:
from transformers import pipeline

def generate(title, temperature, top_k):
  generator = pipeline('text-generation', model=str(model_path), tokenizer=tokenizer)
  return generator(
        f'<s_song>\n{title}\n[Lyrics]\n',
        max_length=10**3,
        temperature=temperature,
        top_k=top_k
  )[0]['generated_text']

In [18]:
print(generate('Love of my life', temperature=0.8, top_k=0))

loading configuration file gpt2-beatles/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-beatles",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "use_cache": true,
  "vocab_size": 50257
}

loading 

<s_song>
Love of my life
[Lyrics]
[Verse 1]
I don't know why nobody told you
You should know that I love you
More than anyone can
Love of my life

[Verse 2]
You don't need me to show the way
Just call on me when you're lonely
I'll keep my love for you only
I'll call on you if I'm lonely too

[Bridge]
And it really doesn't matter if I'm wrong, I'm right
I wouldn't mind if I knew how you feel

[Verse 3]
Although the days are few, they're filled with tears
And since your love has departed
I just don't know how you will be able to come back

[Bridge]
And it really doesn't matter if I'm wrong, I'm right
I wouldn't mind if I knew how you feel

[Verse 4]
Though the days are few, they're filled with tears
And since your love has departed
I just don't know how you will be able to come back

[Verse 5]
Though the days are few, they're filled with tears
And since your love has departed
I just don't know how you will be able to come back

[Outro]
You've been good to me
You've been good to me
<e_son