<a href="https://colab.research.google.com/github/paolofantozzi/codemotion2020/blob/main/Beatles_scraping_and_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount the user's Google Drive at the relative path 'drive' so that the
# scraped lyrics and the generated train/test files persist across sessions.
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [2]:
from pathlib import Path

# Folder (inside the mounted Drive) holding every artefact of this notebook:
# the downloaded lyrics JSON and the train/test text files.
project_path = Path('drive/My Drive/colab_data/beatles')
# Create the folder on first use; later cells write into it and would fail
# with FileNotFoundError on a Drive that does not contain it yet.
project_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Install the Genius API client used below to download the lyrics.
# NOTE(review): the recorded run resolved to lyricsgenius 2.0.2; the version is
# left unpinned here — confirm which release still works before pinning.
!pip install lyricsgenius

Collecting lyricsgenius
[?25l  Downloading https://files.pythonhosted.org/packages/41/c1/b7d56971a43e430214727daf774623d8edd0c13fe7bac1f484d0934af29b/lyricsgenius-2.0.2-py3-none-any.whl (46kB)
[K     |███████▏                        | 10kB 21.7MB/s eta 0:00:01[K     |██████████████▎                 | 20kB 24.9MB/s eta 0:00:01[K     |█████████████████████▍          | 30kB 13.3MB/s eta 0:00:01[K     |████████████████████████████▌   | 40kB 11.6MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 5.5MB/s 
Installing collected packages: lyricsgenius
Successfully installed lyricsgenius-2.0.2


In [4]:
import os
from getpass import getpass

import lyricsgenius

# Credentials must never live in the notebook source. The token is read from
# the GENIUS_ACCESS_TOKEN environment variable, falling back to an interactive
# prompt that does not echo or store the value in the saved notebook.
# NOTE: the token that used to be hardcoded here was published together with
# the notebook and should be revoked.
genius_token = os.environ.get('GENIUS_ACCESS_TOKEN') or getpass('Genius API access token: ')

genius = lyricsgenius.Genius(genius_token)
# Skip entries that are not songs and titles containing any of these terms,
# so alternative takes of the same song are not downloaded.
genius.skip_non_songs = True
genius.excluded_terms = ['(Remix)', '(Live)', '(Take)', '(Version)', '(Mix)', '(Demo)']

# The download is cached on Drive: the artist is only searched when the JSON
# file is not already present.
lyrics_path = project_path / 'Lyrics_TheBeatles.json'
if not lyrics_path.exists():
  artist = genius.search_artist('Beatles', sort='title')
  artist.save_lyrics(filename=str(lyrics_path))

In [5]:
import json

def is_lyric(text):
  """Tell whether `text` looks like annotated song lyrics.

  A text qualifies when it is non-empty and contains at least one of the
  section markers '[Verse' or '[Chorus]'. Anything else (missing text,
  empty string, text without those markers) is rejected.

  Returns:
    bool: True if the text should be kept, False otherwise.
  """
  section_markers = ('[Verse', '[Chorus]')
  return bool(text) and any(marker in text for marker in section_markers)

# Load the cached artist dump. The encoding is set explicitly so that reading
# the JSON file does not depend on the platform's default locale encoding.
with open(lyrics_path, 'r', encoding='utf-8') as lyrics_file:
  artist = json.load(lyrics_file)

# Keep only title and lyrics of the songs whose text passes `is_lyric`.
lyrics = [
  {'title': s['title'], 'lyrics': s['lyrics']}
  for s in artist['songs']
  if is_lyric(s['lyrics'])
]

# Report how many songs were filtered out and how many remain.
discarded = len(artist['songs']) - len(lyrics)
print(f'Discarded {discarded}')
print(f'Keeped {len(lyrics)}')


Discarded 136
Keeped 268


In [6]:
def pre_process_song(song):
  """Serialise one song into the text layout used for the training files.

  Args:
    song: mapping with the keys 'title' and 'lyrics'.

  Returns:
    str: the title and the lyrics wrapped between the '<s_song>' and
    '<e_song>' delimiter lines, with a '[Lyrics]' line separating the two.
  """
  template = '<s_song>\n{title}\n[Lyrics]\n{lyrics}\n<e_song>\n'
  return template.format(title=song['title'], lyrics=song['lyrics'])

In [7]:
from sklearn.model_selection import train_test_split

# Fixed seed: without it every run produces a different train/test partition,
# so results (and the files written from these lists) are not reproducible.
SPLIT_SEED = 42

# One formatted text block per song, then an 80/20 split at song level.
all_data = [pre_process_song(song) for song in lyrics]
train_data_str, test_data_str = train_test_split(
  all_data,
  test_size=0.2,
  random_state=SPLIT_SEED,
)

print(f'Train size: {len(train_data_str)}')
print(f'Test size: {len(test_data_str)}')

Train size: 214
Test size: 54


In [8]:
# Dump each split to a plain-text file, one formatted song after the other
# (each followed by an extra newline). The encoding is explicit so the files
# are written as UTF-8 regardless of the platform's default locale encoding.
train_path = project_path / 'train_data.txt'
with open(train_path, 'w', encoding='utf-8') as train_data_file:
  train_data_file.writelines(f'{line}\n' for line in train_data_str)

test_path = project_path / 'test_data.txt'
with open(test_path, 'w', encoding='utf-8') as test_data_file:
  test_data_file.writelines(f'{line}\n' for line in test_data_str)

In [9]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 11.3MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 49.7MB/s 
[?25hCollecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 52.3MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.

In [10]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

# Pretrained checkpoint to fine-tune.
MODEL_NAME = 'gpt2'
# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead; GPT-2 is a
# causal (left-to-right) language model, so this is the matching auto class.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [11]:
from transformers import DataCollatorForLanguageModeling
from transformers import TextDataset

# Number of tokens per training example.
BLOCK_SIZE = 128

# overwrite_cache=True: TextDataset stores the tokenised features next to the
# input file and reuses them whenever that cache exists. The split files are
# rewritten by this notebook, so a stale cache from a previous run would make
# the model train/evaluate on a different split than the one just written.
train_data = TextDataset(
    tokenizer=tokenizer,
    file_path=str(train_path),
    block_size=BLOCK_SIZE,
    overwrite_cache=True,
)

test_data = TextDataset(
    tokenizer=tokenizer,
    file_path=str(test_path),
    block_size=BLOCK_SIZE,
    overwrite_cache=True,
)

# mlm=False: batches are collated for causal language modelling rather than
# masked language modelling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [12]:
from transformers import Trainer
from transformers import TrainingArguments

# Where checkpoints / the final model and the training logs are written.
model_path = Path('./gpt2-beatles')
logs_path = Path('./logs')

# Hyper-parameters of the fine-tuning run, gathered in one place.
# NOTE(review): the recorded training log only shows the training loss, so
# `eval_steps` may have no effect with these arguments — confirm whether an
# evaluation strategy has to be enabled for periodic evaluation.
training_config = {
  'output_dir': str(model_path),        # output directory
  'logging_dir': str(logs_path),        # logs directory
  'overwrite_output_dir': True,         # overwrite the content of the output directory
  'num_train_epochs': 70,               # number of training epochs
  'per_device_train_batch_size': 16,    # batch size for training
  'per_device_eval_batch_size': 32,     # batch size for evaluation
  'eval_steps': 400,                    # number of update steps between two evaluations
  'save_steps': 800,                    # after how many steps model is saved
  'warmup_steps': 500,                  # number of warmup steps for learning rate scheduler
}
training_args = TrainingArguments(**training_config)

# Wire model, data and arguments together; evaluation uses the held-out songs.
trainer_config = {
  'model': model,
  'args': training_args,
  'data_collator': data_collator,
  'train_dataset': train_data,
  'eval_dataset': test_data,
  'prediction_loss_only': True,
}
trainer = Trainer(**trainer_config)



In [13]:
# Run the fine-tuning loop configured in `trainer` (model, data and
# `training_args` defined in the previous cell).
trainer.train()

Step,Training Loss
500,2.166332
1000,0.956241
1500,0.357635
2000,0.188342
2500,0.138242


TrainOutput(global_step=2520, training_loss=0.7563310895647322)



```
// Browser-console helper (not Python): clicks the page element with id "comments".
function ClickConnect(){document.querySelector("#comments").click();}
// Repeat the click every 60000 ms (one minute); presumably meant to keep the
// Colab session from being disconnected for inactivity during training — confirm.
setInterval(ClickConnect,60000)
```



In [14]:
# Evaluate the fine-tuned model on the dataset passed to the Trainer as
# `eval_dataset` (the held-out `test_data`).
trainer.evaluate()

{'epoch': 70.0, 'eval_loss': 4.07750940322876, 'total_flos': 3766404735959040}

In [15]:
# Persist the fine-tuned model. No target is given; presumably it is written to
# the Trainer's output directory (`model_path`), which `generate` loads from — confirm.
trainer.save_model()

In [23]:
from functools import lru_cache

from transformers import pipeline


@lru_cache(maxsize=1)
def _get_generator():
  """Build the text-generation pipeline once and reuse it on later calls.

  The pipeline loads the fine-tuned model from `model_path`; doing that on
  every `generate` call re-reads the whole model from disk each time.
  Call `_get_generator.cache_clear()` after saving a new model to force a reload.
  """
  return pipeline('text-generation', model=str(model_path), tokenizer=tokenizer)


def generate(title, temperature, top_k):
  """Generate lyrics for a song title with the fine-tuned model.

  The prompt reproduces the header of a training example ('<s_song>', the
  title, '[Lyrics]'), so the model continues with the lyrics.

  Args:
    title: song title placed in the prompt.
    temperature: sampling temperature forwarded to the pipeline.
    top_k: top-k value forwarded to the pipeline.

  Returns:
    str: the generated text of the first returned sequence (prompt included,
    as shown in the recorded outputs).
  """
  prompt = f'<s_song>\n{title}\n[Lyrics]\n'
  outputs = _get_generator()(
    prompt,
    max_length=10**3,
    temperature=temperature,
    top_k=top_k,
  )
  return outputs[0]['generated_text']

In [17]:
# Sample lyrics for the title 'Love of my life' with temperature 0.8 and top_k=0.
print(generate('Love of my life', temperature=0.8, top_k=0))

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


<s_song>
Love of my life
[Lyrics]
[Verse 1]
Nothing can come between us
When it gets dark I tow your heart away
Never thought of act of love I've ever known

[Chorus]
For if my love had lasted a day
We might have broken down
If we'd seen each other before

[Verse 2]
I can't forget the time or place
Where we just met, she's just the girl for me
And I want all the world to see we've met
Mmm-mmm-mmm-da-da

[Chorus]
Love of my life
[Verse 3]
Nothing can come between us
When it gets dark I tow your heart away
Never thought of act of love I've ever known

[Chorus]
For if my love had lasted a day
We might have broken down
If we'd seen each other before

[Verse 4]
I can't forget the time or place
Where we just met, she's just the girl for me
And I want all the world to see we've met
Mmm-mmm-da-da-da

[Chorus]
Love of my life
[Verse 5]
I can't forget the time or place
Where we just met, she's just the girl for me
And I want all the world to see we've met
Mmm-mmm-da-da-da

[Chorus]
For if my lov

In [19]:
# Same title and temperature (0.8), this time with top_k=40.
print(generate('Love of my life', temperature=0.8, top_k=40))

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


<s_song>
Love of my life
[Lyrics]
[Verse 1]
There is one love I have
Love that keeps me on my toes
Sitting here in Blue Jay Way

[Verse 2]
Don't want to spend
Another day here
Loving you like I've never done
In the morning
Just waiting for you

[Chorus]
Love of my life
Love of my life
Love of my life

[Verse 3]
Longer than the road that stretches out ahead

[Bridge]
And before too long, I fall in love with you

[Instrumental Break]

[Chorus]

[Verse 1]
Don't want to spend
Another day here
Loving you like I've never done
In the morning
Just waiting for you

[Chorus]

[Verse 2]
Don't want to spend
Another day here
Loving you like I've never done
In the morning
Just waiting for you

[Chorus]

[Verse 3]
Love of my life
Love of my life
Love of my life

[Bridge]
And before too long, I fall in love with you

[Instrumental Break]

[Chorus]

[Verse 1]
Don't want to spend
Another day here
Loving you like I've never done
In the morning
Just waiting for you

[Chorus]

[Verse 2]
Don't want to spend

In [20]:
# Same title with a higher temperature (1.2) and top_k=0.
print(generate('Love of my life', temperature=1.2, top_k=0))

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


<s_song>
Love of my life
[Lyrics]
[Verse 1]
It is not enough for me to show you my love
That is, I must show you my love
Who can forget the love I have who came to share
And show you the joy that comes from holding you close

[Verse 2]
When I call you on the telephone
You say, "It's me"
When I whisper in your ear
You say, "It's me, love"
Just like I've known the secret for some time now

[Chorus]
Now this is the end, happy with you

[Outro]
Love of my life
Love of my life
Love of my life

[Verse 3]
When I write down a short list of love songs
Each one speaking of a love that has already been
And leaving no stone unturned
I just have to suppose it belongs to you

[Chorus]
All the songs that you've just heard
All the songs that you're thinking of
All the songs that you're thinking of
All the songs that you're thinking of
All the songs that you're thinking of
All the songs that you're thinking of
All the songs that you're thinking of
All the songs that you're thinking of
All the songs tha

In [24]:
# Same title with temperature 1.2 and top_k=40.
print(generate('Love of my life', temperature=1.2, top_k=40))

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


<s_song>
Love of my life
[Lyrics]
[Verse 1]
The stars that shine above
Will light our way to love

[Chorus]
Love of my life
Love of my life
Love of my life

[Verse 2]
In the evening sky
People tell me that there's someone there
But I haven't seen him there

[Chorus]
There's nowhere I can be
Just like this little blue hilltop
Will I ever be free
So go away

[Instrumental Verses]

[Verse 3]
As the June light turns to moonlight
I tune in to the sound
Like an octopus on an electric light dance

[Chorus]
As the June light turns to moonlight
I tune in to the sound
Like an octopus on an electric light dance

[Chorus]
I know I never really been aware of this before
But I've learned to dance like an African dancer

[Instrumental Verses]

[Verse 4]
At night where the sun is shining
People tell me that there's a presence there
But I haven't seen him there

[Chorus]
There's nowhere I can be
Just like this little blue hilltop
Will I ever be free
So go away

[Instrumental Verses]

[Verse 5]
As the J