In [None]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from loguru import logger

import multiprocessing as mp

In [None]:
def get_song_urls() -> list:
    """
    Return the list of Zucchero's song-lyrics urls.

    The lyrics index page is downloaded and every ``href`` found inside the
    first ``<ul>`` of the ``div.article-content`` element is collected.

    Returns:
        list: the ``href`` values, in document order.

    Raises:
        urllib.error.URLError: if the index page cannot be fetched.
        AttributeError: if the page does not contain the expected
            ``div.article-content`` / ``<ul>`` structure.
    """

    url = "https://www.zucchero.it/eng/lyrics/"
    # The context manager closes the HTTP response as soon as the body is read,
    # instead of leaving the connection open until garbage collection.
    with urlopen(url) as page:
        html = page.read().decode("utf-8")

    soup = BeautifulSoup(html, "html.parser")
    link_list = soup.find('div', {'class': 'article-content'}).ul
    song_urls = [item["href"] for item in link_list.find_all(href=True)]

    return song_urls

@logger.catch
def get_song_text(song_url: str) -> str:
    """
    Given a song's url as input, it returns the text of the song scraped from the webpage.

    The text is built from the ``<p>`` elements inside ``div.article-content``:
    paragraphs whose markup contains ``<em>`` are skipped, newlines inside a
    paragraph are replaced with ``'. '`` and paragraphs are joined with ``'. '``.

    Args:
        song_url: url of the song page.

    Returns:
        The song text, or ``None`` when the url is in the exclusion list or the
        page cannot be parsed. Errors raised while downloading the page are
        logged by ``logger.catch`` (which then also yields ``None``).
    """

    # Pages for which the first paragraph is dropped.
    drop_first_paragraph = ['https://www.zucchero.it/eng/testi/alla-fine/',
                            'https://www.zucchero.it/eng/testi/e-un-peccato-morir/',
                            'https://www.zucchero.it/eng/testi/god-bless-the-child/']
    # Page for which the last paragraph is dropped.
    drop_last_paragraph = 'https://www.zucchero.it/eng/testi/il-suono-della-domenica/'
    # Pages that are skipped altogether.
    excluded = ['https://www.zucchero.it/eng/testi/diamante/',
                'https://www.zucchero.it/eng/testi/soldati-nella-mia-citta/',
                'https://www.zucchero.it/eng/testi/spicinfrin-boy/',
                'https://www.zucchero.it/eng/oltre-le-rive/',
                'https://www.zucchero.it/eng/alla-fine/',
                'https://www.zucchero.it/eng/shake/']

    # Decide before touching the network: excluded pages are never downloaded.
    if song_url in excluded:
        return None

    # Close the HTTP response deterministically once the body has been read.
    with urlopen(song_url) as page:
        html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")

    try:
        paragraphs = soup.find('div', {'class': 'article-content'}).find_all("p")
        if song_url in drop_first_paragraph:
            paragraphs = paragraphs[1:]
        elif song_url == drop_last_paragraph:
            paragraphs = paragraphs[:-1]
        return '. '.join(item.get_text().replace('\n', '. ')
                         for item in paragraphs
                         if '<em>' not in str(item))
    except Exception:
        # Best effort: a page without the expected structure yields no text.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
        # propagate, so a long scraping loop can still be interrupted.
        return None


def build_dataset(songs: list, filename: str):
    """
    Write a .txt file where each song includes the special tokens <BOS> at the beginning and <EOS> at the end.

    Each song is converted with ``str()``, stripped of surrounding whitespace and
    written as ``<BOS> {song} <EOS>`` followed by a newline.

    Args:
        songs: the song texts to write.
        filename: output path without extension; ``.txt`` is appended.
    """

    bos_token = '<BOS>'
    eos_token = '<EOS>'
    # Build all entries first and join once, rather than growing a string with `+=`.
    data = ''.join(
        bos_token + ' ' + str(song).strip() + ' ' + eos_token + '\n'
        for song in songs
    )

    # The context manager guarantees the file is flushed and closed on exit.
    with open(f'{filename}.txt', 'w', encoding='utf-8') as f:
        f.write(data)
    


In [None]:
# Collect the url of every song listed on the lyrics index page.
song_urls = get_song_urls()

# Scrape each page, keeping only the songs for which some text was returned
# (get_song_text yields None for skipped/failed pages).
songs = []
for url in tqdm(song_urls):
    song_text = get_song_text(url)
    if song_text not in ['', None]:
        songs.append(song_text)

# Hold out 10% of the songs for testing; the fixed seed keeps the split reproducible.
x_train, x_test = train_test_split(songs, test_size=0.1, random_state=42)

# Writes train.txt and test.txt.
build_dataset(x_train, 'train')
build_dataset(x_test, 'test')

# Next: find the longest song (in tokens) to choose the padding length.

In [1]:
from transformers import GPT2Tokenizer

# Special tokens registered on top of the pretrained vocabulary.
# NOTE(review): build_dataset writes '<BOS>'/'<EOS>' markers, which differ from
# the tokens declared here — confirm which pair the training data should use.
special_tokens = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<|pad|>',
}

tokenizer = GPT2Tokenizer.from_pretrained('LorenzoDeMattei/GePpeTto', **special_tokens)

# Sanity check: encode a short sample sentence and display its token ids.
tokenizer.encode("ciao mamma")

Downloading:   0%|          | 0.00/547k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/287k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[21191, 8027]

In [17]:
# Load the lyrics file, one entry per line, dropping the newline characters.
with open('./dataset/sugar_lyrics.txt', encoding="utf8") as f:
    songs = [line.strip('\n') for line in f]

In [19]:
# Token count of the longest song, as produced by the tokenizer.
max_song = max(len(tokenizer.encode(song)) for song in songs)

print(f'The longest song text is {max_song} tokens long.')

The longest song text is 506 tokens long.
