## DALI dataset 수집

In [6]:
import DALI as dali_code
import os

In [8]:
audio_path = '/Users/seohyeongyu/Desktop/work/naver-ai/data/DALI_v1.0/audio'

# Get DALI data idx of which audio is collected
collected_dali_idx = []
for file in os.listdir(audio_path):
    dali_idx = file.split(".")[0]
    collected_dali_idx.append(dali_idx)

In [10]:
# Get DALI dataset
dali_data_path = '/Users/seohyeongyu/Desktop/work/naver-ai/data/DALI_v1.0'
dali_data = dali_code.get_the_DALI_dataset(dali_data_path, keep=collected_dali_idx)

In [31]:
# Read all lyrics from dali_data
lyrics = []
for idx, entry in dali_data.items():
    info = entry.info
    language = info['metadata']['language']
    
    if language != 'english':
        continue
    
    lines = entry.annotations['annot']['lines']
    for i in range(len(lines) - 1):
        curr = lines[i]['text']
        nxt = lines[i+1]['text']
        lyrics.append((curr, nxt))

Lyrics of  For Reasons Unknown
Lyrics of  Some Like It Cold
Lyrics of  Marry You
Lyrics of  Baby Can I Hold You
Lyrics of  Holy Mountains
Lyrics of  Dark Side
Lyrics of  I Am What I Am
Lyrics of  Roadside
Lyrics of  Castaway
Lyrics of  Cat's In The Cradle
Lyrics of  Imperfection
Lyrics of  Neon Knights
Lyrics of  Youth Gone Wild
Lyrics of  In Your World
Lyrics of  Denial
Lyrics of  Circadian Rhythm (Last Dance)
Lyrics of  Losing The Ground
Lyrics of  She Goes Nana
Lyrics of  Outta Here
Lyrics of  Can't Repeat
Lyrics of  Tangled Up In You
Lyrics of  Let There Be Love
Lyrics of  A Little More Love


## DALI dataset 이용한 학습

In [36]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPTNeoForCausalLM
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import matplotlib.pyplot as plt

In [37]:
# Training setup
acc_steps = 100
pre_trained_model_name = "EleutherAI/gpt-neo-125M"
model = GPTNeoForCausalLM.from_pretrained(pre_trained_model_name)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=200, num_training_steps=1
)

In [72]:
class LyricsDataset(Dataset):
    def __init__(self, lyrics):
        pretrained_model_name = "EleutherAI/gpt-neo-125M"
        self.tokenizer: GPT2Tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.lyrics = []
        
        for lyric in lyrics:
            curr = lyric[0]
            nxt = lyric[1]
            
            encoded = self.tokenizer([
                f"{curr}{self.tokenizer.eos_token}",
                f"{nxt}{self.tokenizer.eos_token}"
            ], padding=True)
            
            curr_encoded = encoded.input_ids[0]
            nxt_encoded = encoded.input_ids[1]
            
            print(curr_encoded, nxt_encoded)
            
            curr_tensored = torch.tensor(curr_encoded)
            nxt_tensored = torch.tensor(nxt_encoded)
            
            self.lyrics.append((curr_tensored, nxt_tensored))
        
        self.lyrics_count = len(self.lyrics)
        
    def __len__(self):
        return self.lyrics_count
    
    def __getitem__(self, idx):
        return self.lyrics[idx]

In [73]:
dataset = LyricsDataset(lyrics)

[72, 2353, 616, 1339, 13, 50256] [72, 2198, 616, 1986, 13, 50256]
[72, 2198, 616, 1986, 13, 50256, 50256, 50256] [72, 804, 257, 1310, 1643, 4697, 13, 50256]
[72, 804, 257, 1310, 1643, 4697, 13, 50256] [72, 804, 257, 1310, 1643, 38427, 13, 50256]
[72, 804, 257, 1310, 1643, 38427, 13, 50256] [4480, 530, 2769, 8033, 50256, 50256, 50256, 50256]
[4480, 530, 2769, 8033, 50256] [392, 530, 1263, 2239, 50256]
[392, 530, 1263, 2239, 50256, 50256, 50256, 50256] [72, 1445, 257, 1310, 1643, 5699, 13, 50256]
[72, 1445, 257, 1310, 1643, 5699, 13, 50256] [72, 1445, 257, 1310, 1643, 5699, 13, 50256]
[72, 1445, 257, 1310, 1643, 5699, 13, 50256] [1640, 3840, 6439, 13, 50256, 50256, 50256, 50256]
[1640, 3840, 6439, 13, 50256, 50256] [72, 4978, 616, 33769, 13, 50256]
[72, 4978, 616, 33769, 13, 50256, 50256] [72, 13112, 290, 781, 798, 13, 50256]
[72, 13112, 290, 781, 798, 13, 50256] [72, 760, 611, 22701, 338, 1611, 50256]
[72, 760, 611, 22701, 338, 1611, 50256, 50256, 50256, 50256] [72, 1053, 1392, 262, 133

[5171, 470, 1037, 502, 7471, 50256, 50256, 50256, 50256, 50256] [27218, 10953, 484, 1394, 319, 7463, 866, 319, 502, 50256]
[27218, 10953, 484, 1394, 319, 7463, 866, 319, 502, 50256] [27218, 10953, 484, 1394, 319, 7463, 866, 319, 502, 50256]
[27218, 10953, 484, 1394, 319, 7463, 866, 319, 502, 50256] [27218, 10953, 484, 1394, 319, 7463, 866, 319, 502, 50256]
[27218, 10953, 484, 1394, 319, 7463, 866, 319, 502, 50256] [27218, 10953, 484, 1839, 470, 307, 3750, 50256, 50256, 50256]
[27218, 10953, 484, 1839, 470, 307, 3750, 50256] [4480, 23608, 5667, 50256, 50256, 50256, 50256, 50256]
[7091, 338, 588, 257, 27223, 50256, 50256] [82, 1322, 981, 673, 1278, 1666, 50256]
[82, 1322, 981, 673, 1278, 1666, 50256] [76, 5241, 588, 257, 7850, 50256, 50256]
[76, 5241, 588, 257, 7850, 50256] [15344, 82, 981, 673, 15623, 50256]
[15344, 82, 981, 673, 15623, 50256, 50256] [78, 1219, 267, 1219, 11, 10194, 50256]
[78, 1219, 267, 1219, 11, 10194, 50256, 50256, 50256] [392, 788, 673, 2876, 1424, 534, 2612, 1497,

In [58]:
train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
loss = 0
epochs = 5
accumulating_batch_count = 0
batch_size = 16

In [74]:
for epoch in range(epochs):
    print(f"Training epoch {epoch}")
        
    for idx, entry in tqdm(enumerate(train_dataloader)):
        curr = entry[0]
        nxt = entry[1]
        
        outputs = model(curr, labels=nxt)
        loss = outputs[0]
        loss.backward()
        
        if (accumulating_batch_count % batch_size) == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()
            
        accumulating_batch_count += 1

Training epoch 0


1119it [04:38,  4.02it/s]


Training epoch 1


1119it [04:37,  4.03it/s]


Training epoch 2


1119it [04:36,  4.04it/s]


Training epoch 3


1119it [04:35,  4.06it/s]


Training epoch 4


1119it [04:36,  4.05it/s]


In [75]:
torch.save(model.state_dict(), "DALI-tuned-gpt-neo.pt")

In [64]:
lyrics?