Tokenization Plan:
---

1.  Tokenize lyrics using MurIL's WordPiece Tokenization Algorithm

2.  Truncate from the middle if longer than 512 tokens  
    a.  Calculate the midpoint of the lyrics and truncate from there    
    b.  Split the song into two parts (before and after the middle) and keep only the middle section, which gives us context without losing too much

3.  Pad shorter lyrics with [PAD]

4.  Sort by length before batching

5.  Define dataset and tokenization pipeline

6.  Set up the DataLoader and feed it to the model for training     
a. Allows us to shuffle the data    
b. Splits data into batches     
c. When using a custom collate_fn, it can handle dynamic padding by ensuring that each batch is padded to the correct length    
d. Loads data in parallel (multithreading)  

In [1]:
import os
import pandas as pd

root = os.path.abspath(os.path.join(os.getcwd(), ".."))
file_path = os.path.join(root, "data/lyrics/lyrics_cleaned_labelled_gcp.csv")

df = pd.read_csv(file_path)
df

Unnamed: 0,artist,title,cleaned_lyrics,mood
0,Aastha Gill,Kamariya,आप एक अच्छे व्यक्ति हैं। तोद् भरम् छोद् घब्रन ...,Energetic
1,Aastha Gill,Hermosa,आउग केह्न्दे बर्चेलोन तोन् आप और देते हैं? बब्...,Energetic
2,Aastha Gill,Naagin,लंबा गया है... तेरी बिन सुने परदेसी लंबा गया ह...,Energetic
3,Aastha Gill,Video Bana De,केमरे केमरेवाले वीडियो बना मैं लगदी आज आग सोह्...,Energetic
4,Aastha Gill,Saara India,"मिक्ससिंह घर में, घर में, घर में सुनो दोस्तों,...",Happy
...,...,...,...,...
1078,Vishal Dadlani,Good Morning,फर्राटे वाला फैन खर्राटे मारे मैन उठ उठ प्यारे...,Happy
1079,Vishal Dadlani,Womaniya,जय हनुमान ज्ञान गुण सागर जय कपीस तिहु लोक उजाग...,Energetic
1080,Vishal Dadlani,Aadat Hai Voh,"ख़ाबों लिफ़ाफ़ों में, क़िस्सों में, किताबों में फ़ु...",Romantic
1081,Vishal Dadlani,Aao Na,"जले, जले, जले, दीये जल सारी ज़िंदगी, ओ-ओ जले, ज...",Energetic


In [None]:
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained("google/muril-base-cased")

In [3]:
specific_entry = df.iloc[6]
lyrics = specific_entry['cleaned_lyrics']
artist = specific_entry['artist']
title = specific_entry['title']
print(f"'{title}' by {artist}: {lyrics}")

'Kuch Kuch Hota Hai (Sad Version)' by Alka Yagnik: जान-ए-वफ़ा होके बेकरार जान-ए-वफ़ा होके बेकरार बरसों किया मैंने इंतज़ार पर कभी तूने नहीं सब कहा, अब दिल बेबसी में चुपके रोता करूँ, हाय, कुछ-कुछ करूँ, हाय, कुछ-कुछ


In [16]:
tokenized_lyrics = tokenizer(text=lyrics, padding=False, truncation=False)['input_ids']
print(tokenized_lyrics)

[104, 7593, 120, 426, 120, 461, 55836, 1218, 1162, 5461, 128940, 5133, 7593, 120, 426, 120, 461, 55836, 1218, 1162, 5461, 128940, 5133, 178788, 1219, 6226, 184626, 1397, 157695, 1154, 2893, 13865, 1530, 1254, 4297, 1539, 119, 1913, 9372, 40486, 143107, 1114, 145629, 59843, 1582, 173432, 119, 55873, 119, 1508, 120, 1508, 173432, 119, 55873, 119, 1508, 120, 1508, 105]


In [17]:
def truncate_middle(tokens, max_len=512):
    if len(tokens) > 512:
        midpoint = max_len//2
        left, right = max(0,midpoint-max_len//2), min(len(tokens),midpoint+max_len//2)
        tokens = tokens[left:right]
    
    return tokens

tokenized_lyrics = truncate_middle(tokenized_lyrics)
print(tokenized_lyrics)

[104, 7593, 120, 426, 120, 461, 55836, 1218, 1162, 5461, 128940, 5133, 7593, 120, 426, 120, 461, 55836, 1218, 1162, 5461, 128940, 5133, 178788, 1219, 6226, 184626, 1397, 157695, 1154, 2893, 13865, 1530, 1254, 4297, 1539, 119, 1913, 9372, 40486, 143107, 1114, 145629, 59843, 1582, 173432, 119, 55873, 119, 1508, 120, 1508, 173432, 119, 55873, 119, 1508, 120, 1508, 105]


In [18]:
def pad_sequence(tokens, max_len=512):
    padded_tokens = tokens + [tokenizer.pad_token_id]*(max_len-len(tokens))
    return padded_tokens

tokenized_lyrics = pad_sequence(tokenized_lyrics)
print(tokenized_lyrics)

[104, 7593, 120, 426, 120, 461, 55836, 1218, 1162, 5461, 128940, 5133, 7593, 120, 426, 120, 461, 55836, 1218, 1162, 5461, 128940, 5133, 178788, 1219, 6226, 184626, 1397, 157695, 1154, 2893, 13865, 1530, 1254, 4297, 1539, 119, 1913, 9372, 40486, 143107, 1114, 145629, 59843, 1582, 173432, 119, 55873, 119, 1508, 120, 1508, 173432, 119, 55873, 119, 1508, 120, 1508, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [27]:
import numpy as np
def tokenize_and_len(row):
    tokens = tokenizer(row['cleaned_lyrics'])['input_ids']
    tokens = truncate_middle(tokens)
    return tokens, len(tokens)

df[['tokenized_lyrics', 'token_len']] = df.apply(tokenize_and_len, axis=1, result_type="expand")
df

Unnamed: 0,artist,title,cleaned_lyrics,mood,tokenized_lyrics,token_len
0,Aastha Gill,Kamariya,आप एक अच्छे व्यक्ति हैं। तोद् भरम् छोद् घब्रन ...,Energetic,"[104, 1840, 1127, 10814, 2990, 1145, 492, 1273...",324
1,Aastha Gill,Hermosa,आउग केह्न्दे बर्चेलोन तोन् आप और देते हैं? बब्...,Energetic,"[104, 161032, 3805, 1110, 128784, 135742, 5492...",363
2,Aastha Gill,Naagin,लंबा गया है... तेरी बिन सुने परदेसी लंबा गया ह...,Energetic,"[104, 36069, 1258, 1115, 121, 121, 121, 51336,...",355
3,Aastha Gill,Video Bana De,केमरे केमरेवाले वीडियो बना मैं लगदी आज आग सोह्...,Energetic,"[104, 1110, 31459, 1216, 1110, 31459, 58244, 1...",445
4,Aastha Gill,Saara India,"मिक्ससिंह घर में, घर में, घर में सुनो दोस्तों,...",Happy,"[104, 103842, 9029, 2181, 1114, 119, 2181, 111...",257
...,...,...,...,...,...,...
1078,Vishal Dadlani,Good Morning,फर्राटे वाला फैन खर्राटे मारे मैन उठ उठ प्यारे...,Happy,"[104, 58238, 16971, 14406, 3399, 118571, 18444...",251
1079,Vishal Dadlani,Womaniya,जय हनुमान ज्ञान गुण सागर जय कपीस तिहु लोक उजाग...,Energetic,"[104, 11258, 15718, 4548, 8854, 9611, 11258, 6...",125
1080,Vishal Dadlani,Aadat Hai Voh,"ख़ाबों लिफ़ाफ़ों में, क़िस्सों में, किताबों में फ़ु...",Romantic,"[104, 487, 37591, 1325, 28167, 55836, 1218, 55...",222
1081,Vishal Dadlani,Aao Na,"जले, जले, जले, दीये जल सारी ज़िंदगी, ओ-ओ जले, ज...",Energetic,"[104, 58867, 119, 58867, 119, 58867, 119, 2095...",314


In [28]:
df.sort_values(by='token_len', ascending=True, inplace=True)
df

Unnamed: 0,artist,title,cleaned_lyrics,mood,tokenized_lyrics,token_len
212,Asha Bhosle,Haye Haye Tera Jawab,हाय हाय तेरा जवाब ऐसा सबब तौबा जलवा तेरा हाय ह...,Romantic,"[104, 55873, 55873, 40844, 11399, 3102, 154738...",34
631,Mahendra Kapoor,Jahan Daal Daal Pe,जहाँ डाल-डाल पर सोने की चिड़िया करती बसेरा भारत...,Happy,"[104, 4404, 17337, 120, 17337, 1154, 18013, 11...",50
1041,Udit Narayan,Majhi Dai Le,माझी दाईले जुनिभरि डुंगा तारिरहे माझी दाईले जु...,Happy,"[104, 26659, 127014, 1234, 3180, 98611, 20970,...",51
244,Atif Aslam,Jab koi baat,बात् बिगद् जये मुश्क़िल् पद् जये देन हुमनावा 2 ...,Romantic,"[104, 2231, 1343, 18561, 18250, 11258, 1216, 1...",54
838,Shaan,Fanaa For You (Chand Sifarish Club Mix),दिल् फ़न दिल् तबह् उद्ने यह...2 अगल-बगल...3 सच ...,Energetic,"[104, 54343, 491, 1303, 54343, 3143, 98893, 52...",54
...,...,...,...,...,...,...
349,Dino James,Tandav,"देखो, अंदर हूँ शरीफ़ आदमी तभी गानों में हमेशा ह...",Energetic,"[104, 55447, 119, 10306, 7096, 462, 2546, 5583...",512
348,Dino James,Thanks A Lot,"पूज पाथ्, गुलाबी, उप्वस् मुझ्को दिल् रज़् केह्न...",Sad,"[104, 2790, 2369, 12530, 152875, 119, 35785, 1...",512
347,Dino James,Yaadein,नहीं पता साथ ग़लत हून् बेचैन् तुम शिकायत क्यों...,Sad,"[104, 1254, 4401, 1283, 140391, 56574, 1343, 1...",512
363,Dino James,D N Me,आपको पता समय हुआ तत्त-तत्त-त-त्रा आकश् दो! तुम...,Energetic,"[104, 3079, 4401, 1541, 1648, 11219, 1397, 120...",512


In [29]:
import torch
from torch.utils.data import Dataset, DataLoader

class LyricsDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_len=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        lyrics = self.dataset[idx]
        tokens = self.tokenizer(lyrics)['input_ids']
        tokens = truncate_middle(tokens, self.max_len)
        tokens = pad_sequence(tokens, self.max_len)
        return torch.tensor(tokens)

dataset = LyricsDataset(df, tokenizer, 512)
