In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
import numpy as np

# Load the tokenizer and masked-LM model from the "google/muril-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
model = AutoModelForMaskedLM.from_pretrained("google/muril-base-cased")
# NOTE(review): this path is resolved against the current working directory,
# while a later cell reads the same file via os.path.join("..", "data", ...).
# Confirm which location is correct.
df = pd.read_csv('data/lyrics_cleaned_labelled.csv')

def tokenize_lyrics(lyrics:str, max_length:int=128):
    """Tokenize one lyrics string into a fixed-length encoding.

    Args:
        lyrics: The lyrics text to tokenize.
        max_length: Length every encoding is truncated or padded to.

    Returns:
        The tokenizer's output for ``lyrics`` (indexable by 'input_ids',
        'attention_mask', ...).
    """
    # This function is called with one string at a time. With a single
    # sequence, padding=True ("pad to the longest in the batch") adds no
    # padding, which leaves rows with different lengths. Padding to
    # max_length gives every row the same length so rows can be stacked.
    return tokenizer(lyrics, padding='max_length', truncation=True, max_length=max_length)

def encode_lyrics(tokenized_lyrics):
    """Keep only the 'input_ids' and 'attention_mask' entries of a tokenizer output.

    Args:
        tokenized_lyrics: Mapping-like tokenizer output (not a string) that can
            be indexed by 'input_ids' and 'attention_mask'.

    Returns:
        A plain dict with exactly the keys 'input_ids' and 'attention_mask'.

    Raises:
        KeyError: If either key is missing from ``tokenized_lyrics``.
    """
    return {
        'input_ids': tokenized_lyrics['input_ids'],
        'attention_mask': tokenized_lyrics['attention_mask'],
    }

# Build one Python object per row with plain list comprehensions.
# np.vectorize is avoided here: it infers an output dtype from the first result
# and coerces all results into a NumPy array, which is not reliable for
# dict-like objects such as tokenizer outputs.
df['tokenized_lyrics'] = [tokenize_lyrics(lyrics) for lyrics in df['cleaned_lyrics']]
df['encoded_lyrics'] = [encode_lyrics(tokens) for tokens in df['tokenized_lyrics']]


In [3]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
import os

# Load the tokenizer and masked-LM model from the "google/muril-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
model = AutoModelForMaskedLM.from_pretrained("google/muril-base-cased")
# The CSV lives in a "data" directory one level above the working directory.
data_path = os.path.join("..", "data","lyrics_cleaned_labelled.csv")
df = pd.read_csv(data_path)
# Display the loaded frame as the cell output.
df

Some weights of the model checkpoint at google/muril-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,artist,title,cleaned_lyrics,mood
0,Aastha Gill,Kamariya,seeli peeli pa balam tod bharam chhod ghabrana...,Energetic
1,Aastha Gill,Hermosa,aauga kehnde barcelona ton utri ae dass mainu ...,Energetic
2,Aastha Gill,Naagin,long tara sana paradasa long tara sana paradas...,Energetic
3,Aastha Gill,Video Bana De,kamara kamaravala vadaya da ra ma lagada aja a...,Happy
4,Aastha Gill,Saara India,mixsingh house house house sana madaa va ha ka...,Happy
...,...,...,...,...
1049,Vishal Dadlani,Good Morning,phararata phana khararata mara utha ra utha pa...,Energetic
1050,Vishal Dadlani,Womaniya,jaya hanamana janana gana sagara jaya kapasa t...,Romantic
1051,Vishal Dadlani,Aadat Hai Voh,kaba lafafa ma qasasa ma kataba ma farasata ma...,Romantic
1052,Vishal Dadlani,Aao Na,jala jala jala daya jala zadaga oo jala jala j...,Romantic


In [8]:
def tokenize_lyrics(lyrics:str, max_length:int=128):
    """Tokenize one lyrics string into a fixed-length encoding.

    Args:
        lyrics: The lyrics text to tokenize.
        max_length: Length every encoding is truncated or padded to.

    Returns:
        The tokenizer's output for ``lyrics`` (indexable by 'input_ids',
        'token_type_ids' and 'attention_mask').
    """
    # This function is called with one string at a time. With a single
    # sequence, padding=True ("pad to the longest in the batch") adds no
    # padding, which leaves rows with different lengths. Padding to
    # max_length gives every row the same length so rows can be stacked.
    return tokenizer(lyrics, padding='max_length', truncation=True, max_length=max_length)

# Run the tokenizer over every row of 'cleaned_lyrics' and keep one result per song.
df['tokenized_lyrics'] = list(map(tokenize_lyrics, df['cleaned_lyrics']))
# Show the new column as the cell output.
df['tokenized_lyrics']

0       [input_ids, token_type_ids, attention_mask]
1       [input_ids, token_type_ids, attention_mask]
2       [input_ids, token_type_ids, attention_mask]
3       [input_ids, token_type_ids, attention_mask]
4       [input_ids, token_type_ids, attention_mask]
                           ...                     
1049    [input_ids, token_type_ids, attention_mask]
1050    [input_ids, token_type_ids, attention_mask]
1051    [input_ids, token_type_ids, attention_mask]
1052    [input_ids, token_type_ids, attention_mask]
1053    [input_ids, token_type_ids, attention_mask]
Name: tokenized_lyrics, Length: 1054, dtype: object

In [9]:
def encode_lyrics(tokenized_lyrics):
    """Reduce a tokenizer output to a plain dict of 'input_ids' and 'attention_mask'.

    Any other entries of ``tokenized_lyrics`` (e.g. 'token_type_ids') are dropped.
    """
    wanted = ('input_ids', 'attention_mask')
    return {field: tokenized_lyrics[field] for field in wanted}

# Strip each tokenizer output down to the two fields kept by encode_lyrics.
df['encoded_lyrics'] = list(map(encode_lyrics, df['tokenized_lyrics']))
# Show the new column as the cell output.
df['encoded_lyrics']

0       {'input_ids': [104, 2455, 3547, 175240, 1206, ...
1       {'input_ids': [104, 178675, 4057, 1159, 26871,...
2       {'input_ids': [104, 2650, 10007, 17981, 1192, ...
3       {'input_ids': [104, 105786, 2219, 105786, 7093...
4       {'input_ids': [104, 22351, 99807, 2949, 2949, ...
                              ...                        
1049    {'input_ids': [104, 182093, 8747, 45540, 18209...
1050    {'input_ids': [104, 3613, 1942, 10596, 30407, ...
1051    {'input_ids': [104, 60971, 1192, 6180, 19838, ...
1052    {'input_ids': [104, 26170, 1192, 26170, 1192, ...
1053    {'input_ids': [104, 176, 3373, 95255, 108181, ...
Name: encoded_lyrics, Length: 1054, dtype: object

In [11]:
from sklearn.preprocessing import LabelEncoder
# Learn the set of mood names, then replace each name with its integer code.
labelencoder = LabelEncoder()
labelencoder.fit(df['mood'])
# The 'mood' column is overwritten in place. Re-running this cell would refit
# the encoder on the integer codes rather than on the original mood names.
df['mood'] = labelencoder.transform(df['mood'])
df['mood']

0       2
1       2
2       2
3       3
4       3
       ..
1049    2
1050    6
1051    6
1052    6
1053    1
Name: mood, Length: 1054, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of all rows as the test set, keeping mood proportions equal.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['mood'],
    random_state=42,
)
# Carve 10% of the remaining rows out as validation data, which gives roughly
# 72% train / 8% validation / 20% test overall.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    stratify=train_df['mood'],
    random_state=42,
)

In [15]:
import torch
from torch.utils.data import Dataset

class LyricsDataset(Dataset):
    """Map-style dataset yielding token ids, attention mask and mood label per song.

    Args:
        df: DataFrame with an 'encoded_lyrics' column whose values are dicts
            holding 'input_ids' and 'attention_mask', and a 'mood' column of
            integer class labels.
    """

    def __init__(self, df):
        encodings = df['encoded_lyrics'].tolist()
        # Store exactly one entry per row. Wrapping these lists in an extra
        # list would make len() report 1 and put every row under index 0.
        self.input_ids = [enc['input_ids'] for enc in encodings]
        self.attention_mask = [enc['attention_mask'] for enc in encodings]
        # tolist must be called: the bound method itself cannot be indexed.
        self.labels = df['mood'].tolist()

    def __getitem__(self, x):
        """Return the sample at position ``x`` as a dict of ``torch.long`` tensors."""
        return {
            'input_ids': torch.tensor(self.input_ids[x], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[x], dtype=torch.long),
            'labels': torch.tensor(self.labels[x], dtype=torch.long)
        }

    def __len__(self):
        """Return the number of samples, one per row of the source DataFrame."""
        return len(self.input_ids)

# Wrap each split in a LyricsDataset, in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    LyricsDataset(split) for split in (train_df, val_df, test_df)
)