# Preprocessing

In [18]:
# Load the lyrics dataset from disk and preview its first rows
import pandas as pd
df = pd.read_csv(filepath_or_buffer='final_dataset.csv')
df.head(n=5)

Unnamed: 0,seeds,lyrics
0,9,"farewell friend, our ties no longer mend this ..."
1,1,we in the building yeah let's go king push the...
2,5,"day that i met you, girl i knew that it was so..."
3,7,living off of what you taught me only what's o...
4,1,there was once in days of yore and in ages and...


In [19]:
# The model is later called with the batch columns as keyword arguments,
# so the target column has to be named `labels`
df = df.rename({'seeds': 'labels'}, axis='columns')

In [20]:
# Make sure every lyric is a Python string before the text-cleaning helpers run
df = df.astype({'lyrics': str})

In [21]:
# Remove stopwords and stem the words using NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the stop-word corpus (reported as already up-to-date when present locally)
nltk.download('stopwords')

# A set keeps the per-token membership test cheap while filtering
stop_words = {*stopwords.words('english')}
ps = PorterStemmer()

def remove_stopwords_and_stem(text):
    """Lowercase ``text``, drop stop words, and stem what remains.

    The text is split on whitespace; every token that is not in the
    module-level ``stop_words`` set is stemmed with the module-level
    ``ps`` stemmer.

    Args:
        text: The string to clean.

    Returns:
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    kept_stems = []
    for token in text.lower().split():
        if token in stop_words:
            continue
        kept_stems.append(ps.stem(token))
    return ' '.join(kept_stems)

# Remove numbers and special characters
import re
def remove_numbers_and_special_characters(text):
    """Strip non-word characters and digits from ``text``.

    Each run of non-word characters (anything outside letters, digits and
    underscore) is collapsed into a single space; afterwards every run of
    digits is deleted outright, which can leave consecutive spaces behind.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_symbols = re.sub(r'\W+', ' ', text)
    return re.sub(r'\d+', '', without_symbols)

# Clean in two passes: strip symbols/digits first, then drop stop words and stem
df['lyrics'] = (
    df['lyrics']
    .apply(remove_numbers_and_special_characters)
    .apply(remove_stopwords_and_stem)
)

df

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mertbektas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,labels,lyrics
0,9,farewel friend tie longer mend tale come end m...
1,1,build yeah let go king push think know someth ...
2,5,day met girl knew someth special put finger fu...
3,7,live taught top fail see underneath want under...
4,1,day yore age time long gone part cairo merchan...
...,...,...
67,1,vishnuh pervad everywher vashatkaarah invok ob...
68,3,continu journey get eardrum straight pay atten...
69,8,anoth dream blown bit tryin live learn let go ...
70,6,albin lee meldau american aquarium thing chang...


In [22]:
# print(df['labels'].value_counts())

0    11
5    10
8     9
3     8
2     7
1     6
6     6
9     5
7     5
4     5
Name: labels, dtype: int64


In [23]:
# # Print numbers of labels 
# print(df['labels'].value_counts().count())

# # Print number of rows with labels of 10
# print(df[df['labels'] == 10].count())

10
labels    0
lyrics    0
dtype: int64


In [24]:
# # Removing the 10th label for now
# df = df[df['labels'] != 10]

In [25]:
# Hold out 20% of the rows, then halve that hold-out into validation and test
from sklearn.model_selection import train_test_split

train, temp = train_test_split(df, test_size=0.2, random_state=2018)

val, test = train_test_split(temp, test_size=0.5, random_state=2018)

                                                                        

In [27]:
# Import the classes directly so the name `datasets` is only ever bound to the
# DatasetDict built below instead of first naming the module and then the data.
from datasets import Dataset, DatasetDict

# Wrap each pandas split in a HuggingFace Dataset
train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(val)
test_dataset = Dataset.from_pandas(test)

# Bundle the three splits; later cells call `datasets.map(...)` on this object
datasets = DatasetDict(
    train=train_dataset,
    validation=val_dataset,
    test=test_dataset,
)

In [28]:
from transformers import AutoTokenizer

# Tokenizer that matches the `bert-base-cased` checkpoint.
# NOTE(review): the lyrics were lowercased during cleaning, yet this is the
# cased checkpoint — confirm that combination is intended.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='bert-base-cased')

In [30]:
def tokenize_function(example):
    """Tokenize a batch of lyrics, truncating to the tokenizer's maximum length.

    Padding is deliberately not applied here. The notebook builds a
    ``DataCollatorWithPadding`` for the data loaders, which pads each batch
    to the length of its longest sequence; padding every row to the maximum
    length at this stage would make that dynamic padding a no-op and force
    every batch to the full 512 tokens.

    Args:
        example: A batch from ``Dataset.map(..., batched=True)`` holding the
            texts under the ``'lyrics'`` key.

    Returns:
        The tokenizer output for the batch (unpadded, truncated encodings).
    """
    return tokenizer(example['lyrics'], truncation=True)

In [31]:
# Tokenize every split in batches; the new columns are added next to the existing ones
tokenized_datasets = datasets.map(function=tokenize_function, batched=True)
tokenized_datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'lyrics', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 57
    })
    validation: Dataset({
        features: ['labels', 'lyrics', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7
    })
    test: Dataset({
        features: ['labels', 'lyrics', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8
    })
})

In [34]:
# `from_pandas` carried the DataFrame index over as `__index_level_0__`; remove it
tokenized_datasets = tokenized_datasets.remove_columns(['__index_level_0__'])

In [35]:
# Display the DatasetDict to confirm the index column is gone from every split
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'lyrics', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 57
    })
    validation: Dataset({
        features: ['labels', 'lyrics', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7
    })
    test: Dataset({
        features: ['labels', 'lyrics', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8
    })
})

In [36]:
# Collator that pads the examples of a batch to a common length when the
# batch is assembled (dynamic padding)
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)

In [37]:
# Drop the raw text column and have the datasets hand back torch tensors
tokenized_datasets = tokenized_datasets.remove_columns('lyrics')
tokenized_datasets.set_format(type='torch')
tokenized_datasets['train'].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [38]:
# One loader per split; only the training loader shuffles its examples
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    dataset=tokenized_datasets['train'],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
val_dataloader = DataLoader(
    dataset=tokenized_datasets['validation'],
    batch_size=8,
    collate_fn=data_collator,
)
test_dataloader = DataLoader(
    dataset=tokenized_datasets['test'],
    batch_size=8,
    collate_fn=data_collator,
)

In [39]:
# Pull a single batch from the training loader and show each tensor's shape
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 512]),
 'token_type_ids': torch.Size([8, 512]),
 'attention_mask': torch.Size([8, 512])}

# Training

In [44]:
from torch import mode  # NOTE(review): not used in the visible cells; looks like an accidental auto-import
from transformers import AutoModelForSequenceClassification

# Size the classification head from the largest label id seen in ANY split.
# Counting the distinct labels of the training split alone under-sizes the
# head whenever a class is missing from train or the ids are not contiguous,
# and a label id >= num_labels breaks the loss computation.
num_labels = max(int(split['labels'].max()) for split in tokenized_datasets.values()) + 1

model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=num_labels,
)

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [50]:
# Freeze the embeddings and the first 5 encoder layers so only the upper
# layers (and the classification head) receive gradient updates
frozen_parts = (model.bert.embeddings, model.bert.encoder.layer[:5])
for weight in (p for part in frozen_parts for p in part.parameters()):
    weight.requires_grad = False

In [52]:
# List every parameter with its trainable flag to verify the freezing above
for param_name, tensor in model.named_parameters():
    print(param_name, tensor.requires_grad)

bert.embeddings.word_embeddings.weight False
bert.embeddings.position_embeddings.weight False
bert.embeddings.token_type_embeddings.weight False
bert.embeddings.LayerNorm.weight False
bert.embeddings.LayerNorm.bias False
bert.encoder.layer.0.attention.self.query.weight False
bert.encoder.layer.0.attention.self.query.bias False
bert.encoder.layer.0.attention.self.key.weight False
bert.encoder.layer.0.attention.self.key.bias False
bert.encoder.layer.0.attention.self.value.weight False
bert.encoder.layer.0.attention.self.value.bias False
bert.encoder.layer.0.attention.output.dense.weight False
bert.encoder.layer.0.attention.output.dense.bias False
bert.encoder.layer.0.attention.output.LayerNorm.weight False
bert.encoder.layer.0.attention.output.LayerNorm.bias False
bert.encoder.layer.0.intermediate.dense.weight False
bert.encoder.layer.0.intermediate.dense.bias False
bert.encoder.layer.0.output.dense.weight False
bert.encoder.layer.0.output.dense.bias False
bert.encoder.layer.0.output.Lay

In [53]:
# Optimization
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation
# (and is no longer exported by recent transformers releases), so import the
# optimizer from torch instead.
from torch.optim import AdamW

# weight_decay=0.0 keeps the default of the deprecated transformers optimizer;
# torch.optim.AdamW would otherwise apply its own default of 0.01.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)



In [54]:
# Scheduler
from transformers import get_scheduler

num_epochs = 1
# The scheduler is stepped once per batch, so its horizon is batches * epochs
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

8


In [55]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
device

device(type='cpu')

In [None]:
# Training loop
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the device the model lives on
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        # The batch carries `labels`, so the loss is read off the model output
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # Reset gradients so they do not accumulate into the next step
        optimizer.zero_grad()
        progress_bar.update()

In [None]:
# Evaluation loop
import evaluate
import torch

accuracy = evaluate.load('accuracy')
model.eval()
for batch in val_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed for inference
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    # Predicted class = index of the largest logit
    predictions = torch.argmax(logits, dim=-1)
    # A metric object is not callable: accumulate each batch with add_batch
    # and compute the final score once every batch has been seen.
    accuracy.add_batch(predictions=predictions, references=batch['labels'])

# Validation accuracy over all accumulated batches
accuracy.compute()


In [None]:
# Trainer
import evaluate
from transformers import Trainer, TrainingArguments

accuracy_metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Turn the Trainer's evaluation output into an accuracy dictionary.

    ``Trainer`` calls this with an object that unpacks into the model
    logits and the reference labels, and it expects a dict of metric
    names to values back — a metric object cannot be passed directly.

    Args:
        eval_pred: Evaluation output that unpacks into ``(logits, labels)``.

    Returns:
        The dict produced by the accuracy metric's ``compute``.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit for each example
    predictions = logits.argmax(axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)


training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# NOTE(review): `model` is the same object the manual training loop updates,
# so if that loop was run first the Trainer continues from those weights —
# confirm that is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Score the trained model on the held-out test split

trainer.evaluate(eval_dataset=tokenized_datasets['test'])