In [1]:
from google.colab import drive
drive.mount('/content/drive')

!ls
!ls /content/drive/MyDrive/nlp_project

!pip install datasets
!pip install transformers
!pip install evaluate

import os
os.environ["WANDB_DISABLED"] = "true"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
drive  sample_data
1raw_lyrics_test_dataset.csv   raw_lyrics_test_dataset.gsheet
1raw_lyrics_train_dataset.csv  raw_lyrics_train_dataset.csv
cleaned_data.csv	       raw_lyrics_train_dataset.gsheet
raw_lyrics_test_dataset.csv    results
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-

In [2]:
from datasets import load_dataset
# CSV file backing each split of the dataset.
# NOTE(review): "train" reads cleaned_data.csv while "test" reads
# raw_lyrics_test_dataset.csv — confirm both files received the same cleaning.
split_files = {
    'train': '/content/drive/MyDrive/nlp_project/cleaned_data.csv',
    'test': '/content/drive/MyDrive/nlp_project/raw_lyrics_test_dataset.csv',
}
clickbait = load_dataset('csv', data_files=split_files)



Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [57]:
# Work on fixed-size subsets of the shuffled splits: the first 3000 shuffled
# training rows and the first 300 shuffled test rows. The fixed seed keeps the
# shuffle, and therefore the selected rows, the same on every run.
# `select` accepts a range directly, so no intermediate index list is built.
small_train_dataset = clickbait["train"].shuffle(seed=42).select(range(3000))
small_test_dataset = clickbait["test"].shuffle(seed=42).select(range(300))

In [58]:
# Show the text and the label of the first row of the training subset.
first_train_row = small_train_dataset[0]
print("Lyrics:", first_train_row["Lyrics"])
print("Genre_Label:", first_train_row["label"])

Lyrics: Lyrics[Verse 1] Well, when you go Don't ever think I'll make you try to stay And maybe when you get back I'll be off to find another way  [Pre-Chorus] And after all this time that you still owe You're still a good-for-nothing, I don't know So take your gloves and get out Better get out while you can  [Chorus] When you go, and would you even turn to say "I don't love you like I did yesterday"?  [Verse 2] Sometimes I cry so hard from pleading So sick and tired of all the needless beating But baby, when they knock you down and out Is where you oughta stay  [Pre-Chorus] And after all the blood that you still owe Another dollar's just another blow So fix your eyes and get up Better get up while you can, whoa, whoa, whoa   [Chorus] When you go, and would you even turn to say "I don't love you like I did yesterday"?  [Post-Chorus] Well, come on, come on [Guitar Solo]  [Bridge] When you go, would you have the guts to say "I don't love you like I loved you yesterday"?  [Outro] I don't l

In [59]:
from transformers import AutoTokenizer
# Load the tokenizer for the checkpoint named below.
tokenizer_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [60]:
def preprocess_function(examples):
    """Tokenize the "Lyrics" field of `examples` with truncation enabled.

    Args:
        examples: Mapping with a "Lyrics" entry holding the text to tokenize.

    Returns:
        The result of calling the module-level `tokenizer` on that text.
    """
    lyrics_text = examples["Lyrics"]
    encoded = tokenizer(lyrics_text, truncation=True)
    return encoded


In [61]:
# Tokenize both subsets with the default, non-batched map
# (batched=True was deliberately removed).
tokenized_small_train, tokenized_small_test = [
    subset.map(preprocess_function)
    for subset in (small_train_dataset, small_test_dataset)
]


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [63]:
# Inspect the first tokenized training example.

print(tokenized_small_train)

first_example = tokenized_small_train[0]
for caption, column in (
    ("Lyrics:", "Lyrics"),
    ("Genre_Label:", "label"),
    ("Input IDs:", "input_ids"),
    ("Attention Mask:", "attention_mask"),
):
    print(caption, first_example[column])

# Map the token IDs back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(first_example["input_ids"])
print("Tokenized text:", tokens)

Dataset({
    features: ['Lyrics', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3000
})
Lyrics: Lyrics[Verse 1] Well, when you go Don't ever think I'll make you try to stay And maybe when you get back I'll be off to find another way  [Pre-Chorus] And after all this time that you still owe You're still a good-for-nothing, I don't know So take your gloves and get out Better get out while you can  [Chorus] When you go, and would you even turn to say "I don't love you like I did yesterday"?  [Verse 2] Sometimes I cry so hard from pleading So sick and tired of all the needless beating But baby, when they knock you down and out Is where you oughta stay  [Pre-Chorus] And after all the blood that you still owe Another dollar's just another blow So fix your eyes and get up Better get up while you can, whoa, whoa, whoa   [Chorus] When you go, and would you even turn to say "I don't love you like I did yesterday"?  [Post-Chorus] Well, come on, come on [Guitar Solo]  [Bridge] When you go

In [64]:
from transformers import DataCollatorWithPadding
# Collator handed to the Trainer below; it is built around the same tokenizer
# that produced the input IDs.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [65]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Size of the classification head.
# NOTE(review): presumably the number of distinct genre labels in the data — confirm.
num_output_labels = 6
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_output_labels,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [79]:
import numpy as np
from evaluate import load


# Metric objects are kept here so each one is loaded only once per kernel
# session instead of on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 / recall / precision for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class of each row
            is the argmax of ``logits`` along the last axis.

    Returns:
        dict: The keys ``"accuracy"``, ``"f1"``, ``"recall"`` and ``"precision"``
        mapped to the corresponding metric values. F1, recall and precision use
        ``average="weighted"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    f1 = _get_metric("f1").compute(
        predictions=predictions, references=labels, average="weighted"
    )["f1"]
    recall = _get_metric("recall").compute(
        predictions=predictions, references=labels, average="weighted"
    )["recall"]
    precision = _get_metric("precision").compute(
        predictions=predictions, references=labels, average="weighted"
    )["precision"]

    return {"accuracy": accuracy, "f1": f1, "recall": recall, "precision": precision}

In [80]:
# Hyperparameters and output location for the fine-tuning run.
training_args = TrainingArguments(
    # Output directory for the run, on the mounted Drive.
    output_dir="/content/drive/MyDrive/nlp_project/results",
    # Run length and batch sizes.
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimizer settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    # No reporting integration is selected.
    report_to="none",
)

# Bundle the model, the tokenized subsets and the metric function into one Trainer.
trainer = Trainer(
    # What is trained, and with which settings.
    model=model,
    args=training_args,
    # Data and batching.
    train_dataset=tokenized_small_train,
    eval_dataset=tokenized_small_test,
    data_collator=data_collator,
    processing_class=tokenizer,
    # Metrics reported by trainer.evaluate().
    compute_metrics=compute_metrics,
)


In [81]:

# Debug view: print every field of the first tokenized training example.
print(tokenized_small_train[0])


{'Lyrics': 'Lyrics[Verse 1] Well, when you go Don\'t ever think I\'ll make you try to stay And maybe when you get back I\'ll be off to find another way  [Pre-Chorus] And after all this time that you still owe You\'re still a good-for-nothing, I don\'t know So take your gloves and get out Better get out while you can  [Chorus] When you go, and would you even turn to say "I don\'t love you like I did yesterday"?  [Verse 2] Sometimes I cry so hard from pleading So sick and tired of all the needless beating But baby, when they knock you down and out Is where you oughta stay  [Pre-Chorus] And after all the blood that you still owe Another dollar\'s just another blow So fix your eyes and get up Better get up while you can, whoa, whoa, whoa   [Chorus] When you go, and would you even turn to say "I don\'t love you like I did yesterday"?  [Post-Chorus] Well, come on, come on [Guitar Solo]  [Bridge] When you go, would you have the guts to say "I don\'t love you like I loved you yesterday"?  [Out

In [82]:
# Run fine-tuning with the settings in training_args.
trainer.train()

Step,Training Loss
500,0.9178


TrainOutput(global_step=940, training_loss=0.7255181008196891, metrics={'train_runtime': 703.9776, 'train_samples_per_second': 21.307, 'train_steps_per_second': 1.335, 'total_flos': 1987152721920000.0, 'train_loss': 0.7255181008196891, 'epoch': 5.0})

In [83]:
# Evaluate the trained model and print the resulting metrics dict
# (loss, the values returned by compute_metrics, and runtime statistics).
evaluation_results = trainer.evaluate()
print(evaluation_results)

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

{'eval_loss': 1.0188297033309937, 'eval_accuracy': 0.6466666666666666, 'eval_f1': 0.6423506031130395, 'eval_recall': 0.6466666666666666, 'eval_precision': 0.6433570351403975, 'eval_runtime': 10.052, 'eval_samples_per_second': 29.845, 'eval_steps_per_second': 1.89, 'epoch': 5.0}
