<a href="https://colab.research.google.com/github/peteryushunli/rap_llm/blob/main/Fine_Tune_Classification_BERT_Model_with_LoRA_Hiphop_Lyircs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Processing the data (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece] transformers[torch] peft loralib

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.6.0-py3-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.9/134.9 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl (10 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K   

In [15]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new
batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from datasets import load_dataset

raw_datasets = load_dataset("brunokreiner/genius-lyrics")
raw_datasets

Downloading readme:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/663M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url'],
        num_rows: 480855
    })
})

In [4]:
def is_genres_list_not_null(example):
    return example['genres_list'] is not None and example['genres_list'] != ''

# Use the filter method
filtered_dataset = raw_datasets['train'].filter(is_genres_list_not_null)

def set_genre_binary(example):
    if 'hip hop' in example['genres_list'] or 'rap' in example['genres_list']:
        example['genre_binary'] = 'hip-hop'
    else:
        example['genre_binary'] = 'not hip-hop'
    return example

# Use the .map method to add the new column
hiphop_dataset = filtered_dataset.map(set_genre_binary, remove_columns=['Unnamed: 0'])

Filter:   0%|          | 0/480855 [00:00<?, ? examples/s]

Map:   0%|          | 0/49985 [00:00<?, ? examples/s]

In [5]:
import datasets
# 90% train, 20% test + validation
train_testvalid = hiphop_dataset.train_test_split(test_size=0.2)
# Split the 20% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
# consolidate
dataset = datasets.DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url', 'genre_binary'],
        num_rows: 39988
    })
    test: Dataset({
        features: ['id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url', 'genre_binary'],
        num_rows: 4999
    })
    valid: Dataset({
        features: ['id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url', 'genre_binary'],
        num_rows: 4998
    })
})

In [6]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#tokenized_lyric = tokenizer(raw_train_dataset[4]["lyrics"])
#print(raw_train_dataset[4]["lyrics"])

In [7]:
def tokenize_function(example):
    tokenized_lyrics = tokenizer(example["lyrics"], padding = True, truncation=True)
    example['input_ids'] = tokenized_lyrics['input_ids']
    example['attention_mask'] = tokenized_lyrics['attention_mask']

    # Convert genre labels to integers
    example['labels'] = [0 if genre == 'not hip-hop' else 1 for genre in example['genre_binary']]
    return example

In [8]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/39988 [00:00<?, ? examples/s]

Map:   0%|          | 0/4999 [00:00<?, ? examples/s]

Map:   0%|          | 0/4998 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url', 'genre_binary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 39988
    })
    test: Dataset({
        features: ['id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url', 'genre_binary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4999
    })
    valid: Dataset({
        features: ['id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url', 'genre_binary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4998
    })
})

In [9]:
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url', 'genre_binary'])

In [10]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['valid'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (39988, 3)
Validation: (4998, 3)
Test: (4999, 3)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 39988
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4999
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4998
    })
})


### Fine-Tune the Model

In [11]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
directory = '/content/drive/MyDrive/hiphop-classification-model'

from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',           # output directory
    num_train_epochs=10,               # total number of training epochs
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=64,    # batch size for evaluation
    warmup_steps=500,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    logging_dir='./logs',             # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",      # Evaluation is done at the end of each epoch.
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

### Fine-Tune with LoRA

In [12]:
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=1, lora_alpha=1, lora_dropout=0.1
)

In [17]:
from peft import get_peft_model
import torch

# Load the model and tokenizer
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
peft_model = get_peft_model(model, lora_config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
from transformers import TrainingArguments, Trainer
import time

output_dir = f'./peft-lyric-classification-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=64,)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    compute_metrics=compute_metrics,

)

In [19]:
peft_trainer.train()

peft_model_path="./peft-lyric-classification-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2379,0.255036,0.914766,0.626316,0.836066,0.500701
2,0.2328,0.247406,0.920768,0.662692,0.843818,0.545582
3,0.2305,0.236406,0.92397,0.691558,0.820809,0.597475
4,0.2194,0.239411,0.923169,0.682119,0.832323,0.57784
5,0.2327,0.235315,0.92397,0.690049,0.824561,0.593268


('./peft-lyric-classification-checkpoint-local/tokenizer_config.json',
 './peft-lyric-classification-checkpoint-local/special_tokens_map.json',
 './peft-lyric-classification-checkpoint-local/vocab.txt',
 './peft-lyric-classification-checkpoint-local/added_tokens.json',
 './peft-lyric-classification-checkpoint-local/tokenizer.json')

#### Evaluate the Model

In [28]:
from tqdm.auto import tqdm

# Place the model in evaluation mode
model.eval()

predictions = []
true_labels = []

for i in tqdm(range(len(test_dataset)), desc="Evaluating"):
    # Get the test data sample by sample
    sample = test_dataset[i]
    input_ids = torch.tensor(sample['input_ids']).unsqueeze(0).to(model.device)
    attention_mask = torch.tensor(sample['attention_mask']).unsqueeze(0).to(model.device)
    labels = torch.tensor(sample['labels']).unsqueeze(0).to(model.device)

    true_labels.append(labels.cpu().numpy())

    # Predict
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predictions.append(logits.argmax(-1).cpu().numpy())

Evaluating:   0%|          | 0/4999 [00:00<?, ?it/s]

NameError: ignored

In [29]:
import numpy as np

# Flatten the lists
predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)

# Now you can compute the metrics
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
acc = accuracy_score(true_labels, predictions)

print(f'Accuracy: {acc}')
print(f'F1 Score: {f1}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

Accuracy: 0.9217843568713743
F1 Score: 0.6947697111631538
Precision: 0.8210332103321033
Recall: 0.6021650879566982


### Share the Model to HuggingFace

In [32]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [34]:
!git config --global user.email "peteryushunli@gmail.com"
!git config --global user.name "Peter Li"

In [35]:
model.push_to_hub("bert-base-uncased-hiphoplyric-classification-LoRA")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/peteryushunli/bert-base-uncased-hiphoplyric-classification-LoRA/commit/bc65bca2444c0a47adcdc1d1ec850206624bcb37', commit_message='Upload BertForSequenceClassification', commit_description='', oid='bc65bca2444c0a47adcdc1d1ec850206624bcb37', pr_url=None, pr_revision=None, pr_num=None)

### Make a prediction

In [21]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Your new input data
texts = ["First-person shooter mode, we turnin' your song to a funeral. To them niggas that say they wan' off us, you better be talkin' 'bout workin' in cubicles",
         "I'm lettin' it rock 'cause I love the mystique. I still wanna get me a song with YB. Can't trust everything that you saw on IG. Just know if I diss you, I'd make sure you know that I hit you like I'm on your caller ID"]

# Tokenize the input texts
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Make sure you move your input tensors to the same device as the model
inputs = {key: value.to(model.device) for key, value in inputs.items()}

# Get predictions
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

# Convert predictions to probabilities using softmax
probabilities = torch.nn.functional.softmax(predictions, dim=-1)

# Get the predicted class (0 or 1) based on the probabilities
predicted_class = torch.argmax(probabilities, dim=-1)

# Convert the predictions to a list for further processing or inspection
predicted_class = predicted_class.tolist()

# Output the predicted class
for text, pred_class in zip(texts, predicted_class):
    print(f"Text: '{text}' - Predicted class: {pred_class}")


Text: 'First-person shooter mode, we turnin' your song to a funeral. To them niggas that say they wan' off us, you better be talkin' 'bout workin' in cubicles' - Predicted class: 0
Text: 'I'm lettin' it rock 'cause I love the mystique. I still wanna get me a song with YB. Can't trust everything that you saw on IG. Just know if I diss you, I'd make sure you know that I hit you like I'm on your caller ID' - Predicted class: 0
