# Processing the data (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece]



In [2]:
pip install transformers[torch]



In [3]:
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same as before: load a tokenizer and a sequence-classification model from one checkpoint
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new: with labels in the batch, the forward pass also returns a loss
batch["labels"] = torch.tensor([1, 1])

# AdamW comes from torch.optim because the transformers.AdamW implementation is
# deprecated in favour of the PyTorch one; PyTorch's default hyperparameters apply.
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from datasets import load_dataset

# Hub identifier of the lyrics corpus loaded by this notebook
LYRICS_DATASET_ID = "brunokreiner/genius-lyrics"

raw_datasets = load_dataset(LYRICS_DATASET_ID)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url'],
        num_rows: 480855
    })
})

In [5]:
def is_genres_list_not_null(example):
    """Return True when the example's 'genres_list' is neither None nor an empty string."""
    genres = example['genres_list']
    if genres is None:
        return False
    return genres != ''

# Keep only the training rows for which the genre predicate holds
train_split = raw_datasets['train']
filtered_dataset = train_split.filter(is_genres_list_not_null)

def set_genre_binary(example):
    """Tag the example with a 'genre_binary' label derived from 'genres_list'.

    The label is 'hip-hop' when 'hip hop' or 'rap' is found in 'genres_list'
    (using Python's `in`), otherwise 'not hip-hop'. The example is updated in
    place and the same object is returned.
    """
    genres = example['genres_list']
    is_hiphop = any(keyword in genres for keyword in ('hip hop', 'rap'))
    example['genre_binary'] = 'hip-hop' if is_hiphop else 'not hip-hop'
    return example

# Add the binary genre column and drop the 'Unnamed: 0' column in the same pass
hiphop_dataset = filtered_dataset.map(
    set_genre_binary,
    remove_columns=['Unnamed: 0'],
)

In [6]:
import datasets

# Fixed seed so the shuffled splits are reproducible after a kernel restart
SPLIT_SEED = 42

# 80% train, 20% test + validation
train_testvalid = hiphop_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Split the 20% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# Consolidate the three splits into a single DatasetDict
dataset = datasets.DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url', 'genre_binary'],
        num_rows: 39988
    })
    test: Dataset({
        features: ['id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url', 'genre_binary'],
        num_rows: 4999
    })
    valid: Dataset({
        features: ['id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url', 'genre_binary'],
        num_rows: 4998
    })
})

In [7]:
# Look at the first training example before any tokenization
raw_train_dataset = dataset["train"]
first_example = raw_train_dataset[0]
first_example

{'id': '48YOrlKPBPS9XcaaSKmhzn',
 'lyrics': 'i lived in california i was working in manhattan my life was very busy yeah but nothing ever happened i wanted to make music and to show the world my soul so i moved down south where they still play rock and roll i always wanted summer i was tired of the snow i wanna find a lover where the sweet magnolias grow yeah there s something in the water on the shores of muscle shoals so i moved down south where they still play rock and roll i moved south where the grass is green i moved south where my friends take care of me yeah there s beauty in the valley and good people in the city but i moved down south where the music moves me yeah i moved down south where the music moves me i met up with my brothers and i made a couple more bought a van that smelled like armor all and we took it out on tour from atlanta to seattle we sang gospel and soul and we ll go to any city where they still play rock and roll and we go back home where the sweet magnolias

In [9]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize one sample lyric, then show the raw text followed by its encoding
sample_lyrics = raw_train_dataset[4]["lyrics"]
tokenized_lyric = tokenizer(sample_lyrics)
print(sample_lyrics)
tokenized_lyric

i wonder if its nice today im wishing that the clouds will stay kinda wishing that i never wake wish i could dream forever but you know life just aint that way strange ships will pass this bay hold on just one more day i wanna die together and i hope my girl s been faithful cause i dont know i dont trust her anymore ship is sinking down we go and i dont know which way is home and all of these people they asking me questions they giving directions the fuck do you want came for the story the glory the power the pills to the powder the fuck do you want now that youre a superstar shine bright for the cameras now that youre a superstar shine bright for the cameras i wonder if it hurts today im kinda wishing you will stay cause i dont wanna be alone ok i wanna die together she said i hope youre faithful cause i dont know and i dont love you anymore ship is sinking down we go and i dont know which way is home  and all of these people they asking me questions they giving directions the fuck do

{'input_ids': [101, 1045, 4687, 2065, 2049, 3835, 2651, 10047, 10261, 2008, 1996, 8044, 2097, 2994, 17704, 10261, 2008, 1045, 2196, 5256, 4299, 1045, 2071, 3959, 5091, 2021, 2017, 2113, 2166, 2074, 7110, 2102, 2008, 2126, 4326, 3719, 2097, 3413, 2023, 3016, 2907, 2006, 2074, 2028, 2062, 2154, 1045, 10587, 3280, 2362, 1998, 1045, 3246, 2026, 2611, 1055, 2042, 11633, 3426, 1045, 2123, 2102, 2113, 1045, 2123, 2102, 3404, 2014, 4902, 2911, 2003, 10186, 2091, 2057, 2175, 1998, 1045, 2123, 2102, 2113, 2029, 2126, 2003, 2188, 1998, 2035, 1997, 2122, 2111, 2027, 4851, 2033, 3980, 2027, 3228, 7826, 1996, 6616, 2079, 2017, 2215, 2234, 2005, 1996, 2466, 1996, 8294, 1996, 2373, 1996, 15345, 2000, 1996, 9898, 1996, 6616, 2079, 2017, 2215, 2085, 2008, 2115, 2063, 1037, 18795, 12342, 4408, 2005, 1996, 8629, 2085, 2008, 2115, 2063, 1037, 18795, 12342, 4408, 2005, 1996, 8629, 1045, 4687, 2065, 2009, 13403, 2651, 10047, 17704, 10261, 2017, 2097, 2994, 3426, 1045, 2123, 2102, 10587, 2022, 2894, 7929, 104

In [10]:
# Encode a sentence pair: the two texts are passed as separate arguments
first_sentence = "This is the first sentence."
second_sentence = "This is the second one."
inputs = tokenizer(first_sentence, second_sentence)
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
# Map the encoded ids in `inputs` back to their token strings for inspection
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

In [13]:
def tokenize_function(example):
    """Tokenize a batch of lyrics and attach integer class labels.

    Intended for `Dataset.map(..., batched=True)`: `example` is a batch whose
    'lyrics' and 'genre_binary' entries are lists.

    Adds 'input_ids', 'attention_mask' and 'labels' (0 for 'not hip-hop',
    1 for anything else) to the batch in place and returns it.
    """
    # Pad to a fixed length instead of to the longest item of the current map
    # batch: with per-batch padding, rows from different map batches can end up
    # with different lengths and then cannot be stacked into one training batch.
    tokenized_lyrics = tokenizer(example["lyrics"], padding="max_length", truncation=True)
    example['input_ids'] = tokenized_lyrics['input_ids']
    example['attention_mask'] = tokenized_lyrics['attention_mask']

    # Convert genre labels to integers
    example['labels'] = [0 if genre == 'not hip-hop' else 1 for genre in example['genre_binary']]
    return example

In [14]:
# Apply the tokenization function to every split, one batch of rows at a time
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
)
tokenized_datasets

Map:   0%|          | 0/39988 [00:00<?, ? examples/s]

Map:   0%|          | 0/4999 [00:00<?, ? examples/s]

Map:   0%|          | 0/4998 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url', 'genre_binary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 39988
    })
    test: Dataset({
        features: ['id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url', 'genre_binary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4999
    })
    valid: Dataset({
        features: ['id', 'lyrics', 'is_english', 'genres_list', 'popularity', 'release_date', 'artist_id', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_picture_url', 'genre_binary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4998
    })
})

In [15]:
# Metadata columns to discard, leaving only the tokenizer outputs and labels
METADATA_COLUMNS = [
    'id',
    'lyrics',
    'is_english',
    'genres_list',
    'popularity',
    'release_date',
    'artist_id',
    'artist_name',
    'artist_popularity',
    'artist_followers',
    'artist_picture_url',
    'genre_binary',
]

# Drop only the columns that are still present, so re-running this cell after the
# columns have already been removed is a no-op instead of an error.
present_columns = tokenized_datasets['train'].column_names
stale_columns = [name for name in METADATA_COLUMNS if name in present_columns]
if stale_columns:
    tokenized_datasets = tokenized_datasets.remove_columns(stale_columns)

In [16]:
# Report the (rows, columns) shape of each split, then the full DatasetDict summary
print("Shapes of the datasets:")
for title, split_name in (("Training", "train"), ("Validation", "valid"), ("Test", "test")):
    print(f"{title}: {tokenized_datasets[split_name].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (39988, 3)
Validation: (4998, 3)
Test: (4999, 3)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 39988
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4999
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4998
    })
})


### Fine-Tune the Model

In [17]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and binary precision/recall/F1 for one evaluation pass.

    `pred` must expose `label_ids` (the reference labels) and `predictions`
    (an array supporting `.argmax(-1)`); the predicted class is the argmax
    over the last axis.

    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Tuple layout: (precision, recall, f1, support)
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [42]:
from google.colab import drive

# Mount point for Google Drive inside the Colab runtime
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [18]:
# NOTE(review): `directory` is not referenced by the training arguments below,
# which write to './results' — confirm whether output_dir was meant to use it.
directory = '/content/drive/MyDrive/hiphop-classification-model'

from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Human-readable class names stored in the model config; they follow the integer
# encoding used when the labels were built (0 = 'not hip-hop', 1 = 'hip-hop').
id2label = {0: 'not hip-hop', 1: 'hip-hop'}
label2id = {name: idx for idx, name in id2label.items()}

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir='./results',           # output directory
    num_train_epochs=3,               # total number of training epochs
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=64,    # batch size for evaluation
    warmup_steps=500,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    logging_dir='./logs',             # directory for storing logs
    logging_steps=10,                 # log training metrics every 10 steps
    evaluation_strategy="epoch",      # Evaluation is done at the end of each epoch.
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Bundle the model, training arguments, data splits and metric callback for the Trainer
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [20]:
# Run fine-tuning with the trainer built above; the returned training summary is shown as the cell output
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3025,0.262815,0.92537,0.694011,0.816602,0.603424


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3025,0.262815,0.92537,0.694011,0.816602,0.603424
2,0.1262,0.246687,0.917367,0.669335,0.762774,0.596291
3,0.3297,0.225111,0.926771,0.723147,0.769726,0.681883


TrainOutput(global_step=7500, training_loss=0.26429238992532095, metrics={'train_runtime': 3825.6507, 'train_samples_per_second': 31.358, 'train_steps_per_second': 1.96, 'total_flos': 3.156385464520704e+16, 'train_loss': 0.26429238992532095, 'epoch': 3.0})

### Share the Model on the Hugging Face Hub

In [21]:
from huggingface_hub import notebook_login

# Interactive login widget for the Hugging Face Hub
# (presumably required before the push below — confirm)
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
# Set the global git author identity for this environment.
# NOTE(review): a personal name and e-mail are hard-coded here — consider
# supplying them from configuration before sharing the notebook.
!git config --global user.email "peteryushunli@gmail.com"
!git config --global user.name "Peter Li"

In [24]:
# Upload the fine-tuned model to a Hub repository with this name.
# NOTE(review): only the model is pushed by this cell; the tokenizer is not uploaded here.
model.push_to_hub("bert-base-uncased-hiphoplyric-classification")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/peteryushunli/bert-base-uncased-hiphoplyric-classification/commit/de51f3356e93da07b728382ac99230d8b5fb80bc', commit_message='Upload BertForSequenceClassification', commit_description='', oid='de51f3356e93da07b728382ac99230d8b5fb80bc', pr_url=None, pr_revision=None, pr_num=None)

### Make a prediction

In [27]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the tokenizer only; the `model` object already in memory is reused for inference below
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Your new input data
texts = ["First-person shooter mode, we turnin' your song to a funeral. To them niggas that say they wan' off us, you better be talkin' 'bout workin' in cubicles",
         "I'm lettin' it rock 'cause I love the mystique. I still wanna get me a song with YB. Can't trust everything that you saw on IG. Just know if I diss you, I'd make sure you know that I hit you like I'm on your caller ID"]

# Tokenize the input texts
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Make sure you move your input tensors to the same device as the model
inputs = {key: value.to(model.device) for key, value in inputs.items()}

# Put the model in evaluation mode so dropout is disabled during inference
model.eval()

# Get predictions (no gradients are needed for inference)
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

# Convert predictions to probabilities using softmax
probabilities = torch.nn.functional.softmax(predictions, dim=-1)

# Get the predicted class (0 or 1) based on the probabilities
predicted_class = torch.argmax(probabilities, dim=-1)

# Convert the predictions to a list for further processing or inspection
predicted_class = predicted_class.tolist()

# Output the predicted class
for text, pred_class in zip(texts, predicted_class):
    print(f"Text: '{text}' - Predicted class: {pred_class}")


Text: 'First-person shooter mode, we turnin' your song to a funeral. To them niggas that say they wan' off us, you better be talkin' 'bout workin' in cubicles' - Predicted class: 1
Text: 'I'm lettin' it rock 'cause I love the mystique. I still wanna get me a song with YB. Can't trust everything that you saw on IG. Just know if I diss you, I'd make sure you know that I hit you like I'm on your caller ID' - Predicted class: 0
