In [1]:
# Import modules
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import torch
try:
  from transformers import BertTokenizer, BertModel
except:
  !pip install transformers
  from transformers import BertTokenizer, BertModel

try:
  import langdetect
except:
  !pip install langdetect
  import langdetect

In [2]:
# Stopwords
nltk.download('stopwords')
nltk_stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Functions
# Function to clean and process the sentence
def pre_process_sentence(sentence, nltk_stopwords):
    s = sentence.lower()
    s = s.split()
    words = []
    for w in s:
        w = w.strip(string.punctuation)
        if w not in nltk_stopwords and len(w) > 1:
            words.append(w)
    return ' '.join(words)

# Function to clean the dataframe of non-English lyrics
def is_english(text):
    try:
        return langdetect.detect(text) == 'en'
    except:
        return False

# Function to clean and process the dataframe
def pre_process_data(new_df, nltk_stopwords):
    new_df.dropna(subset=['text'], inplace=True)  # drop rows with missing values in the 'text' column
    new_df = new_df[new_df['text'].apply(is_english)]
    new_df.text = new_df.text.map(lambda x: pre_process_sentence(x, nltk_stopwords))
    new_df.reset_index(drop=True, inplace=True)
    return new_df


In [2]:
def get_bert_features(df, tokenizer, model, device, max_length=300, batch_size=32):
    encoded_corpus = tokenizer(text=df.text.tolist(),
                                add_special_tokens=True,
                                padding='max_length',
                                truncation='longest_first',
                                max_length=max_length,
                                return_attention_mask=True)

    input_ids = encoded_corpus['input_ids']
    attention_mask = encoded_corpus['attention_mask']

    # Filter long inputs
    def _filter_long_descriptions(tokenizer, descriptions, max_len):
        indices = []
        lengths = tokenizer(descriptions, padding=False, 
                        truncation=False, return_length=True)['length']
        for i in range(len(descriptions)):
            if lengths[i] <= max_len-2:
                indices.append(i)
        return indices

    short_descriptions = _filter_long_descriptions(tokenizer, 
                                  df.text.tolist(), 300)
    input_ids = np.array(input_ids)[short_descriptions]
    attention_mask = np.array(attention_mask)[short_descriptions]

    batch_size = 32

    def _create_dataloaders(inputs, masks, batch_size):
        input_tensor = torch.tensor(inputs)
        mask_tensor = torch.tensor(masks)
        dataset = TensorDataset(input_tensor, mask_tensor)
        dataloader = DataLoader(dataset, batch_size=batch_size)
        return dataloader

    tr_loader = _create_dataloaders(input_ids, attention_mask, batch_size)

    # Put the model in evaluation mode
    model.eval()

    # Define an empty list to store the features
    features_list = []

    # Loop over the batches in the dataloader
    for batch in tr_loader:
        # Unpack the batch tuple into individual tensors
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)

        # Pass the tensors through the model
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Get the last hidden state from the BERT model
        last_hidden_state = outputs.last_hidden_state

        # Append the features to the list
        features_list.append(last_hidden_state)

    # Concatenate all of the features in the list
    all_features = torch.cat(features_list, dim=0)

    # Return the concatenated tensor
    return all_features, short_descriptions


In [3]:
# Mounting for Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Load data from CSV file
df = pd.read_csv('drive/MyDrive/BERT_Feature_Extraction/spotify_songs.csv')

# Rename columns and assign to new dataframe
df = pd.DataFrame({
    'text': df.loc[:, 'lyrics'],
    'label': df.loc[:, 'track_popularity'],
    'genre': df.loc[:, 'playlist_genre']
})

# Remove genres we aren't looking at
genres_to_keep = ['rock', 'pop', 'edm']
df = df[df['genre'].isin(genres_to_keep)].reset_index(drop=True)

# Preprocess the dataframe
df = pre_process_data(df, nltk_stopwords)

df.to_csv(f"drive/MyDrive/BERT_Feature_Extraction/processed_spotify_data.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.text = new_df.text.map(lambda x: pre_process_sentence(x, nltk_stopwords))


In [3]:
# Use BERT for feature extraction

df = pd.read_csv('drive/MyDrive/BERT_Feature_Extraction/processed_spotify_data.csv')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print("Using GPU.")
else:
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")
model.to(device)

features, short_descriptions = get_bert_features(df, tokenizer, model, device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using GPU.


Token indices sequence length is longer than the specified maximum sequence length for this model (694 > 512). Running this sequence through the model will result in indexing errors


In [9]:
#df = df.iloc[short_descriptions]
array_of_features = features.cpu().numpy()
df['BERT_features'] = list(array_of_features)



In [13]:
df['genre'].unique()

array(['rock', 'edm', 'pop'], dtype=object)

In [16]:
# Drop columns
#df.drop(columns=['text'], inplace=True)
genres_to_keep = ['pop', 'edm', 'rock']
# Save the datasets to their respective genres
for genre in genres_to_keep:
    df_filtered = df[df['genre'] == genre].reset_index(drop=True)
    df_filtered = df_filtered.drop('genre', axis=1)
    df_filtered.to_csv(f"drive/MyDrive/BERT_Feature_Extraction/{genre}_processed_spotify_data.csv", index=False)
