In [None]:
# Import modules
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import torch
try:
  from transformers import BertTokenizer, BertModel
except:
  !pip install transformers
  from transformers import BertTokenizer, BertModel

try:
  import langdetect
except:
  !pip install langdetect
  import langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://us

In [None]:
# Stopwords
# Download the NLTK 'stopwords' package, then hold the English entries in a
# set, since the cleaning step tests every word for membership.
nltk.download('stopwords')
nltk_stopwords = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Functions
# Function to clean and process the sentence
def pre_process_sentence(sentence, nltk_stopwords):
    """Normalise one lyric string into a space-separated run of content words.

    The text is lower-cased and split on whitespace. Each token then has its
    leading and trailing ASCII punctuation stripped; punctuation inside a
    token (for example the apostrophe in "don't") is left in place.

    Args:
        sentence: Raw text to clean.
        nltk_stopwords: Container of words to discard; tested with ``in``.

    Returns:
        The surviving tokens joined by single spaces. A token survives when,
        after stripping, it is longer than one character and is not in
        ``nltk_stopwords``.
    """
    stripped_tokens = (
        token.strip(string.punctuation) for token in sentence.lower().split()
    )
    return ' '.join(
        token for token in stripped_tokens
        if len(token) > 1 and token not in nltk_stopwords
    )

# Function to clean the dataframe of non-English lyrics
def is_english(text):
    """Report whether ``langdetect`` classifies ``text`` as English.

    Args:
        text: The string to classify.

    Returns:
        True when ``langdetect.detect(text)`` returns ``'en'``. False for any
        other result, and also when detection raises an ordinary exception,
        so input that cannot be classified is treated as non-English.
    """
    try:
        return langdetect.detect(text) == 'en'
    except Exception:
        # Broad on purpose (any detection failure means "not English"), but
        # not a bare ``except``: KeyboardInterrupt and SystemExit still
        # propagate, so a long DataFrame.apply over lyrics can be interrupted.
        return False

# Function to clean and process the dataframe
def pre_process_data(new_df, nltk_stopwords):
    """Return a cleaned copy of ``new_df`` containing only English lyrics.

    Steps, in order:
      1. drop rows whose ``lyrics`` value is missing;
      2. keep only rows for which ``is_english`` returns True;
      3. replace each ``lyrics`` value with ``pre_process_sentence`` of it;
      4. renumber the index from 0.

    The input frame is not modified; every step works on a new object.

    Args:
        new_df: DataFrame with a ``lyrics`` column.
        nltk_stopwords: Stopword container forwarded to
            ``pre_process_sentence``.

    Returns:
        A new DataFrame with the same columns as ``new_df`` and a fresh
        0..n-1 index.
    """
    # Non-inplace dropna: the caller's frame keeps all of its rows.
    with_lyrics = new_df.dropna(subset=['lyrics'])

    # astype(bool) keeps the mask boolean even when there are no rows left
    # (an empty object-dtype Series is not treated as a boolean indexer).
    english_mask = with_lyrics['lyrics'].apply(is_english).astype(bool)

    # Explicit copy so the assignment below writes to an independent frame
    # instead of a slice of the input (avoids SettingWithCopyWarning).
    cleaned = with_lyrics[english_mask].copy()
    cleaned['lyrics'] = cleaned['lyrics'].map(
        lambda text: pre_process_sentence(text, nltk_stopwords)
    )
    return cleaned.reset_index(drop=True)


In [None]:
def get_bert_features(df, tokenizer, model, device, max_length=300, batch_size=32):
    """Embed the sufficiently short lyrics in ``df`` and return their last hidden states.

    Every lyric is tokenized twice: once padded/truncated to ``max_length``
    (the model input) and once untruncated (only to measure its real token
    count). Rows whose untruncated count exceeds ``max_length - 2`` are
    dropped before the model is run.

    Args:
        df: DataFrame with a ``lyrics`` column of strings.
        tokenizer: Callable tokenizer; it is called with ``padding``,
            ``truncation``, ``max_length``, ``return_attention_mask`` and
            ``return_length`` keyword arguments and must return a mapping
            with ``input_ids``, ``attention_mask`` and ``length`` entries.
        model: Model exposing ``eval()`` and, when called with ``input_ids``
            and ``attention_mask``, returning an object with a
            ``last_hidden_state`` attribute.
        device: Device the input batches are moved to before the forward pass.
        max_length: Padded sequence length; also drives the length filter.
        batch_size: Number of lyrics per forward pass.

    Returns:
        A tuple ``(features, short_descriptions)``:
          * ``features`` - each batch's ``last_hidden_state`` concatenated
            along dim 0. The tensor is not moved off the device the model
            produced it on.
          * ``short_descriptions`` - positional indices (into ``df``) of the
            rows that were kept, in order, so callers can align ``features``
            with ``df.iloc[short_descriptions]``.
    """
    lyrics = df.lyrics.tolist()

    # Fixed-width encoding used as the actual model input.
    encoded_corpus = tokenizer(text=lyrics,
                               add_special_tokens=True,
                               padding='max_length',
                               truncation='longest_first',
                               max_length=max_length,
                               return_attention_mask=True)

    # Untruncated pass, used only to learn each lyric's real token count.
    token_counts = tokenizer(lyrics, padding=False,
                             truncation=False, return_length=True)['length']

    # Filter against the caller's max_length rather than a hard-coded 300.
    # NOTE(review): the "- 2" presumably reserves room for [CLS]/[SEP], but the
    # length pass above uses the tokenizer's default special-token handling,
    # which may already count them - confirm before relaxing the threshold.
    short_descriptions = [row for row, count in enumerate(token_counts)
                          if count <= max_length - 2]

    input_ids = np.array(encoded_corpus['input_ids'])[short_descriptions]
    attention_mask = np.array(encoded_corpus['attention_mask'])[short_descriptions]

    # Honour the batch_size argument (it is no longer reset to 32 here).
    dataset = TensorDataset(torch.tensor(input_ids), torch.tensor(attention_mask))
    loader = DataLoader(dataset, batch_size=batch_size)

    # Put the model in evaluation mode
    model.eval()

    features_list = []
    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        for batch_ids, batch_mask in loader:
            outputs = model(input_ids=batch_ids.to(device),
                            attention_mask=batch_mask.to(device))
            features_list.append(outputs.last_hidden_state)

    # NOTE(review): if no lyric passes the filter, features_list is empty and
    # torch.cat will raise - callers should make sure at least one row is kept.
    all_features = torch.cat(features_list, dim=0)

    return all_features, short_descriptions


In [None]:
# Mounting for Drive
# Attach Google Drive at /content/drive; this notebook reads and writes its
# CSV files under drive/MyDrive/BERT_Feature_Extraction/.
# NOTE(review): those paths are relative, so they presumably rely on the
# working directory being /content - confirm when running outside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE: this cell only needs to be run if the data has not been preprocessed
# yet. Its output, processed_spotify_predictions_data.csv, is saved to Drive
# and is the file the feature-extraction step reads.

# Load data from CSV file
df = pd.read_csv('drive/MyDrive/BERT_Feature_Extraction/lyrics_and_predictions_third_approach.csv')

# Preprocess the dataframe
df = pre_process_data(df, nltk_stopwords)

# Cache the cleaned frame so the language detection does not have to be repeated.
df.to_csv("drive/MyDrive/BERT_Feature_Extraction/processed_spotify_predictions_data.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.lyrics = new_df.lyrics.map(lambda x: pre_process_sentence(x, nltk_stopwords))


In [None]:
# Use BERT for feature extraction
# Set Up dataframe and models
# Reload the cleaned lyrics from Drive, then fetch the pretrained uncased
# BERT tokenizer and encoder.
df = pd.read_csv('drive/MyDrive/BERT_Feature_Extraction/processed_spotify_predictions_data.csv')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): output_hidden_states=True presumably makes the model return
# every layer's hidden states as well; get_bert_features reads only
# last_hidden_state, so this may just cost memory - confirm before removing.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

# Prefer the GPU when torch can see one; otherwise stay on the CPU.
if not torch.cuda.is_available():
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print("Using GPU.")
model.to(device)

# Embed the lyrics; short_descriptions holds the row positions that were kept.
features, short_descriptions = get_bert_features(df, tokenizer, model, device)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using GPU.


Token indices sequence length is longer than the specified maximum sequence length for this model (759 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Flatten each song's feature matrix into a single row.
array_of_features = features.cpu().numpy()
# Take the row width from the tensor itself instead of hard-coding 300*768, so
# a different max_length or model size cannot silently merge or split rows.
new_array = array_of_features.reshape(
    -1, array_of_features.shape[1] * array_of_features.shape[2]
)

# Keep only the rows that were embedded: get_bert_features skips lyrics that
# are too long for the model and reports the kept positions.
new_df = df.iloc[short_descriptions].reset_index(drop=True)

# Both frames are joined side by side on a fresh 0..n-1 index, so their row
# counts must match or the concat would pad with NaN.
assert len(new_df) == len(new_array), "features and metadata rows are out of sync"

# This is the dataframe with all of the BERT features and spotify data
feature_columns = [f"BERT_Feature_{i}" for i in range(new_array.shape[1])]
new_new_df = pd.concat(
    [new_df, pd.DataFrame(new_array, columns=feature_columns)],
    axis=1,
)

In [None]:
# Preview the combined frame of Spotify columns plus BERT feature columns.
new_new_df

Unnamed: 0,name,popularity,duration_ms,artists,danceability,energy,key,loudness,mode,speechiness,...,BERT_Feature_230390,BERT_Feature_230391,BERT_Feature_230392,BERT_Feature_230393,BERT_Feature_230394,BERT_Feature_230395,BERT_Feature_230396,BERT_Feature_230397,BERT_Feature_230398,BERT_Feature_230399
0,anthem,50,213133,zebrahead,0.378,0.985,0,-3.860,1,0.2620,...,-0.086471,0.028991,-0.151306,-0.008903,-0.210479,0.059991,0.136277,-0.180078,-0.546589,-0.134645
1,hey joe,42,185680,wilson pickett,0.510,0.671,5,-11.920,0,0.0583,...,-0.190741,-0.490491,-0.373608,-0.047741,0.008224,-0.344685,-0.103989,-0.329972,-0.165102,-0.068607
2,someone to watch over me,34,241573,willie nelson,0.609,0.240,9,-10.179,1,0.0259,...,-0.290693,-0.749686,-0.218025,0.016274,0.069605,0.103685,0.076115,-0.238064,-0.185407,-0.048231
3,september song,35,273333,willie nelson,0.295,0.114,8,-14.657,1,0.0333,...,-0.491260,-0.181898,-0.217705,0.146614,0.181118,-0.131955,0.216964,-0.218527,-0.088215,-0.321717
4,undo the right,20,155313,willie nelson,0.589,0.247,2,-11.177,1,0.0394,...,-0.111315,-0.452242,0.577594,0.341019,0.348333,-0.291094,-0.162603,-0.530708,0.150939,-0.592534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1378,inmates (we're all crazy),23,303773,alice cooper,0.402,0.377,1,-13.992,1,0.0527,...,-0.682434,-0.126216,-0.281561,-0.104521,-0.014616,-0.527953,0.157831,-0.258637,0.503942,-0.880165
1379,nurse rozetta,25,255707,alice cooper,0.390,0.648,5,-14.092,1,0.2880,...,-0.196174,-0.077336,-0.001263,-0.051212,-0.430424,0.043069,0.195756,-0.131033,0.090076,-0.371837
1380,hare krishna,17,493693,alice coltrane,0.116,0.461,9,-11.464,1,0.0346,...,0.018443,0.232805,-0.016292,-0.340698,0.318558,0.055475,0.022466,-0.034402,-0.311508,-0.198041
1381,hiding tonight,57,186720,alex turner,0.543,0.122,0,-18.863,1,0.0337,...,-0.084990,-0.163355,-0.069582,0.194998,0.023188,-0.119720,0.189449,0.008368,-0.240978,-0.164208


In [None]:
# Persist the combined Spotify + BERT feature frame to Drive.
# (Plain string: the path has no placeholders, so no f-string is needed.)
new_new_df.to_csv("drive/MyDrive/BERT_Feature_Extraction/new_combined_BERT_processed_spotify_data.csv", index=False)

In [None]:
# List the distinct genre labels present in the data.
df['genre'].unique()

array(['rock', 'edm', 'pop'], dtype=object)

In [None]:
# Save the datasets to their respective genres
# One CSV per genre; the genre column is dropped from each file because the
# file name already carries it.
genres_to_keep = ['pop', 'edm', 'rock']
for genre in genres_to_keep:
    df_filtered = (
        df.loc[df['genre'] == genre]
        .reset_index(drop=True)
        .drop(columns='genre')
    )
    df_filtered.to_csv(f"drive/MyDrive/BERT_Feature_Extraction/{genre}_processed_spotify_data.csv", index=False)
