In [1]:
#!conda install transformers

import pandas as pd
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
#import tensorflow_hub as hub
from transformers import BertTokenizer
#!pip install tensorflow_text
#import tensorflow_text as text
import numpy as np
df = pd.read_csv('unprocessed_lyrics.csv')
SIZE = 5000
df = df.groupby('Genre', group_keys=False).apply(lambda s: s.sample(SIZE, random_state=42)) #


In [2]:
df

Unnamed: 0,SName,Lyric,Genre,Artist,lyric_length
2157,Family Affair,Refrain:. Let's get it crunkupon. We gon' have...,Hip Hop,Mary J. Blige,567
3187,Ransom (ft. Lil' Wayne),"Ransom,. . Yeah,. its Drizzy Baby. you already...",Hip Hop,Drake,967
9164,Close To Me,"T.O.S.. (50 Cent). Unstoppable, incredible, im...",Hip Hop,G-Unit,557
2145,Zone,"uhh, yea. uh uh uh. alright, well alright. . i...",Hip Hop,Drake,400
6442,"Why You Up In Here (feat. Ludacris, Git Fresh ...",Flo-Rida. Gucci!. Bird!. I done bought all thi...,Hip Hop,Flo Rida,524
...,...,...,...,...,...
31738,What do You Need?,What do you need from me tonight?. I feel you ...,Rock,Goo Goo Dolls,197
31425,Rebel Heart,"(R. Stewart, J. Golub, C. Kentis, C. Rojas). I...",Rock,Rod Stewart,412
35407,Before The Dawn,Meet me after dark again and I'll hold you. I ...,Rock,Evanescence,114
35281,Spanish is the Loving Tongue,"Broke my heart, lost my soul. Adios,mi cora so...",Rock,Bob Dylan,126


In [3]:
#min()


In [4]:
# confirm TensorFlow sees the GPU
from tensorflow.python.client import device_lib
assert 'GPU' in str(device_lib.list_local_devices())

tf.config.list_physical_devices('GPU') 




[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices()) # list of DeviceAttributes



[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 4561177390296544215
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14461698048
locality {
  bus_id: 1
  links {
  }
}
incarnation: 9959838745176952725
physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:1e.0, compute capability: 7.5"
xla_global_id: 416903419
]


### Initializing empty arrays for storing tokenized text

In [6]:

possible_labels = df.Genre.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

df['label'] = df.Genre.replace(label_dict)

In [7]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(df.Lyric.values, 
                                                df.label.values, 
                                                random_state=42, 
                                                test_size=0.1, 
                                                stratify=df.label.values)

In [8]:
# set array dimensions
seq_len = 512 # Our input sequences can't be too long so we set it to max 512 tokens
num_samples = len(xtrain) 

# initialize empty zero arrays
Xids = np.zeros((num_samples, seq_len))
Xmask = np.zeros((num_samples, seq_len))

# check shape
print(Xids.shape)

# initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

for i, lyric in enumerate(xtrain):
    tokens = tokenizer.encode_plus(lyric, max_length=seq_len, truncation=True, #truncation_True -> text longer than 512 tokens get compressed
                                   padding='max_length', add_special_tokens=True, #for text shorter than 512, pad to length 512. Special tokens adds [CLS],[SEP],[PAD]
                                   return_tensors='tf')
    # assign tokenized outputs to respective rows in numpy arrays
    Xids[i, :] = tokens['input_ids']
    Xmask[i, :] = tokens['attention_mask']

(13500, 512)


101 is for [CLS] "start of seq", 0 is for [PAD] , 

In [9]:
Xids

array([[  101.,  1135.,  5115., ...,     0.,     0.,     0.],
       [  101.,  1409.,  2701., ...,  1663.,   117.,   102.],
       [  101., 13085.,  1139., ...,     0.,     0.,     0.],
       ...,
       [  101.,   146.,  1274., ...,     0.,     0.,     0.],
       [  101.,   146.,  6101., ...,     0.,     0.,     0.],
       [  101., 19585., 10340., ...,     0.,     0.,     0.]])

In [10]:
Xmask

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

Xmask is a control for the attention layer in bert. whereever xmask is 1 Bert will calculate the attention for that token. To avoid for Bert making connections with padding tokens



### one-hot encoding the Genres

In [11]:

# first extract sentiment column
arr = ytrain
# we then initialize the zero array
labels = np.zeros((num_samples, arr.max()+1))

# set relevant index for each row to 1 (one-hot encode)
labels[np.arange(num_samples), arr] = 1
print(labels)

[[0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]]


### Building a tf.data.Dataset object using the input and label tensors. Then transforming them into the correct format for the model.

In [12]:
# create the dataset object
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

def map_func(input_ids, masks, labels):
    # we convert our three-item tuple into a two-item tuple where the input item is a dictionary
    return {'input_ids': input_ids, 'attention_mask': masks}, labels

# then we use the dataset map method to apply this transformation
dataset = dataset.map(map_func)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


In [13]:
# we will split into batches of 8
batch_size = 8

# shuffle and batch - dropping any remaining samples that don't cleanly
# fit into a batch of 8
dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)
dataset.take(1) # We now have 8 samples per batch

<TakeDataset shapes: ({input_ids: (8, 512), attention_mask: (8, 512)}, (8, 3)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>

In [14]:
split = 1
size = int((Xids.shape[0]/batch_size)*split)
print(size)
print('number of samples in training ' + str(size*batch_size))
# get training and validation sets
train_ds = dataset.take(size)

1687
number of samples in training 896


## Creating Keras FFN model with BERT layer

In [15]:
# AutoModel for PyTorch, TFAutoModel for TensorFlow
from transformers import TFAutoModel

bert = TFAutoModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [16]:
import tensorflow as tf

# two input layers, we ensure layer name variables match to dictionary keys in TF dataset
input_ids = tf.keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')

# we access the transformer model within our bert object using the bert attribute (eg bert.bert instead of bert)
embeddings = bert.bert(input_ids, attention_mask=mask)[1]  # access final activations with [0]

# convert bert embeddings into 5 output classes
x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(3, activation='softmax', name='outputs')(x)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Caus

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


In [17]:
# initialize model
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# (optional) freeze bert layer
#model.layers[2].trainable = False
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [18]:
optimizer = tf.keras.optimizers.Adam(lr=1e-5, decay=1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

  super(Adam, self).__init__(name, **kwargs)


In [20]:
history = model.fit(
    train_ds,
    epochs=2
)

Epoch 1/2
Epoch 2/2


## Making predictions

In [21]:
def prep_data(text):
    tokens = tokenizer.encode_plus(text, max_length=512,
                                   truncation=True, padding='max_length',
                                   add_special_tokens=True, return_token_type_ids=False,
                                   return_tensors='tf')
    # tokenizer returns int32 tensors, we need to return float64, so we use tf.cast
    return {'input_ids': tf.cast(tokens['input_ids'], tf.float64),
            'attention_mask': tf.cast(tokens['attention_mask'], tf.float64)}

In [22]:
pred = [model.predict(prep_data(lyric) for lyric in xtest)]

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


In [23]:
y = ytest


In [24]:
pred = np.array(pred)
pred = [item for sublist in pred for item in sublist]
y_pred_bool = np.argmax(pred, axis=1)
y_pred_bool

array([2, 2, 2, ..., 2, 0, 2])

In [25]:
from sklearn.metrics import classification_report

print(classification_report(y, y_pred_bool, target_names=label_dict))

              precision    recall  f1-score   support

     Hip Hop       0.92      0.83      0.87       500
         Pop       0.70      0.53      0.61       500
        Rock       0.66      0.89      0.76       500

    accuracy                           0.75      1500
   macro avg       0.76      0.75      0.75      1500
weighted avg       0.76      0.75      0.75      1500



In [26]:
model.save('BERT_genreclassification_model')


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'




INFO:tensorflow:Assets written to: BERT_genreclassification_model/assets


INFO:tensorflow:Assets written to: BERT_genreclassification_model/assets
