# Classification using Transformer  2

## Loading Libraries

Before starting you will need to import the libraries that are needed.

In [61]:
import IPython.display as ipd
import glob
from scipy.io import wavfile
import numpy as np
import pandas as pd
import librosa
import librosa.display
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib as plt
import matplotlib.pyplot as plt
import struct
from scipy.io import wavfile as wav
import os
from datetime import datetime 
from sklearn import metrics 
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint 

First, we extract the MFCCS (Mel-frequency cepstrum) from the audio files

In [62]:
def extract_features(file_name):
    nmfcc=40
    hop_length = 512
    n_fft = 1024
    n_mels = 128
    try:
        y, sr = librosa.load(file_name, res_type='kaiser_fast') 
        audio, _ = librosa.effects.trim(y)
        S = librosa.feature.mfcc(audio, sr=sr, n_fft=n_fft, hop_length=hop_length, center=False, n_mfcc=nmfcc, fmin=0)
        S_DB = librosa.power_to_db(S, ref=np.max)
        mfccsscaled = np.mean(S_DB.T,axis=0)   
    except Exception:
        print("Error encountered while parsing file: ", file_name)
        return None 
    return mfccsscaled

Now we read the audio file and csv file and preparing a formatted chart of X and Y values.

In [63]:
# Reading the audio file and csv file and preparing a formatted chart of X and Y values.
## Label filepath
fulldatasetpath = 'UrbanSound8K/audio/'
metadata = pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')
features = []

for index, row in metadata.iterrows():
    file_name = os.path.join(os.path.abspath(fulldatasetpath),'fold'+str(row["fold"])+'/',str(row["slice_file_name"]))
    class_label = row["classID"]
    data = extract_features(file_name)
    features.append([data, class_label])
    
# Convert into a Panda dataframe 
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])
print(featuresdf)

                                                feature  class_label
0     [-80.0, -2.5677357, -80.0, -80.0, -80.0, -80.0...            3
1     [-80.0, -1.0000091, -80.0, -3.5282156, -51.552...            2
2     [-80.0, -0.9102827, -80.0, -4.7280188, -51.951...            2
3     [-80.0, -1.4973112, -77.357445, -4.4096923, -4...            2
4     [-80.0, -0.58883995, -80.0, -3.3096602, -45.63...            2
...                                                 ...          ...
8727  [-80.0, -0.64906764, -80.0, -6.3651896, -75.83...            1
8728  [-80.0, -2.813218, -80.0, -4.689935, -80.0, -5...            1
8729  [-80.0, -1.5988646, -80.0, -6.2588015, -74.091...            1
8730  [-80.0, -0.66870856, -80.0, -6.298651, -80.0, ...            1
8731  [-80.0, -1.0094087, -80.0, -4.0532303, -72.861...            1

[8732 rows x 2 columns]


Multi-head attention class is defined here and the data is sequentially embedded.

In [66]:
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=6):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        ## x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)
        
        ## (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs) 
        
        ## (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs) 
        
        ## (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  
        ## (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  
        ## (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  
        ## (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  
        ## (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  
        ## (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  
        ## (batch_size, seq_len, embed_dim)
        return output

Here the Transformer class is introduced as a layer in Training model. Within this, further layers are attached.

In [67]:
class TransformerBlock(layers.Layer):
    ## For calling multihead attention on embedded data and arranging it sequentially and adding other layers.
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)
        
    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

Every data is tokenized here, and size of max token is the size of the biggest data figure.

In [68]:
class TokenAndPositionEmbedding(layers.Layer):
    ## For preliminary token generation and embedding
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

Generating audio data in the word domain to apply transformer

In [69]:
featuresdf=featuresdf.dropna(axis=0)
X = np.array(featuresdf.feature.tolist())
X=X*100000
min_X=-min([min(element) for element in X])
x=X+min_X
x=x.astype(int)
max_len=max([max(element) for element in x])

# Getting label size
y = np.array(featuresdf.class_label.tolist())

We split the data

In [70]:
from sklearn.model_selection import train_test_split 
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1,shuffle=False)
m,n=x.shape

# Dividing in train and test
x_train=x[0:int(m*9/10),:]
x_test=x[int(m*9/10):m,:]
y_train=y[0:int(m*9/10)]
y_test=y[int(m*9/10):m]

### Changing data type for transformer layers

In [71]:

m,n=x_test.shape

## Converting form and reshaping
TestX=x_test
TestY=y_test
testy=np.reshape(TestY,(m,))

## Changing datatypes
testx=np.empty((m,),object)
for i in range (0,m):
    testx[i]=list(int(v) for v in TestX[i])
    testy[i]=testy[i].astype(int)

## Printing data-types - relevant to transformer input
#print(type(testx))
#print(type(testx[m-1][n-1]))
#print(type(testx[m-1]))
#print(type(testy[m-1]))
#print(type(testy))

## Converting Train Data and Getting size of data
m,n=x_train.shape
#print(m,n)

## Converting form and reshaping
TrainX=x_train
TrainY=y_train
trainy=np.reshape(TrainY,(m,))

## Changing datatypes
trainx=np.empty((m,),object)
for i in range (0,m):
    trainx[i]=list(int(v) for v in TrainX[i])
    trainy[i]=TrainY[i].astype(int)
    
## Printing data-types - relevant to transformer input
#print(type(trainx))
#print(type(trainx[m-1][n-1]))
#print(type(trainx[m-1]))
#print(type(trainy[m-1]))
#print(type(trainy))
#print(trainx.shape,trainy.shape,testx.shape,testy.shape)

print(len(trainx), "Training sequences")
print(len(testx), "Validation sequences")

7858 Training sequences
874 Validation sequences


Now, we convert to padded tensor sequence

In [72]:
vocab_size = max_len+1
maxlen = 40

trainx = keras.preprocessing.sequence.pad_sequences(trainx,maxlen=maxlen)
testx = keras.preprocessing.sequence.pad_sequences(testx,maxlen=maxlen)
#print(trainx.shape,trainy.shape,testx.shape,testy.shape)
#print(trainx,trainy,testx,testy)

In [75]:
embed_dim = 30  ## Embedding size for each token
num_heads = 6  ## Number of attention heads
ff_dim = 30  ## Hidden layer size in feed forward network inside transformer

## Tokenizing input data with max dimension and embedding it
inputs = layers.Input(shape=(maxlen,))
#x = keras.Sequential()
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)

## Adding Sequential layer to the embedded data and attention layers too.
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

## Add other layers
x = layers.Conv1D(6,3,padding="same")(x)
x = layers.Dense(30, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.MaxPool1D(pool_size=2, strides=2)(x)
x = layers.Dense(30, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(30, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.Flatten()(x)
x = layers.Dense(30, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(30, activation="relu")(x)
x = layers.Dropout(0.5)(x)

## Producing general softmax layer for classification
outputs = layers.Dense(10, activation="softmax")(x)

## Generating model
model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()



TypeError: in converted code:


    TypeError: call() missing 1 required positional argument: 'training'


In [76]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(trainx, trainy, batch_size=100, epochs=5)

NameError: name 'model' is not defined

In [None]:
score = model.evaluate(trainx, trainy, verbose=0)
print("Training Performance",score)
score = model.evaluate(testx, testy, verbose=0)
print("Testing Performanr",score)

Training Performance [0.48297321796417236, 0.8677664995193481]
Testing Performanr [0.6155197620391846, 0.8627991080284119]


In [65]:
del model
keras.backend.clear_session()