# Classification using Approach 3

DESCRIPTION OF THE APPROACH

## Loading Libraries

Before starting you will need to import the libraries that are needed.

In [3]:
import IPython.display as ipd
import glob
from scipy.io import wavfile
import numpy as np
import pandas as pd
import librosa
import librosa.display
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib as plt
import matplotlib.pyplot as plt
import struct
from scipy.io import wavfile as wav
import os
from datetime import datetime 
import sklearn
from sklearn import metrics 
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint 

Creation of classes

In [2]:
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=6):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        ## x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)
        
        ## (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs) 
        
        ## (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs) 
        
        ## (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  
        ## (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  
        ## (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  
        ## (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  
        ## (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  
        ## (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  
        ## (batch_size, seq_len, embed_dim)
        return output

In [4]:
class TransformerBlock(layers.Layer):
    ## For calling multihead attention on embedded data and arranging it sequentially and adding other layers.
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)
        
    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [5]:
class TokenAndPositionEmbedding(layers.Layer):
    ## For preliminary token generation and embedding
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [2]:
#startTime=0.11
#endTime=1.11
#song = AudioSegment.from_wav('UrbanSound8K/audio/fold2/204773-3-9-1.wav')
#extract = song[startTime:endTime]
#print(extract)
#extract.export('Extract',format='mp3')
#audio,sample_rate=librosa.load('UrbanSound8K/audio/fold2/204773-3-9-1.wav')
#nmfcc=20
#mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=nmfcc)
#mfccs = sklearn.preprocessing.scale(mfccs, axis=1)
#arr=mfccs
#print(arr.shape)
#new_arr = ((arr - arr.min()) * (1/(arr.max() - arr.min()) * 100000)).astype(int)
#new_arr=np.reshape(new_arr, (nmfcc*11,))
#print(new_arr.shape)

In [7]:
def extract_features(file_name):
   
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        mfccsscaled = np.mean(mfccs.T,axis=0)
        
    except Exception:
        print("Error encountered while parsing file: ", file_name)
        return None 
     
    return mfccsscaled

In [8]:
fulldatasetpath = 'UrbanSound8K/audio/'
metadata = pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')
features = []

for index, row in metadata.iterrows():
    file_name = os.path.join(os.path.abspath(fulldatasetpath),'fold'+str(row["fold"])+'/',str(row["slice_file_name"]))
    class_label = row["classID"]
    data = extract_features(file_name)
    features.append([data, class_label])
    
# Convert into a Panda dataframe 
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])
print(featuresdf)

  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]


                                                feature  class_label
0     [-215.79301, 71.66612, -131.81377, -52.09133, ...            3
1     [-424.68677, 110.56227, -54.148235, 62.01074, ...            2
2     [-459.56467, 122.800354, -47.92471, 53.265697,...            2
3     [-414.55377, 102.896904, -36.66495, 54.18041, ...            2
4     [-447.397, 115.0954, -53.809113, 61.60859, 1.6...            2
...                                                 ...          ...
8727  [-399.2257, 136.81902, -51.964222, 37.02399, -...            1
8728  [-346.72733, 87.48847, -46.265022, 52.748833, ...            1
8729  [-304.61316, 112.6199, -47.161945, 37.00349, -...            1
8730  [-344.71423, 126.75813, -56.17717, 36.070927, ...            1
8731  [-315.93384, 95.67589, -38.047768, 47.50074, -...            1

[8732 rows x 2 columns]


In [9]:
featuresdf=featuresdf.dropna(axis=0)
X = np.array(featuresdf.feature.tolist())
#X = sklearn.preprocessing.scale(X)
#x = ((X - X.min()) * (1/(X.max() - X.min()) * 100000)).astype(int)
#max_len=max([max(element) for element in x])
#print(x)
#y = np.array(featuresdf.class_label.tolist())
#print(y)
X=X*100000
min_X=-min([min(element) for element in X])
x=X+min_X
x=x.astype(int)
max_len=max([max(element) for element in x])
print(max_len)
# Getting label size
y = np.array(featuresdf.class_label.tolist())
print(x,x.shape)
print(y,y.shape)

104338845
[[55124989 83870903 63522914 ... 76536014 76615704 76739729]
 [34235614 87760517 71289467 ... 76766031 76635793 76761442]
 [30747824 88984326 71911819 ... 76926832 76858952 76620659]
 ...
 [46242974 87966281 71988096 ... 76400932 76975348 77471480]
 [42232867 89380104 71086573 ... 75924064 76526383 77287832]
 [45110906 86271879 72899514 ... 76762048 75557914 76107755]] (8732, 40)
[3 2 2 ... 1 1 1] (8732,)


In [10]:
from sklearn.model_selection import train_test_split 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

In [11]:
m,n=x_test.shape
print(m,n)

## Converting form and reshaping
TestX=x_test
TestY=y_test
testy=np.reshape(TestY,(m,))

## Changing datatypes
testx=np.empty((m,),object)
for i in range (0,m):
    testx[i]=list(int(v) for v in TestX[i])
    testy[i]=testy[i].astype(int)

## Printing data-types - relevant to transformer input
print(type(testx))
print(type(testx[m-1][n-1]))
print(type(testx[m-1]))
print(type(testy[m-1]))
print(type(testy))

## Converting Train Data and Getting size of data
m,n=x_train.shape
print(m,n)

## Converting form and reshaping
TrainX=x_train
TrainY=y_train
trainy=np.reshape(TrainY,(m,))

## Changing datatypes
trainx=np.empty((m,),object)
for i in range (0,m):
    trainx[i]=list(int(v) for v in TrainX[i])
    trainy[i]=TrainY[i].astype(int)
    
## Printing data-types - relevant to transformer input
print(type(trainx))
print(type(trainx[m-1][n-1]))
print(type(trainx[m-1]))
print(type(trainy[m-1]))
print(type(trainy))
print(trainx.shape,trainy.shape,testx.shape,testy.shape)

2882 40
<class 'numpy.ndarray'>
<class 'int'>
<class 'list'>
<class 'numpy.int32'>
<class 'numpy.ndarray'>
5850 40
<class 'numpy.ndarray'>
<class 'int'>
<class 'list'>
<class 'numpy.int32'>
<class 'numpy.ndarray'>
(5850,) (5850,) (2882,) (2882,)


In [12]:
vocab_size = max_len+1
maxlen = 40
print(len(trainx), "Training sequences")
print(len(testx), "Validation sequences")

## Converting to padded tensor sequence
trainx = keras.preprocessing.sequence.pad_sequences(trainx,maxlen=maxlen)
testx = keras.preprocessing.sequence.pad_sequences(testx,maxlen=maxlen)
print(trainx.shape,trainy.shape,testx.shape,testy.shape)
print(trainx,trainy,testx,testy)

5850 Training sequences
2882 Validation sequences
(5850, 40) (5850,) (2882, 40) (2882,)
[[62137597 78335727 72880427 ... 77052909 76297692 77056225]
 [56190113 96227093 76519287 ... 76470681 76472803 77059141]
 [57354196 90859410 72696932 ... 76759726 76683174 76637231]
 ...
 [18281848 92145918 70166513 ... 76385245 76937164 76049535]
 [66350048 93174070 72786761 ... 76973658 77026353 76877930]
 [42899530 86940032 73994583 ... 76712891 76271752 76351661]] [7 9 3 ... 8 5 8] [[58395013 87664977 64456540 ... 76694969 76592161 76925777]
 [52305891 94894248 73165505 ... 76648285 76396553 76633610]
 [15201312 90125158 75244779 ... 76230071 76791889 76514939]
 ...
 [34846868 93652304 68048023 ... 76506201 76687604 76680986]
 [58320698 87138247 74485121 ... 76733707 76708008 76718180]
 [47970864 97041497 65043084 ... 76516383 76899214 77234400]] [2 3 8 ... 4 9 5]


In [13]:
embed_dim = 30  ## Embedding size for each token
num_heads = 6  ## Number of attention heads
ff_dim = 30  ## Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(10, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


TypeError: in converted code:


    TypeError: call() missing 1 required positional argument: 'training'


In [None]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(trainx, trainy, batch_size=100, epochs=100, validation_data=(testx,testy))

In [77]:
del model
keras.backend.clear_session()