In [3]:
# !pip install librosa 

<a id="1"></a>
# Including Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import librosa
import librosa.display
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import IPython
# import keras.layers as L
# import tensorflow as tf
# from keras.callbacks import EarlyStopping, ReduceLROnPlateau 
# from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder,StandardScaler
# import re
# import itertools
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
device = "cuda" if torch.cuda.is_available() else "cpu"
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="2"></a>
# Including the Datasets

In [None]:
main_Path='Emotions/'

<a id="3"></a>
# Data Analyze

In [None]:

emotions=os.listdir(main_Path)
main=[]
for emotion in emotions:
    path=main_Path+emotion+'/'
    for file in os.listdir(path):
        main.append([emotion,file])
main_df=pd.DataFrame(main,columns=['Emotion','File'])
main_df.head()

In [None]:
main_df['Emotion'].unique()

In [None]:

main_df.to_csv('main_df.csv',index=False)

In [None]:
main_df.head(15)

In [None]:
main_df.tail(15)

In [None]:
plt.figure(figsize=(12,6))
plt.title('Emotions Counts')
emotions=sns.countplot(x='Emotion',data=main_df,palette='Set2')
emotions.set_xticklabels(emotions.get_xticklabels(),rotation=45)

In [None]:
emotion_names=main_df['Emotion'].unique()

In [None]:
colors={'Disgusted':'#804E2D','Happy':'#F19C0E','Sad':'#478FB8','Neutral':'#4CB847','Fearful':'#7D55AA','Angry':'#C00808','Suprised':'#EE00FF'}

In [None]:
def wave_plot(data,sr,emotion,color):
    plt.figure(figsize=(12,5))
    plt.title(f'{emotion} emotion for waveplot',size=17)
    librosa.display.waveshow(y=data,sr=sr,color=color)

In [None]:
def spectogram(data,sr,emotion):
    audio=librosa.stft(data)
    audio_db=librosa.amplitude_to_db(abs(audio))
    plt.figure(figsize=(12,5))
    plt.title(f'{emotion} emotion for spectogram',size=17)
    librosa.display.specshow(audio_db,sr=sr,x_axis='time',y_axis='hz')

In [None]:
audio_path=[]
for emotion in emotion_names:
    path=main_Path+emotion+ "/" + np.array(main_df['File'][main_df['Emotion']==emotion])[1]
    data,sr=librosa.load(path)
    wave_plot(data,sr,emotion,colors[emotion])
    spectogram(data,sr,emotion)
    audio_path.append(path)

In [None]:
print('Disgust Audio Sample\n')
IPython.display.Audio(audio_path[0])

In [None]:
print('Happy Audio Sample\n')
IPython.display.Audio(audio_path[1])

In [None]:
print('Sad Audio Sample\n')
IPython.display.Audio(audio_path[2])

In [None]:
print('Neutral Audio Sample\n')
IPython.display.Audio(audio_path[3])

In [None]:
print('Fear Audio Sample\n')
IPython.display.Audio(audio_path[4])

In [None]:
print('Angry Audio Sample\n')
IPython.display.Audio(audio_path[5])

In [None]:
print('Surprise Audio Sample\n')
IPython.display.Audio(audio_path[6])

<a id="7"></a>
## Audio Augmentation

In [None]:
def add_noise(data,random=False,rate=0.035,threshold=0.075):
    if random:
        rate=np.random.random()*threshold
    noise=rate*np.random.uniform()*np.amax(data)
    augmented_data=data+noise*np.random.normal(size=data.shape[0])
    return augmented_data

def shifting(data,rate=1000):
    augmented_data=int(np.random.uniform(low=-5,high=5)*rate)
    augmented_data=np.roll(data,augmented_data)
    return augmented_data

def pitching(data,sr,pitch_factor=0.7,random=False):
    if random:
        pitch_factor=np.random.random() * pitch_factor
    return librosa.effects.pitch_shift(data,sr=sr,n_steps=pitch_factor)

def streching(data,rate=0.8):
    return librosa.effects.time_stretch(data,rate=rate)
    

In [None]:
data,sr=librosa.load(audio_path[6])

<a id="4"></a>
### Original Audio

In [None]:
plt.figure(figsize=(12,5))
librosa.display.waveshow(data,sr=sr,color='#EE00FF')
IPython.display.Audio(audio_path[6])

<a id="8"></a>
### Noised Audio

In [None]:
noised_audio=add_noise(data)
plt.figure(figsize=(12,5))
librosa.display.waveshow(noised_audio,sr=sr,color='#EE00FF')
IPython.display.Audio(noised_audio,rate=sr)

<a id="10"></a>
### Streched Audio

In [None]:
stretched_audio=streching(data)
plt.figure(figsize=(12,5))
librosa.display.waveshow(stretched_audio,sr=sr,color='#EE00FF')
IPython.display.Audio(stretched_audio,rate=sr)

<a id="11"></a>
### Shifted Audio

In [None]:
shifted_audio=shifting(data)
plt.figure(figsize=(12,5))
librosa.display.waveshow(shifted_audio,sr=sr,color='#EE00FF')
IPython.display.Audio(shifted_audio,rate=sr)

<a id="12"></a>
### Pitched Audio

In [None]:
pitched_audio=pitching(data,sr)
plt.figure(figsize=(12,5))
librosa.display.waveshow(pitched_audio,sr=sr,color='#EE00FF')
IPython.display.Audio(pitched_audio,rate=sr)

<a id="13"></a>
## Feature Extraction

In [None]:
def zcr(data,frame_length,hop_length):
    zcr=librosa.feature.zero_crossing_rate(data,frame_length=frame_length,hop_length=hop_length)
    return np.squeeze(zcr)
def rmse(data,frame_length=2048,hop_length=512):
    rmse=librosa.feature.rms(y=data,frame_length=frame_length,hop_length=hop_length)
    return np.squeeze(rmse)
def mfcc(data,sr,frame_length=2048,hop_length=512,flatten:bool=True):
    mfcc=librosa.feature.mfcc(y=data,sr=sr)
    return np.squeeze(mfcc.T)if not flatten else np.ravel(mfcc.T)

def extract_features(data,sr,frame_length=2048,hop_length=512):
    result=np.array([])
    
    result=np.hstack((result,
                      zcr(data,frame_length,hop_length),
                      rmse(data,frame_length,hop_length),
                      mfcc(data,sr,frame_length,hop_length)
                     ))
    return result

def get_features(path,duration=2.5, offset=0.6):
    data,sr=librosa.load(path,duration=duration,offset=offset)
    aud=extract_features(data,sr)
    audio=np.array(aud)
    
    noised_audio=add_noise(data,random=True)
    aud2=extract_features(noised_audio,sr)
    audio=np.vstack((audio,aud2))
    
    pitched_audio=pitching(data,sr,random=True)
    aud3=extract_features(pitched_audio,sr)
    audio=np.vstack((audio,aud3))
    
    pitched_audio1=pitching(data,sr,random=True)
    pitched_noised_audio=add_noise(pitched_audio1,random=True)
    aud4=extract_features(pitched_noised_audio,sr)
    audio=np.vstack((audio,aud4))
    
    return audio

<a id="14"></a>
## Processing

In [None]:
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,random_state=42,test_size=0.1,shuffle=True)
X_train.shape, X_test.shape, X_val.shape, y_train.shape,y_test.shape,y_val.shape

In [None]:
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)
X_val=scaler.transform(X_val)
X_train.shape,X_test.shape,X_val.shape,y_train.shape,y_test.shape,y_val.shape

In [None]:
X_train=np.expand_dims(X_train,axis=2)
X_val=np.expand_dims(X_val,axis=2)
X_test=np.expand_dims(X_test,axis=2)
X_train.shape, X_test.shape, X_val.shape

In [None]:
X,Y=[],[]
for path,emotion,index in zip(main_df.File,main_df.Emotion,range(main_df.File.shape[0])):
    features=get_features(main_Path + emotion + "/" + path)
    if index%500==0:
        print(f'{index} audio has been processed')
    for i in features:
        X.append(i)
        Y.append(emotion)
print('Done')

In [None]:
extract=pd.DataFrame(X)
extract['Emotion']=Y
extract.to_csv("processed_data.csv",index=False)
extract.head(10)

<a id="15"></a>
# Including Analyzed Audio Features

In [None]:
df=pd.read_csv("processed_data.csv")
df.shape
df=df.sample(frac=1).reset_index(drop=True)
print(df.shape)


In [None]:
df=df.fillna(0)
print(df.isna().any())
df.shape

<a id="16"></a>
# Processing Analyzed Data for Training

In [None]:
X=df.drop(labels='Emotion',axis=1)
Y=df['Emotion']

#Added conversion to number
indexes = {'Disgusted':0,'Happy':1,'Sad':2,'Neutral':3,'Fearful':4,'Angry':5,'Suprised':6}
Y = Y.map(indexes)
Y.head()

In [None]:
# lb=LabelEncoder()
# Y=np_utils.to_categorical(lb.fit_transform(Y.astype(str)))
# print(lb.classes_)
# Y

import torch
lb=LabelEncoder()
Y = torch.tensor(Y)
Y = lb.fit_transform(Y)
Y = torch.nn.functional.one_hot(torch.tensor(Y)).float()
print(lb.classes_)
Y.shape

<a id="17"></a>
## Train, Test and Validation Spliting

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,Y,random_state=42,test_size=0.2,shuffle=True)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,random_state=42,test_size=0.1,shuffle=True)
X_train.shape, X_test.shape, X_val.shape, y_train.shape,y_test.shape,y_val.shape

In [None]:
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)
X_val=scaler.transform(X_val)
X_train.shape,X_test.shape,X_val.shape,y_train.shape,y_test.shape,y_val.shape

In [None]:
X_train=np.expand_dims(X_train,axis=2)
X_val=np.expand_dims(X_val,axis=2)
X_test=np.expand_dims(X_test,axis=2)
X_train.shape, X_test.shape, X_val.shape

<a id="19"></a>
# Begin Training

In [443]:
# model=tf.keras.Sequential([
#     L.Conv1D(512,kernel_size=5, strides=1,padding='same', activation='relu',input_shape=(X_train.shape[1],1)),
#     L.BatchNormalization(),
#     L.MaxPool1D(pool_size=5,strides=2,padding='same'),
#     L.Conv1D(512,kernel_size=5,strides=1,padding='same',activation='relu'),
#     L.BatchNormalization(),
#     L.MaxPool1D(pool_size=5,strides=2,padding='same'),
#     L.Conv1D(256,kernel_size=5,strides=1,padding='same',activation='relu'),
#     L.BatchNormalization(),
#     L.MaxPool1D(pool_size=5,strides=2,padding='same'),
#     L.Conv1D(256,kernel_size=3,strides=1,padding='same',activation='relu'),
#     L.BatchNormalization(),
#     L.MaxPool1D(pool_size=5,strides=2,padding='same'),
#     L.Conv1D(128,kernel_size=3,strides=1,padding='same',activation='relu'),
#     L.BatchNormalization(),
#     L.MaxPool1D(pool_size=3,strides=2,padding='same'),
#     L.Flatten(),
#     L.Dense(512,activation='relu'),
#     L.BatchNormalization(),
#     L.Dense(7,activation='softmax')
# ])
# model.compile(optimizer='adam',loss='categorical_crossentropy',metrics='accuracy')


# New code to define model in pytorch

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class AudioModel(nn.Module):
    def __init__(self):
        super(AudioModel, self).__init__()
        
        # Define the model to use
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=512, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(512)
        self.pool1 = nn.MaxPool1d(kernel_size=5, stride=2, padding=2)
        
        self.conv2 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(512)
        self.pool2 = nn.MaxPool1d(kernel_size=5, stride=2, padding=2)
        
        self.conv3 = nn.Conv1d(in_channels=512, out_channels=256, kernel_size=5, stride=1, padding=2)
        self.bn3 = nn.BatchNorm1d(256)
        self.pool3 = nn.MaxPool1d(kernel_size=5, stride=2, padding=2)
        
        self.conv4 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm1d(256)
        self.pool4 = nn.MaxPool1d(kernel_size=5, stride=2, padding=2)
        
        self.conv5 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.bn5 = nn.BatchNorm1d(128)
        self.pool5 = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
        
        self.fc1 = nn.Linear(in_features=128*75, out_features=512) # adjust the input size
        self.bn6 = nn.BatchNorm1d(512) # adjust the input size
        
        self.fc2 = nn.Linear(in_features=512, out_features=7)
        
    def forward(self, x):
        x = self.pool1(self.bn1(F.relu(self.conv1(x))))
        x = self.pool2(self.bn2(F.relu(self.conv2(x))))
        x = self.pool3(self.bn3(F.relu(self.conv3(x))))
        x = self.pool4(self.bn4(F.relu(self.conv4(x))))
        x = self.pool5(self.bn5(F.relu(self.conv5(x))))
        x = x.view(-1, 128*75) # flatten the tensor
        x = self.fc1(x)
        x = nn.functional.relu(x)
        x = self.bn6(x)
        x = self.fc2(x)
        x = nn.functional.softmax(x, dim=1) # add softmax activation
        return x
    # def __init__(self, learning_rate=2e-4):
    #     super(AudioModel, self).__init__()
        
    #     # Define the model to use
    #     self.conv1 = nn.Conv1d(in_channels=1, out_channels=512, kernel_size=5, stride=1, padding=2)
    #     self.bn1 = nn.BatchNorm1d(512)
    #     self.pool1 = nn.MaxPool1d(kernel_size=5, stride=2, padding=2)
        
    #     self.conv2 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=5, stride=1, padding=2)
    #     self.bn2 = nn.BatchNorm1d(512)
    #     self.pool2 = nn.MaxPool1d(kernel_size=5, stride=2, padding=2)
        
    #     self.conv3 = nn.Conv1d(in_channels=512, out_channels=256, kernel_size=5, stride=1, padding=2)
    #     self.bn3 = nn.BatchNorm1d(256)
    #     self.pool3 = nn.MaxPool1d(kernel_size=5, stride=2, padding=2)
        
    #     self.conv4 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
    #     self.bn4 = nn.BatchNorm1d(256)
    #     self.pool4 = nn.MaxPool1d(kernel_size=5, stride=2, padding=2)
        
    #     self.conv5 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, stride=1, padding=1)
    #     self.bn5 = nn.BatchNorm1d(128)
    #     self.pool5 = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
        
    #     self.fc1 = nn.Linear(in_features=75, out_features=512)
    #     # self.fc1 = nn.Linear(in_features=128*16, out_features=512)
    #     self.bn6 = nn.BatchNorm1d(128)
        
    #     self.fc2 = nn.Linear(in_features=512, out_features=7)
    #     self.out = nn.Conv1d(128, 1, 1)
    #     self.softmax = nn.Softmax(dim=1)

    #     # Model from tutorial start with something I know works
    #     # self.conv1 = nn.Conv1d(in_channels = 1, out_channels=60, kernel_size=3, padding=1)
    #     # self.conv2 = nn.Conv1d(in_channels=60, out_channels=60, kernel_size=3, padding=1)
    #     # self.conv3 = nn.Conv1d(in_channels=60, out_channels=60, kernel_size=3, padding=1)
    #     # self.conv4 = nn.Conv1d(in_channels=60, out_channels=60, kernel_size=3, padding=1)
    #     # self.classifier = nn.Linear(2376, 7)
    #     # self.out = nn.Conv1d(in_channels=60, out_channels=7, kernel_size=1, padding=0) 

    #     # self.fc1 = nn.Linear(in_features=60, out_features=1)
        
        
    # def forward(self, x):
    #     x = self.pool1(self.bn1(F.relu(self.conv1(x))))
    #     x = self.pool2(self.bn2(F.relu(self.conv2(x))))
    #     x = self.pool3(self.bn3(F.relu(self.conv3(x))))
    #     x = self.pool4(self.bn4(F.relu(self.conv4(x))))
    #     x = self.pool5(self.bn5(F.relu(self.conv5(x))))
    #     # x = self.bn5(F.relu(self.conv5(x)))
    #     # x = x.view(-1, 128*16) #Not sure what this is. what you doing ChatGPT?
    #     x = self.fc1(x)
    #     x = nn.functional.relu(x)
    #     x = self.bn6(x)
    #     x = self.fc2(x)
    #     x = self.out(x)
    #     x = self.softmax(x)
    #     # x = F.relu(self.conv1(x))
    #     # x = F.relu(self.conv2(x))
    #     # x = F.relu(self.conv3(x))
    #     # x = F.relu(self.conv4(x))
    #     # x = self.classifier(x)
    #     # # x = self.out(x)
    #     # # x = self.fc1(x)
    #     # x = x.view(-1, 7) 
    #     return x

audio_model = AudioModel()
audio_model

AudioModel(
  (conv1): Conv1d(1, 512, kernel_size=(5,), stride=(1,), padding=(2,))
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=5, stride=2, padding=2, dilation=1, ceil_mode=False)
  (conv2): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=5, stride=2, padding=2, dilation=1, ceil_mode=False)
  (conv3): Conv1d(512, 256, kernel_size=(5,), stride=(1,), padding=(2,))
  (bn3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=5, stride=2, padding=2, dilation=1, ceil_mode=False)
  (conv4): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn4): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool1d(kernel_size=5, stride=2, padding=2, dilation=1, ceil_mode=F

<a id="18"></a>
## Tuning for Training

In [444]:
# early_stop=EarlyStopping(monitor='val_acc',mode='auto',patience=5,restore_best_weights=True)
# lr_reduction=ReduceLROnPlateau(monitor='val_acc',patience=3,verbose=1,factor=0.5,min_lr=0.00001)
#this code block intially came before defining the model. 


# New Code from the denoising tutorial

from tqdm import tqdm

class EarlyStopper():
    def __init__(self, patience: int = 3):
        self.best_val = +np.inf
        self.current_val = +np.inf
        self.patience = patience
        self.steps_since_last_best = 0
        
    def __call__(self, val) -> bool:
        self.current_val = val
        self._update_best()
        if self.steps_since_last_best == self.patience:
            return True
        else:
            return False
        
    def _update_best(self):
        if self.current_val < self.best_val:
            tqdm.write(f"New best: {self.current_val:.4f}")
            self.best_val=self.current_val
            self.steps_since_last_best = 0
        else:
            self.steps_since_last_best+=1

optim = torch.optim.Adam(params=audio_model.parameters())
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optim,
    "min",
    factor=0.1,
    patience=2,
    verbose=True
)

In [445]:
# history=model.fit(X_train, y_train, epochs=10, validation_data=(X_val,y_val), batch_size=64)
# model.save("res_model.h5")


# New code to define a model trainer

def train(model, optim, scheduler, criterion, train_loader) -> None:
    device = next(iter(audio_model.parameters())).device # Check if this can be just model
    model.train()
    total_loss = 0
    for x,y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_pred = model(x)
        y_pred = y_pred.reshape(8, 7, -1)
        # print(f"y_pred = {y_pred.shape}, y = {y.shape}")
        loss = criterion(y_pred.squeeze(), y)
        loss.backward()
        optim.step()
        optim.zero_grad()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    scheduler.step(avg_loss)


In [446]:
X_train.shape, y_train.shape

((36857, 2376, 1), torch.Size([36857, 7]))

In [447]:
# X1 = X_train[:16, :]
# Y1 = y_train[:16, :]

# X1.shape, Y1.shape

((16, 2376, 1), torch.Size([16, 7]))

In [448]:
from torch.utils.data import TensorDataset, DataLoader

def get_data_loader(x,y,bs):
    # Permute np.array from NLC TO NCL
    x = torch.permute(torch.Tensor(x), dims=(0, 2, 1))
    y = torch.Tensor(y)
    return DataLoader(
        TensorDataset(x, y),
        batch_size=bs
    )

# train_loader = get_data_loader(X1, Y1, 8)
train_loader = get_data_loader(X_train, y_train, 8)
val_loader = get_data_loader(X_val, y_val, 8)
test_loader = get_data_loader(X_test, y_test, 8)

for x, y in train_loader:
    break

print(x.shape)
print(y.shape)

torch.Size([8, 1, 2376])
torch.Size([8, 7])


In [449]:
@torch.no_grad()
def validate(model, val_loader, criterion) -> float: 
    model.eval()
    device = next(iter(audio_model.parameters())).device
    total_loss = 0
    for x,y in val_loader:
        x = x.to(device)
        y = y.to(device)
        y_pred = model(x)
        y_pred = y_pred.reshape(8, 7, -1)
        # print(f"y_pred = {y_pred.shape}, y = {y.shape}")
        loss = criterion(y_pred.squeeze(), y)
        # loss = criterion(y_pred.reshape(8, 7, -1), y) # Remove channel dim from y_pred
        # loss = criterion(y_pred, y)
        total_loss+=loss.item()
    return total_loss / len(val_loader)

In [450]:
model_name = "./best_model.pt"
epochs = 100
criterion = nn.CrossEntropyLoss()
early_stopper = EarlyStopper(patience=8)
audio_model.to(device)

for _ in tqdm(range(epochs)):
    train(audio_model, optim, lr_scheduler, criterion, train_loader)
    val_mse = validate(audio_model, val_loader, criterion)
    if early_stopper(val_mse):
        break
    if early_stopper.steps_since_last_best == 0:
        torch.save(audio_model, model_name)

 20%|██        | 1/5 [03:06<12:27, 186.85s/it]

New best: 1.9542


 20%|██        | 1/5 [03:28<13:52, 208.11s/it]


KeyboardInterrupt: 

In [451]:
# net = AudioModel()
net = torch.load(model_name)


In [463]:
correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for data in test_loader:
        images, labels = data
        # calculate outputs by running images through the network
        outputs = net(images)
        # the class with the highest energy is what we choose as prediction
        # print(labels)
        _, labels = torch.max(labels.data, 1)
        _, predicted = torch.max(outputs.data, 1)

        l1 = labels
        p1 = predicted
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the test images: {100 * correct / total} %')

Accuracy of the network on the test images: 8.0 %


<a id="20"></a>
# Drawing Charts

<a id="21"></a>
## Accuracy Charts

In [None]:
fig=px.line(history.history,y=['accuracy','val_accuracy'],
           labels={'index':'epoch','value':'accuracy'},
           title=f'According to the epoch accuracy and validation accuracy chart for the model')
fig.show()

<a id="22"></a>
## Loss Charts

In [None]:
fig=px.line(history.history,y=['loss','val_loss'],
           labels={'index':'epoch','value':'loss'},
           title=f'According to the epoch loss and validation loss chart for the model')
fig.show()

<a id="24"></a>
# Testing Model and Test Results

In [None]:
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
y_pred

In [None]:
y_check=np.argmax(y_test,axis=1)
y_check

In [None]:
loss,accuracy=model.evaluate(X_test,y_test,verbose=0)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

<a id="25"></a>
## Confusion Matrix

In [None]:
conf=confusion_matrix(y_check,y_pred)
cm=pd.DataFrame(
    conf,index=[i for i in emotion_names],
    columns=[i for i in emotion_names]
)
plt.figure(figsize=(12,7))
ax=sns.heatmap(cm,annot=True,fmt='d')
ax.set_title(f'confusion matrix for model ')
plt.show()

In [None]:
print(f'Model Confusion Matrix\n',classification_report(y_check,y_pred,target_names=emotion_names))

In [None]:

model.save("res_model.h5")

# save tflite model for android
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
open("res_model.tflite", "wb").write(tflite_model)


In [None]:
import sounddevice as sd
import soundfile as sf
import numpy as np

fs = 44100  # Sample rate
seconds = 3  # Duration of recording

myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=1)
sd.wait()  # Wait until recording is finished
path = 'test.wav'
sf.write(path, myrecording, fs)  # Save as WAV file

# play the sample
sd.play(myrecording, fs)

features=get_features(path)
features=np.expand_dims(features,axis=0)
features=np.expand_dims(features,axis=2)
features.shape

pred=model.predict(features)
# print predicted emotion name and the probability
print(emotion_names[np.argmax(pred)])
print(np.max(pred))



