In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

import sys
print(sys.path)

['/home/nesl/anaconda3/envs/iros24/lib/python312.zip', '/home/nesl/anaconda3/envs/iros24/lib/python3.12', '/home/nesl/anaconda3/envs/iros24/lib/python3.12/lib-dynload', '', '/home/nesl/anaconda3/envs/iros24/lib/python3.12/site-packages']


In [2]:
import glob
import shutil
import random
import numpy as np
import pandas as pd
from os.path import join
from torch.utils.data import Dataset
import torch
from torch.utils.tensorboard import SummaryWriter
from torchinfo import summary
import configparser
import matplotlib.pyplot as plt
from tqdm import tqdm
import datetime
import json

In [3]:
from torch.utils.data import DataLoader
from torch import nn
from torch import optim
from multimodal import *

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [7]:
time_window = 1
audio_rate = 16000
audio_input_length = int(audio_rate * time_window)
n_train_multi_data_per_class = 800
n_test_multi_data_per_class = 200

audio_train_set = ESC70Select(
    time_window=time_window,
    folds=[1, 2, 3, 4],
    transforms=lambda x: nn.functional.pad(x,((audio_input_length-x.shape[1])//2, (audio_input_length-x.shape[1])//2))\
        if (x.shape[1] % 2) == 0 \
        else nn.functional.pad(x,((audio_input_length-x.shape[1])//2, (audio_input_length-x.shape[1])//2 + 1)), 
    overwrite=False,
    use_bc_learning=False,
    audio_rate=audio_rate)

imu_train_set = WISDMSelect(
    folds=[1, 2, 3, 4],
    time_window=time_window,
    overwrite=False,
    normalize_acc=True)

audio_train_loader = torch.utils.data.DataLoader(audio_train_set, 
                                            batch_size=32, 
                                            shuffle=True)

imu_train_loader = DataLoader(imu_train_set, batch_size=128, 
                            shuffle=True, num_workers=4)

multimodal_train_set = MultimodalDataset(audio_train_set, 
                                         imu_train_set, 
                                         num_data_per_class=n_train_multi_data_per_class, 
                                         time_window=time_window,
                                         overwrite=False
                                         )

print('Audio data: ({}, {}, {})'.format(len(audio_train_set.sounds), 
                          audio_train_set.sounds[0].shape[0], 
                          audio_train_set.sounds[0].shape[1]))
print('IMU data: ({}, {}, {})'.format(len(imu_train_set.imus), 
                          imu_train_set.imus[0].shape[0], 
                          imu_train_set.imus[0].shape[1]))
print('Audio data: ({}, {}, {})'.format(len(multimodal_train_set.sounds), 
                          multimodal_train_set.sounds[0].shape[0], 
                          multimodal_train_set.sounds[0].shape[1]))
print('IMU data: ({}, {}, {})'.format(len(multimodal_train_set.imus), 
                          multimodal_train_set.imus[0].shape[0], 
                          multimodal_train_set.imus[0].shape[1]))

print(multimodal_train_set.get_label_mapping())

multimodal_train_loader = DataLoader(multimodal_train_set, batch_size=8, 
                            shuffle=True, num_workers=4)
 

./Audio/ESC50/meta/esc50.csv
./Audio/kitchen20/kitchen20.csv
./Audio/silent_sound/silent_sound.csv
loading  fold1
loading  fold2
loading  fold3
loading  fold4
loading  fold1
loading  fold2
loading  fold3
loading  fold4
Loading...
Audio data: (480, 1, 16000)
IMU data: (60613, 6, 20)
Audio data: (7200, 1, 16000)
IMU data: (7200, 6, 20)
{'brush_teeth': 0, 'click_mouse': 1, 'drink': 2, 'eat': 3, 'flush_toilet': 4, 'sit': 5, 'type': 6, 'walk': 7, 'wash': 8}


In [8]:
audio_test_set = ESC70Select(
    time_window=time_window,
    folds=[5],
    transforms=lambda x: nn.functional.pad(x,((audio_input_length-x.shape[1])//2, (audio_input_length-x.shape[1])//2))\
        if (x.shape[1] % 2) == 0 \
        else nn.functional.pad(x,((audio_input_length-x.shape[1])//2, (audio_input_length-x.shape[1])//2 + 1)), 
    overwrite=False,
    use_bc_learning=False,
    audio_rate=audio_rate)

audio_test_loader = DataLoader(audio_test_set, batch_size=32, 
                            shuffle=False, num_workers=2)

imu_test_set = WISDMSelect(
    folds=[5],
    time_window=time_window,
    overwrite=False,
    normalize_acc=True)

imu_test_loader = DataLoader(imu_test_set, batch_size=128, 
                            shuffle=False, num_workers=2)


multimodal_test_set = MultimodalDataset(audio_test_set, 
                                        imu_test_set, 
                                        num_data_per_class=n_test_multi_data_per_class,
                                        time_window=time_window,
                                        overwrite=False
                                        )


print('Audio data: ({}, {}, {})'.format(len(multimodal_test_set.sounds), 
                          multimodal_test_set.sounds[0].shape[0], 
                          multimodal_test_set.sounds[0].shape[1]))
print('IMU data: ({}, {}, {})'.format(len(multimodal_test_set.imus), 
                          multimodal_test_set.imus[0].shape[0], 
                          multimodal_test_set.imus[0].shape[1]))

multimodal_test_loader = DataLoader(multimodal_test_set, batch_size=8, 
                            shuffle=True, num_workers=4)

./Audio/ESC50/meta/esc50.csv
./Audio/kitchen20/kitchen20.csv
./Audio/silent_sound/silent_sound.csv
loading  fold5
loading  fold5
Loading...
Audio data: (1800, 1, 16000)
IMU data: (1800, 6, 20)


# Audio Module: BEATs
Use a pre-trained model to extract sound features.

In [7]:
sys.path.append('/home/liying/Documents/MS thesis/master-thesis/BEATs')
print(sys.path)

['/home/nesl/Documents/IROS24/CED_Methods_Eval', '/home/nesl/anaconda3/envs/iros24/lib/python312.zip', '/home/nesl/anaconda3/envs/iros24/lib/python3.12', '/home/nesl/anaconda3/envs/iros24/lib/python3.12/lib-dynload', '', '/home/nesl/anaconda3/envs/iros24/lib/python3.12/site-packages', '/home/liying/Documents/MS thesis/master-thesis/BEATs']


In [8]:
audio_train_set.get_label_mapping()

{'blender': 0,
 'no_sound': 1,
 'stove-burner': 2,
 'water-flowing': 3,
 'drawer': 4,
 'clean-dishes': 5,
 'chopping': 6,
 'eat': 7,
 'peel': 8,
 'toilet_flush': 9,
 'footsteps': 10,
 'brushing_teeth': 11,
 'drinking_sipping': 12,
 'mouse_click': 13,
 'keyboard_typing': 14}

In [9]:
from BEATs import BEATs, BEATsConfig

# load the pre-trained checkpoints
checkpoint = torch.load('./BEATs/BEATs_iter3_plus_AS2M.pt')

cfg = BEATsConfig(checkpoint['cfg'])
BEATs_model = BEATs(cfg)
BEATs_model.load_state_dict(checkpoint['model'])
BEATs_model.eval()

# extract the the audio representation
audio_input_16khz = audio_train_set.sounds[0]
padding_mask = torch.zeros_like(audio_train_set.sounds[0]).bool()

representation = BEATs_model.extract_features(audio_input_16khz, padding_mask=padding_mask)[0]

ModuleNotFoundError: No module named 'BEATs'

In [10]:
print(summary(BEATs_model))

Layer (type:depth-idx)                                  Param #
BEATs                                                   --
├─Linear: 1-1                                           393,984
├─Conv2d: 1-2                                           131,072
├─Dropout: 1-3                                          --
├─TransformerEncoder: 1-4                               --
│    └─Sequential: 2-1                                  --
│    │    └─Conv1d: 3-1                                 4,719,488
│    │    └─SamePad: 3-2                                --
│    │    └─GELU: 3-3                                   --
│    └─ModuleList: 2-2                                  --
│    │    └─TransformerSentenceEncoderLayer: 3-4        7,092,244
│    │    └─TransformerSentenceEncoderLayer: 3-5        7,092,244
│    │    └─TransformerSentenceEncoderLayer: 3-6        7,092,244
│    │    └─TransformerSentenceEncoderLayer: 3-7        7,092,244
│    │    └─TransformerSentenceEncoderLayer: 3-8        7,092,244

## Experiment: finetune model on ESC70Select dataset

In [11]:
class BEATsFinetuned(nn.Module):
    def __init__(self, BEATs_pretrained_model, n_class, predictor_dropout=0.0):
        super().__init__()
        self.BEATs = BEATs_pretrained_model
        self.predictor_dropout = nn.Dropout(predictor_dropout)
        self.predictor = nn.Linear(768, n_class)
 
    def forward(self, x):
        x = self.BEATs.extract_features(x, padding_mask=torch.zeros_like(x).bool())[0]
        x = self.predictor_dropout(x)
        logits = self.predictor(x).mean(dim=1)
        lprobs = torch.sigmoid(logits)
        return lprobs

    def extract_features(self, x):
        x = self.BEATs.extract_features(x, padding_mask=torch.zeros_like(x).bool())[0]
        x = self.predictor_dropout(x)
        logits = self.predictor(x).mean(dim=1)
        return logits

In [12]:
# Freeze the pretrained model
for param in BEATs_model.parameters():
    param.requires_grad = False

BEATs_finetuned_model = BEATsFinetuned(BEATs_model, audio_train_set.nClasses)
output = BEATs_finetuned_model(audio_train_set.sounds[0])
output.shape

torch.Size([1, 15])

### Training

In [13]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(BEATs_finetuned_model.parameters(), lr=0.001, momentum=0.9)
BEATs_finetuned_model.to(device)

print(audio_train_set.get_label_mapping())
# Training loop
n_epochs = 100
summary = {'loss': [[] for _ in range(n_epochs)], 'acc': [[] for _ in range(n_epochs)]}
for e in range(n_epochs):
    for i, (sounds, labels) in enumerate(audio_train_loader):
        optimizer.zero_grad()
        sounds = sounds.squeeze(dim=1).to(device)
        labels = labels.to(device)
        # Run the Net
        x = BEATs_finetuned_model(sounds)
        # print(x.shape)
        # print(labels.shape)
        # x = x.view(x.size()[:-1])

        # Optimize net
        loss = criterion(x, labels.long())
        loss.backward()
        optimizer.step()
        summary['loss'][e].append(loss.item())

            # Calculat accuracy
        _, pred = x.data.topk(1, dim=1)
        pred = pred.view(pred.shape[:-1])
        acc = torch.sum(pred == labels)/x.shape[0]
        summary['acc'][e].append(acc.item())
        
    print('Loss: {}, Accuracy: {}'.format(np.mean(summary['loss'][e]), np.mean(summary['acc'][e])))

{'no_sound': 0, 'blender': 1, 'stove-burner': 2, 'water-flowing': 3, 'drawer': 4, 'clean-dishes': 5, 'chopping': 6, 'eat': 7, 'peel': 8, 'toilet_flush': 9, 'footsteps': 10, 'brushing_teeth': 11, 'drinking_sipping': 12, 'mouse_click': 13, 'keyboard_typing': 14}
Loss: 2.703262436389923, Accuracy: 0.08125
Loss: 2.6863094727198282, Accuracy: 0.18125
Loss: 2.668513762950897, Accuracy: 0.3
Loss: 2.6522530714670816, Accuracy: 0.3645833333333333
Loss: 2.635932270685832, Accuracy: 0.4666666666666667
Loss: 2.620411479473114, Accuracy: 0.5125
Loss: 2.606025139490763, Accuracy: 0.5645833333333333
Loss: 2.5914759993553163, Accuracy: 0.6208333333333333
Loss: 2.577638653914134, Accuracy: 0.6666666666666666
Loss: 2.564376974105835, Accuracy: 0.7125
Loss: 2.551648751894633, Accuracy: 0.71875
Loss: 2.5392409880956013, Accuracy: 0.7375
Loss: 2.5272345622380574, Accuracy: 0.7625
Loss: 2.5158973693847657, Accuracy: 0.7604166666666666
Loss: 2.5044994155565896, Accuracy: 0.7666666666666667
Loss: 2.4939009388

### Testing

In [14]:
test_accuracy = []
for i, (sounds, labels) in enumerate(audio_test_loader):
        # Run the Net
        sounds = sounds.squeeze(dim=1).to(device)
        labels = labels.to(device)
        x = BEATs_finetuned_model(sounds)

        # loss = criterion(x, labels.long())
        # summary['loss'][e].append(loss.item())
        # Calculat accuracy
        _, pred = x.data.topk(1, dim=1)
        pred = pred.view(pred.shape[:-1])
        acc = torch.sum(pred == labels)/x.shape[0]
        summary['acc'][e].append(acc.item())
print(np.mean(summary['acc'][e]))

0.89


# IMU Module: LIMU-BERT
A pre-trained autoencoder model using LIMU-BERT architecture to extract acceleration features.

In [11]:
sys.path.append('/home/liying/Documents/MS thesis/master-thesis/LIMUBert')
print(sys.path)

['/home/liying/Documents/MS thesis/master-thesis', '/home/liying/miniconda3/envs/pytorch-gpu/lib/python39.zip', '/home/liying/miniconda3/envs/pytorch-gpu/lib/python3.9', '/home/liying/miniconda3/envs/pytorch-gpu/lib/python3.9/lib-dynload', '', '/home/liying/miniconda3/envs/pytorch-gpu/lib/python3.9/site-packages', '/home/liying/Documents/MS thesis/master-thesis/BEATs', '/home/liying/Documents/MS thesis/master-thesis/LIMUBert']


In [12]:
from LIMUBert.utils import load_model_config, Preprocess4Normalization, IMUDataset
from LIMUBert.models import LIMUBertModel4Pretrain

In [13]:
# Load LIMU-BERT model

model_cfg = load_model_config('pretrain_base', 'base', 'v1', path_bert='LIMUBert/config/limu_bert.json')
if model_cfg is None:
    print("Unable to find corresponding model config!")

pipeline = [Preprocess4Normalization(model_cfg.feature_num)]
LIMUBert_model = LIMUBertModel4Pretrain(model_cfg, output_embed=True)

In [14]:
# load the pre-trained checkpoints
checkpoint = torch.load('./LIMUBert/saved/pretrain_base_wisdm_20_100/wisdm.pt')

# cfg = BEATsConfig(checkpoint['cfg'])
# BEATs_model = BEATs(cfg)
LIMUBert_model.load_state_dict(checkpoint)
# LIMUBert_model.eval()
print(summary(LIMUBert_model))

Layer (type:depth-idx)                   Param #
LIMUBertModel4Pretrain                   --
├─Transformer: 1-1                       --
│    └─Embeddings: 2-1                   --
│    │    └─Linear: 3-1                  504
│    │    └─Embedding: 3-2               8,640
│    │    └─LayerNorm: 3-3               144
│    └─MultiHeadedSelfAttention: 2-2     --
│    │    └─Linear: 3-4                  5,256
│    │    └─Linear: 3-5                  5,256
│    │    └─Linear: 3-6                  5,256
│    └─Linear: 2-3                       5,256
│    └─LayerNorm: 2-4                    144
│    └─PositionWiseFeedForward: 2-5      --
│    │    └─Linear: 3-7                  10,512
│    │    └─Linear: 3-8                  10,440
│    └─LayerNorm: 2-6                    144
├─Linear: 1-2                            5,256
├─Linear: 1-3                            5,256
├─LayerNorm: 1-4                         144
├─Linear: 1-5                            438
Total params: 62,646
Trainable param

In [15]:
LIMUBert_model.to('cpu')
print(len(imu_test_loader.dataset.imus))
# for i, (imus, labels) in enumerate(imu_test_loader):
#     # Run the Net
# #     print(imus.shape)
#     imus = imus.transpose(-1, 1).to('cpu')
#     labels = labels.to('cpu')
#     x = LIMUBert_model(imus)
#     if i % 50 == 0:
#         print(x.shape)
# print(np.mean(summary['acc'][e]))

# test_output=LIMUBert_model(torch.rand(1, 100, 6))
# test_output.shape

14398


## Experiment: finetune model on WISDMSelect dataset

In [12]:
class LIMUBertFinetuned(nn.Module):
    def __init__(self, LIMUBert_pretrained_model, n_class, predictor_dropout=0.0):
        super().__init__()
        self.LIMUBert = LIMUBert_pretrained_model
        self.predictor_dropout = nn.Dropout(predictor_dropout)
#         self.predictor = nn.Linear(72, n_class)
        self.predictor = GRU(72, n_class)
 
    def forward(self, x):
        x = self.LIMUBert(x.transpose(-1,1)) # Input to LIMUBert model is N * L * C
        x = self.predictor_dropout(x)
#         logits = self.predictor(x).mean(dim=1)
        logits = self.predictor(x)
        lprobs = torch.sigmoid(logits)
        return lprobs

    def extract_features(self, x):
        x = self.LIMUBert(x.transpose(-1,1))
        x = self.predictor_dropout(x)
#         logits = self.predictor(x).mean(dim=1)
        logits = self.predictor(x)
        return logits
    
class GRU(nn.Module):
    def __init__(self, input_feature_dim, output_feature_dim, training=False):
        super().__init__()
        self.dropout = True
        self.num_rnn = 2
        self.num_linear = 1
        self.rnn_io = [[input_feature_dim, 20], [20, output_feature_dim]]
        self.num_layers = [2, 1]
        for i in range(self.num_rnn):
            self.__setattr__('gru' + str(i), nn.GRU(self.rnn_io[i][0], self.rnn_io[i][1], num_layers=self.num_layers[i],
                                         batch_first=True))
        

    def forward(self, input_seqs, training=False):
        h = input_seqs
        for i in range(self.num_rnn):
            rnn = self.__getattr__('gru' + str(i))
            h, _ = rnn(h)
            h = nn.functional.relu(h)
#         print(h.shape)  
        h = h[:, -1, :]
#         print(h.shape)
        if self.dropout:
            h = nn.functional.dropout(h, training=training)
        return h

In [13]:
# Freeze the pretrained model
for param in LIMUBert_model.parameters():
    param.requires_grad = False
    
LIMUBert_finetuned_model = LIMUBertFinetuned(LIMUBert_model, imu_train_set.nClasses)
# output = LIMUBert_finetuned_model(imu_train_set.imus[0].unsqueeze(0))
# output.shape
print(summary(LIMUBert_finetuned_model))

Layer (type:depth-idx)                        Param #
LIMUBertFinetuned                             --
├─LIMUBertModel4Pretrain: 1-1                 --
│    └─Transformer: 2-1                       --
│    │    └─Embeddings: 3-1                   (9,288)
│    │    └─MultiHeadedSelfAttention: 3-2     (15,768)
│    │    └─Linear: 3-3                       (5,256)
│    │    └─LayerNorm: 3-4                    (144)
│    │    └─PositionWiseFeedForward: 3-5      (20,952)
│    │    └─LayerNorm: 3-6                    (144)
│    └─Linear: 2-2                            (5,256)
│    └─Linear: 2-3                            (5,256)
│    └─LayerNorm: 2-4                         (144)
│    └─Linear: 2-5                            (438)
├─Dropout: 1-2                                --
├─GRU: 1-3                                    --
│    └─GRU: 2-6                               8,160
│    └─GRU: 2-7                               720
Total params: 71,526
Trainable params: 8,880
Non-trainable params

### Training

In [16]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(LIMUBert_finetuned_model.parameters(), lr=0.001, momentum=0.9)
# optimizer = optim.Adam(params=LIMUBert_finetuned_model.parameters(), lr=1e-3)
LIMUBert_finetuned_model.to(device)

imu_train_loader = DataLoader(imu_train_set, batch_size=128, 
                            shuffle=True, num_workers=4)
print(imu_train_set.get_label_mapping())
# Training loop
n_epochs = 50
summary = {'loss': [[] for _ in range(n_epochs)], 'acc': [[] for _ in range(n_epochs)]}
for e in range(n_epochs):
    for i, (imus, labels) in enumerate(imu_train_loader):
        optimizer.zero_grad()
        imus = imus.to(device)
        labels = labels.to(device)
        # Run the Net
        x = LIMUBert_finetuned_model(imus)
        # print(x.shape)
        # print(labels.shape)
        # x = x.view(x.size()[:-1])

        # Optimize net
        loss = criterion(x, labels.long())
        loss.backward()
        optimizer.step()
        summary['loss'][e].append(loss.item())

            # Calculat accuracy
        _, pred = x.data.topk(1, dim=1)
        pred = pred.view(pred.shape[:-1])
        acc = torch.sum(pred == labels)/x.shape[0]
        summary['acc'][e].append(acc.item())
        
    print('Loss: {}, Accuracy: {}'.format(np.mean(summary['loss'][e]), np.mean(summary['acc'][e])))

{'walking': 0, 'jogging': 1, 'sitting': 2, 'standing': 3, 'typing': 4, 'teeth': 5, 'pasta': 6, 'drinking': 7}
Loss: 2.078253234060187, Accuracy: 0.1540648496464679
Loss: 2.0764355885355097, Accuracy: 0.1673331767320633
Loss: 2.0747011887399776, Accuracy: 0.18660479329134289
Loss: 2.073084316755596, Accuracy: 0.20871240606433467
Loss: 2.0715565054040206, Accuracy: 0.22311795118607972
Loss: 2.070141440943668, Accuracy: 0.23756109036897358
Loss: 2.068834124113384, Accuracy: 0.24810385343275573
Loss: 2.06751742613943, Accuracy: 0.2566118422307466
Loss: 2.0662729765239516, Accuracy: 0.2641635339511068
Loss: 2.065074012154027, Accuracy: 0.2707706768261759
Loss: 2.063909068860506, Accuracy: 0.2780592105890575
Loss: 2.062872856541684, Accuracy: 0.2832307330871883
Loss: 2.0618502039658395, Accuracy: 0.28916353395110683
Loss: 2.060886337882594, Accuracy: 0.2919736843360098
Loss: 2.059885213249608, Accuracy: 0.2940836467240986
Loss: 2.058993831433748, Accuracy: 0.2954816730398881
Loss: 2.05809270

In [67]:
import copy
criterion = nn.CrossEntropyLoss()
# optimizer = optim.SGD(LIMUBert_finetuned_model.parameters(), lr=0.001, momentum=0.9)
optimizer = optim.Adam(params=LIMUBert_finetuned_model.parameters(), lr=1e-3)
LIMUBert_finetuned_model.to(device)

imu_tmp_train_set = copy.deepcopy(imu_train_set)
# imu_tmp_train_set.imus[0][0:3] /= 9.8
imu_tmp_train_set.imus = [imu_tmp_train_set.imus[0]]*len(imu_tmp_train_set)
imu_tmp_train_set.labels = [imu_tmp_train_set.labels[0]]*len(imu_tmp_train_set)

imu_tmp_train_loader = DataLoader(imu_tmp_train_set, batch_size=128, 
                            shuffle=True, num_workers=4)



# Training loop
n_epochs = 100
summary = {'loss': [[] for _ in range(n_epochs)], 'acc': [[] for _ in range(n_epochs)]}
for e in range(n_epochs):
    for i, (imus, labels) in enumerate(imu_tmp_train_loader):
        optimizer.zero_grad()
        imus = imus.to(device)
        labels = labels.to(device)
        # Run the Net
        x = LIMUBert_finetuned_model(imus)
        # print(x.shape)
        # print(labels.shape)
        # x = x.view(x.size()[:-1])

        # Optimize net
        loss = criterion(x, labels.long())
        loss.backward()
        optimizer.step()
        summary['loss'][e].append(loss.item())

            # Calculat accuracy
        _, pred = x.data.topk(1, dim=1)
        pred = pred.view(pred.shape[:-1])
        acc = torch.sum(pred == labels)/x.shape[0]
        summary['acc'][e].append(acc.item())
        
    print('Loss: {}, Accuracy: {}'.format(np.mean(summary['loss'][e]), np.mean(summary['acc'][e])))

Loss: 1.8995697397934763, Accuracy: 1.0
Loss: 1.8805682696794208, Accuracy: 1.0
Loss: 1.880459771658245, Accuracy: 1.0
Loss: 1.8804107791499087, Accuracy: 1.0
Loss: 1.8804002197165237, Accuracy: 1.0
Loss: 1.8803832317653455, Accuracy: 1.0
Loss: 1.8803760930111533, Accuracy: 1.0
Loss: 1.8803772851040488, Accuracy: 1.0


KeyboardInterrupt: 

### Testing

In [18]:
test_summary = {'loss': [], 'acc': []}
for i, (imus, labels) in enumerate(imu_test_loader):
        # Run the Net
        imus = imus.to(device)
        labels = labels.to(device)
        x = LIMUBert_finetuned_model(imus)

        # loss = criterion(x, labels.long())
        # summary['loss'][e].append(loss.item())
        # Calculat accuracy
        _, pred = x.data.topk(1, dim=1)
        pred = pred.view(pred.shape[:-1])
        acc = torch.sum(pred == labels)/x.shape[0]
        test_summary['acc'].append(acc.item())
print(np.mean(test_summary['acc']))

0.3212503270679341


# Multimodal Model

## Generate Audio and IMU Embeddings

In [14]:
class AudioModule(nn.Module):
    def __init__(self, BEATs_pretrained_model, dropout_p=0.0):
        super().__init__()
        self.BEATs = BEATs_pretrained_model # need to freeze params
        self.dropout = nn.Dropout(dropout_p)
 
    def forward(self, x):
        x = self.BEATs.extract_features(x, padding_mask=torch.zeros_like(x).bool())[0]
        embeddings = self.dropout(x)
#         embeddings = embeddings.mean(dim=1)
        return embeddings


class IMUModule(nn.Module):
    def __init__(self, LIMUBert_pretrained_model, dropout_p=0.0):
        super().__init__()
        self.imu_model = LIMUBert_pretrained_model # need to freeze params
        self.dropout = nn.Dropout(dropout_p)
 
    def forward(self, x):
        x = self.imu_model(x)
        embeddings = self.dropout(x)
        return embeddings
    

class GRU(nn.Module):
    def __init__(self, input_feature_dim, output_feature_dim, training=False):
        super().__init__()
        self.dropout = True
        self.num_rnn = 2
        self.num_linear = 1
        self.rnn_io = [[input_feature_dim, 256], [256, output_feature_dim]]
        self.num_layers = [2, 1]
        for i in range(self.num_rnn):
            self.__setattr__('gru' + str(i), nn.GRU(self.rnn_io[i][0], self.rnn_io[i][1], num_layers=self.num_layers[i],
                                         batch_first=True))
        

    def forward(self, input_seqs, training=False):
        h = input_seqs
        for i in range(self.num_rnn):
            rnn = self.__getattr__('gru' + str(i))
            h, _ = rnn(h)
            h = nn.functional.relu(h)
#         print(h.shape)  
        h = h[:, -1, :]
#         print(h.shape)
        if self.dropout:
            h = nn.functional.dropout(h, training=training)
        return h

In [5]:
def get_multimodal_embed_dataset(multimodal_loader, audio_module, imu_module, device, overwrite=False):
    
    save_path = multimodal_loader.dataset.db_path.split('.npz')[0] + '_embeddings.npz'
    print(save_path)
    config_file = './Multimodal/dataset_config.json'
    
    if not os.path.isfile(save_path) or overwrite:
        dataset = {}
        dataset['audio_embeddings'] = []
        dataset['imu_embeddings'] = []
        dataset['labels'] = []
        
        audio_module.eval()
        imu_module.eval()

        for i, (sounds, imus, labels) in enumerate(multimodal_loader):
            sounds = sounds.squeeze(dim=1).to(device)
            imus = imus.permute(0, 2, 1).to(device)
            
            with torch.no_grad():
                audio_embeddings = audio_module(sounds).cpu().numpy()
                imu_embeddings = imu_module(imus).cpu().numpy()
            
            dataset['audio_embeddings'].append(audio_embeddings)
            dataset['imu_embeddings'].append(imu_embeddings)
            dataset['labels'].append(labels.numpy())
            
        dataset['audio_embeddings'] = np.concatenate(dataset['audio_embeddings'], axis=0)
        dataset['imu_embeddings'] = np.concatenate(dataset['imu_embeddings'], axis=0)
        dataset['labels'] = np.concatenate(dataset['labels'], axis=0)
        
        np.savez(save_path, **dataset)
        
    else:
        dataset = np.load(save_path, allow_pickle=True)
    
    if not os.path.isfile(config_file) or overwrite:
        dataset_config = {}
        dataset_config['db_path'] = save_path
        dataset_config['classes'] = multimodal_loader.dataset.classes
        dataset_config['nClasses'] = multimodal_loader.dataset.nClasses
        dataset_config['time_window'] = multimodal_loader.dataset.time_window
        dataset_config['num_data_per_class'] = multimodal_loader.dataset.num_data_per_class
        dataset_config['label_mapping'] = multimodal_loader.dataset.get_label_mapping()
        
        with open(config_file, 'w') as f:
            json.dump(dataset_config, f)
        
    else:
        with open(config_file, 'r') as f:
            dataset_config = json.load(f)
        
    return dataset, dataset_config

In [9]:
# get Multimodal Embedding Dataset for training and testing
audio_module = None #AudioModule(BEATs_model).to(device)
imu_module = None #IMUModule(LIMUBert_model).to(device)

embed_dataset, embed_dataset_config = get_multimodal_embed_dataset(DataLoader(multimodal_train_set, batch_size=32, 
                            shuffle=False, num_workers=4), audio_module, imu_module, device=device, overwrite=False)
multimodal_embed_train_set = MultimodalEmbed(embed_dataset, embed_dataset_config)

embed_dataset, embed_dataset_config = get_multimodal_embed_dataset(DataLoader(multimodal_test_set, batch_size=32, 
                            shuffle=False, num_workers=4), audio_module, imu_module, device=device, overwrite=False)
multimodal_embed_test_set = MultimodalEmbed(embed_dataset, embed_dataset_config)

./Multimodal/MultimodalDataset_audio-16000_1234_imu-20_1234_embeddings.npz
./Multimodal/MultimodalDataset_audio-16000_5_imu-20_5_embeddings.npz


In [10]:
multimodal_embed_test_set.imu_embeddings.shape

(1800, 20, 72)

In [11]:
# get DataLoader for training and testing
multimodal_embed_train_loader = DataLoader(multimodal_embed_train_set, batch_size=128, 
                            shuffle=True, num_workers=4)
multimodal_embed_test_loader = DataLoader(multimodal_embed_test_set, batch_size=128, 
                            shuffle=False, num_workers=4)

In [12]:
class AudioAndIMUFusion(nn.Module):
    def __init__(
        self,
        audio_feature_dim1,
        imu_feature_dim1,
        audio_feature_dim2,
        imu_feature_dim2,
        fusion_output_dim,
        output_dim,
        dropout_p=0.0,
        ):
        """
        
        """
        super().__init__()
        self.audio_gru = GRU(audio_feature_dim1, audio_feature_dim2)
        self.imu_gru = GRU(imu_feature_dim1, imu_feature_dim2)
        self.fusion = nn.Linear(
            in_features=(audio_feature_dim2 + imu_feature_dim2), 
            out_features=fusion_output_dim
            )
        self.fc = nn.Linear(
            in_features=fusion_output_dim, 
            out_features=output_dim
            )
        self.dropout = nn.Dropout(dropout_p)
    
    def forward(self, audio_embeddings, imu_embeddings):
        audio_features = self.audio_gru(audio_embeddings)
#         audio_features = audio_embeddings
#         print(audio_features.shape)
        imu_features = self.imu_gru(imu_embeddings)
        combined = torch.cat([audio_features, imu_features], dim=1)
        fused = self.dropout(
            nn.functional.relu(
                self.fusion(combined)
                )
            )
        logits = self.fc(fused)
        pred = torch.sigmoid(logits)
        return pred
    
    def extract_features(self, audio_embeddings, imu_embeddings):
        audio_features = self.audio_gru(audio_embeddings)
        imu_features = self.imu_gru(imu_embeddings)
        combined = torch.cat([audio_features, imu_features], dim=1)
        fused = self.dropout(
            nn.functional.relu(
                self.fusion(combined)
                )
            )
        return fused

In [15]:
# Freeze the two feature models
# for param in BEATs_model.parameters():
#     param.requires_grad = False
# for param in LIMUBert_model.parameters():
#     param.requires_grad = False

audio_feature_dim1 = 768
imu_feature_dim1 = 72
audio_feature_dim2 = 128
imu_feature_dim2 = 128
fusion_output_dim = 128
n_class = multimodal_embed_train_set.nClasses

multimodal_model = AudioAndIMUFusion(audio_feature_dim1, 
                                     imu_feature_dim1, 
                                     audio_feature_dim2, 
                                     imu_feature_dim2, 
                                     fusion_output_dim,
                                     n_class,
                                     dropout_p=0.7,
                                    )

In [19]:
print(summary(multimodal_model))

Layer (type:depth-idx)                   Param #
AudioAndIMUFusion                        --
├─GRU: 1-1                               --
│    └─GRU: 2-1                          1,182,720
│    └─GRU: 2-2                          148,224
├─GRU: 1-2                               --
│    └─GRU: 2-3                          648,192
│    └─GRU: 2-4                          148,224
├─Linear: 1-3                            32,896
├─Linear: 1-4                            1,161
├─Dropout: 1-5                           --
Total params: 2,161,417
Trainable params: 2,161,417
Non-trainable params: 0


## Training

In [16]:
criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(params=multimodal_model.parameters(), lr=5e-4)
optimizer = torch.optim.AdamW(multimodal_model.parameters(), lr=0.0001, betas=[0.9, 0.95], weight_decay=0.1, )



# Place on GPU
multimodal_model.to(device)

# Training loop
n_epochs = 200
summary = {'loss': [[] for _ in range(n_epochs)], 'acc': [[] for _ in range(n_epochs)]}
for e in range(n_epochs):
    multimodal_model.train()
    for i, (audio_embeds, imu_embeds, labels) in enumerate(tqdm(multimodal_embed_train_loader)):
        # Zero the grads
        optimizer.zero_grad()
        audio_embeds = audio_embeds.to(device)
        imu_embeds = imu_embeds.to(device)
        labels = labels.to(device)
        
        # Run the Net
        x = multimodal_model(audio_embeds, imu_embeds)
#         print(x.shape)
        # print(labels.shape)
        # x = x.view(x.size()[:-1])

        # Optimize net
        loss = criterion(x, labels.long())
        loss.backward()
        optimizer.step()
        summary['loss'][e].append(loss.item())

            # Calculat accuracy
        _, pred = x.data.topk(1, dim=1)
        pred = pred.view(pred.shape[:-1])
        acc = torch.sum(pred == labels)/x.shape[0]
        summary['acc'][e].append(acc.item())

    with torch.no_grad():
        multimodal_model.eval()
        test_loss = []
        test_acc = []
        for i, (audio_embeds, imu_embeds, labels) in enumerate(multimodal_embed_test_loader):
            audio_embeds = audio_embeds.to(device)
            imu_embeds = imu_embeds.to(device)
            labels = labels.to(device)

            # Run the Net
            x = multimodal_model(audio_embeds, imu_embeds)
            # Optimize net
            loss = criterion(x, labels.long())
            test_loss.append(loss.item())

                # Calculat accuracy
            _, pred = x.data.topk(1, dim=1)
            pred = pred.view(pred.shape[:-1])
            acc = torch.sum(pred == labels)/x.shape[0]
            test_acc.append(acc.item())
 
    print('Epoch: {}, Train Loss: {}, Train Accuracy: {}, Test Loss: {}, Test Accuracy: {}'.format(e, np.mean(summary['loss'][e]), np.mean(summary['acc'][e]),  np.mean(test_loss), np.mean(test_acc)))

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 57/57 [00:00<00:00, 76.13it/s]


Epoch: 0, Train Loss: 2.181260435204757, Train Accuracy: 0.3125, Test Loss: 2.1599936803181965, Test Accuracy: 0.6260416666666667


100%|██████████| 57/57 [00:00<00:00, 92.62it/s] 


Epoch: 1, Train Loss: 2.0949983429490473, Train Accuracy: 0.5379660087719298, Test Loss: 2.0445607980092366, Test Accuracy: 0.6927083333333334


100%|██████████| 57/57 [00:00<00:00, 96.53it/s] 


Epoch: 2, Train Loss: 1.9583727267750524, Train Accuracy: 0.6611842105263158, Test Loss: 1.9300234079360963, Test Accuracy: 0.7057291666666666


100%|██████████| 57/57 [00:00<00:00, 92.86it/s] 


Epoch: 3, Train Loss: 1.8490639799519588, Train Accuracy: 0.7224506578947368, Test Loss: 1.8405141989390055, Test Accuracy: 0.6947916666666667


100%|██████████| 57/57 [00:00<00:00, 93.15it/s] 


Epoch: 4, Train Loss: 1.76199920344771, Train Accuracy: 0.776452850877193, Test Loss: 1.7658018906911215, Test Accuracy: 0.7380208333333333


100%|██████████| 57/57 [00:00<00:00, 87.59it/s] 


Epoch: 5, Train Loss: 1.6957013983475535, Train Accuracy: 0.8029057017543859, Test Loss: 1.7097668011983236, Test Accuracy: 0.8010416666666667


100%|██████████| 57/57 [00:00<00:00, 93.81it/s] 


Epoch: 6, Train Loss: 1.644287297600194, Train Accuracy: 0.8237390350877193, Test Loss: 1.6706857522328695, Test Accuracy: 0.8182291666666667


100%|██████████| 57/57 [00:00<00:00, 99.00it/s] 


Epoch: 7, Train Loss: 1.6038195752260977, Train Accuracy: 0.8481359649122807, Test Loss: 1.6503604253133137, Test Accuracy: 0.7958333333333333


100%|██████████| 57/57 [00:00<00:00, 97.02it/s] 


Epoch: 8, Train Loss: 1.5716565784655119, Train Accuracy: 0.864172149122807, Test Loss: 1.6209610223770141, Test Accuracy: 0.7973958333333333


100%|██████████| 57/57 [00:00<00:00, 95.26it/s] 


Epoch: 9, Train Loss: 1.5450152158737183, Train Accuracy: 0.8796600877192983, Test Loss: 1.616658099492391, Test Accuracy: 0.7911458333333333


100%|██████████| 57/57 [00:00<00:00, 88.33it/s] 


Epoch: 10, Train Loss: 1.5244255923388297, Train Accuracy: 0.8882949561403509, Test Loss: 1.6196765502293904, Test Accuracy: 0.7619791666666667


100%|██████████| 57/57 [00:00<00:00, 95.53it/s] 


Epoch: 11, Train Loss: 1.5047379648476316, Train Accuracy: 0.905016447368421, Test Loss: 1.5967788537343344, Test Accuracy: 0.8067708333333333


100%|██████████| 57/57 [00:00<00:00, 94.26it/s] 


Epoch: 12, Train Loss: 1.4909984898148922, Train Accuracy: 0.912828947368421, Test Loss: 1.5972280899683635, Test Accuracy: 0.8046875


100%|██████████| 57/57 [00:00<00:00, 86.69it/s] 


Epoch: 13, Train Loss: 1.4788877441172015, Train Accuracy: 0.9192708333333334, Test Loss: 1.6263335704803468, Test Accuracy: 0.7614583333333333


100%|██████████| 57/57 [00:00<00:00, 89.58it/s] 


Epoch: 14, Train Loss: 1.4674702548144156, Train Accuracy: 0.9226973684210527, Test Loss: 1.5987172524134319, Test Accuracy: 0.7958333333333333


100%|██████████| 57/57 [00:00<00:00, 94.18it/s] 


Epoch: 15, Train Loss: 1.4613152972438879, Train Accuracy: 0.9250274122807017, Test Loss: 1.5905312140782675, Test Accuracy: 0.8067708333333333


100%|██████████| 57/57 [00:00<00:00, 88.90it/s]


Epoch: 16, Train Loss: 1.4550655724709494, Train Accuracy: 0.9283168859649122, Test Loss: 1.610495098431905, Test Accuracy: 0.7796875


100%|██████████| 57/57 [00:00<00:00, 95.28it/s] 


Epoch: 17, Train Loss: 1.4482678007661252, Train Accuracy: 0.9353070175438597, Test Loss: 1.6140638907750449, Test Accuracy: 0.7916666666666666


100%|██████████| 57/57 [00:00<00:00, 97.61it/s] 


Epoch: 18, Train Loss: 1.4442324722022342, Train Accuracy: 0.9379111842105263, Test Loss: 1.6338053941726685, Test Accuracy: 0.7583333333333333


100%|██████████| 57/57 [00:00<00:00, 93.78it/s] 


Epoch: 19, Train Loss: 1.4391862279490422, Train Accuracy: 0.9303728070175439, Test Loss: 1.613700771331787, Test Accuracy: 0.690625


100%|██████████| 57/57 [00:00<00:00, 95.41it/s] 


Epoch: 20, Train Loss: 1.43349934460824, Train Accuracy: 0.9143366228070176, Test Loss: 1.6079200426737468, Test Accuracy: 0.7052083333333333


100%|██████████| 57/57 [00:00<00:00, 94.97it/s] 


Epoch: 21, Train Loss: 1.4289677080355192, Train Accuracy: 0.9021381578947368, Test Loss: 1.616308824221293, Test Accuracy: 0.6880208333333333


100%|██████████| 57/57 [00:00<00:00, 89.84it/s] 


Epoch: 22, Train Loss: 1.4239282670773958, Train Accuracy: 0.9029605263157895, Test Loss: 1.6225910743077596, Test Accuracy: 0.6859375


100%|██████████| 57/57 [00:00<00:00, 90.68it/s] 


Epoch: 23, Train Loss: 1.421956315375211, Train Accuracy: 0.8948739035087719, Test Loss: 1.6055894374847413, Test Accuracy: 0.7119791666666667


100%|██████████| 57/57 [00:00<00:00, 91.31it/s] 


Epoch: 24, Train Loss: 1.4189345899381136, Train Accuracy: 0.8963815789473685, Test Loss: 1.608495823542277, Test Accuracy: 0.6963541666666667


100%|██████████| 57/57 [00:00<00:00, 93.28it/s] 


Epoch: 25, Train Loss: 1.415962162770723, Train Accuracy: 0.8959703947368421, Test Loss: 1.5909561077753702, Test Accuracy: 0.7182291666666667


100%|██████████| 57/57 [00:00<00:00, 91.90it/s] 


Epoch: 26, Train Loss: 1.4138356133511192, Train Accuracy: 0.9004934210526315, Test Loss: 1.6121392170588176, Test Accuracy: 0.7036458333333333


100%|██████████| 57/57 [00:00<00:00, 90.41it/s] 


Epoch: 27, Train Loss: 1.4118456213097823, Train Accuracy: 0.8991228070175439, Test Loss: 1.5972264210383098, Test Accuracy: 0.7026041666666667


100%|██████████| 57/57 [00:00<00:00, 90.21it/s] 


Epoch: 28, Train Loss: 1.4105556805928547, Train Accuracy: 0.9020010964912281, Test Loss: 1.6086188952128093, Test Accuracy: 0.6864583333333333


100%|██████████| 57/57 [00:00<00:00, 90.75it/s] 


Epoch: 29, Train Loss: 1.408807112459551, Train Accuracy: 0.9061129385964912, Test Loss: 1.6052643537521363, Test Accuracy: 0.6963541666666667


100%|██████████| 57/57 [00:00<00:00, 90.78it/s] 


Epoch: 30, Train Loss: 1.4085095932609157, Train Accuracy: 0.911047149122807, Test Loss: 1.586393698056539, Test Accuracy: 0.7223958333333333


100%|██████████| 57/57 [00:00<00:00, 92.17it/s] 


Epoch: 31, Train Loss: 1.4069540437899137, Train Accuracy: 0.9124177631578947, Test Loss: 1.578396487236023, Test Accuracy: 0.7322916666666667


100%|██████████| 57/57 [00:00<00:00, 87.15it/s]


Epoch: 32, Train Loss: 1.406744921416567, Train Accuracy: 0.9172149122807017, Test Loss: 1.5772295077641805, Test Accuracy: 0.7395833333333334


100%|██████████| 57/57 [00:00<00:00, 86.51it/s] 


Epoch: 33, Train Loss: 1.4054816041076392, Train Accuracy: 0.9322916666666666, Test Loss: 1.603254206975301, Test Accuracy: 0.7666666666666667


100%|██████████| 57/57 [00:00<00:00, 89.77it/s] 


Epoch: 34, Train Loss: 1.402961804155718, Train Accuracy: 0.9459978070175439, Test Loss: 1.603063154220581, Test Accuracy: 0.7880208333333333


100%|██████████| 57/57 [00:00<00:00, 91.46it/s] 


Epoch: 35, Train Loss: 1.4023310740788777, Train Accuracy: 0.9606633771929824, Test Loss: 1.607339572906494, Test Accuracy: 0.7723958333333333


100%|██████████| 57/57 [00:00<00:00, 92.21it/s] 


Epoch: 36, Train Loss: 1.39858643423047, Train Accuracy: 0.983141447368421, Test Loss: 1.6075265645980834, Test Accuracy: 0.7723958333333333


100%|██████████| 57/57 [00:00<00:00, 91.77it/s] 


Epoch: 37, Train Loss: 1.3951105799591332, Train Accuracy: 0.9890350877192983, Test Loss: 1.5895511309305828, Test Accuracy: 0.7927083333333333


100%|██████████| 57/57 [00:00<00:00, 88.85it/s] 


Epoch: 38, Train Loss: 1.3912326076574493, Train Accuracy: 0.9899945175438597, Test Loss: 1.6220246156056721, Test Accuracy: 0.7713541666666667


100%|██████████| 57/57 [00:00<00:00, 91.89it/s] 


Epoch: 39, Train Loss: 1.3897906604566073, Train Accuracy: 0.9917763157894737, Test Loss: 1.6245938618977864, Test Accuracy: 0.7489583333333333


100%|██████████| 57/57 [00:00<00:00, 90.50it/s] 


Epoch: 40, Train Loss: 1.3891112197909439, Train Accuracy: 0.9898574561403509, Test Loss: 1.6271171649297078, Test Accuracy: 0.7494791666666667


100%|██████████| 57/57 [00:00<00:00, 91.77it/s] 


Epoch: 41, Train Loss: 1.3874122322651379, Train Accuracy: 0.9905427631578947, Test Loss: 1.6316508134206136, Test Accuracy: 0.7614583333333333


100%|██████████| 57/57 [00:00<00:00, 92.08it/s] 


Epoch: 42, Train Loss: 1.3864448948910362, Train Accuracy: 0.9897203947368421, Test Loss: 1.6339030901590983, Test Accuracy: 0.7442708333333333


100%|██████████| 57/57 [00:00<00:00, 95.96it/s] 


Epoch: 43, Train Loss: 1.3859323221340514, Train Accuracy: 0.9917763157894737, Test Loss: 1.6362557729085287, Test Accuracy: 0.7291666666666666


100%|██████████| 57/57 [00:00<00:00, 91.90it/s] 


Epoch: 44, Train Loss: 1.3856997887293498, Train Accuracy: 0.9910910087719298, Test Loss: 1.6251979112625121, Test Accuracy: 0.7484375


100%|██████████| 57/57 [00:00<00:00, 85.23it/s]


Epoch: 45, Train Loss: 1.384167750676473, Train Accuracy: 0.9902686403508771, Test Loss: 1.6298786401748657, Test Accuracy: 0.7447916666666666


100%|██████████| 57/57 [00:00<00:00, 92.39it/s] 


Epoch: 46, Train Loss: 1.3842440529873496, Train Accuracy: 0.9898574561403509, Test Loss: 1.6184158643086752, Test Accuracy: 0.7588541666666667


100%|██████████| 57/57 [00:00<00:00, 92.70it/s] 


Epoch: 47, Train Loss: 1.3849899371465046, Train Accuracy: 0.9898574561403509, Test Loss: 1.631377100944519, Test Accuracy: 0.7609375


100%|██████████| 57/57 [00:00<00:00, 90.01it/s] 


Epoch: 48, Train Loss: 1.3836656787939239, Train Accuracy: 0.9905427631578947, Test Loss: 1.6402361869812012, Test Accuracy: 0.7291666666666666


100%|██████████| 57/57 [00:00<00:00, 92.71it/s] 


Epoch: 49, Train Loss: 1.3831435567454289, Train Accuracy: 0.990953947368421, Test Loss: 1.648812993367513, Test Accuracy: 0.7515625


100%|██████████| 57/57 [00:00<00:00, 91.63it/s] 


Epoch: 50, Train Loss: 1.3824809413207204, Train Accuracy: 0.9920504385964912, Test Loss: 1.6199593544006348, Test Accuracy: 0.7609375


100%|██████████| 57/57 [00:00<00:00, 91.82it/s] 


Epoch: 51, Train Loss: 1.3824287339260704, Train Accuracy: 0.9919133771929824, Test Loss: 1.6328867038091024, Test Accuracy: 0.7515625


100%|██████████| 57/57 [00:00<00:00, 85.69it/s] 


Epoch: 52, Train Loss: 1.3824656532521833, Train Accuracy: 0.9927357456140351, Test Loss: 1.6342565377553304, Test Accuracy: 0.7598958333333333


100%|██████████| 57/57 [00:00<00:00, 91.46it/s] 


Epoch: 53, Train Loss: 1.3817576044484188, Train Accuracy: 0.9915021929824561, Test Loss: 1.6270543177922567, Test Accuracy: 0.7703125


100%|██████████| 57/57 [00:00<00:00, 87.60it/s] 


Epoch: 54, Train Loss: 1.3818046858436184, Train Accuracy: 0.9912280701754386, Test Loss: 1.6339701970418294, Test Accuracy: 0.7354166666666667


100%|██████████| 57/57 [00:00<00:00, 85.46it/s] 


Epoch: 55, Train Loss: 1.3817455454876548, Train Accuracy: 0.9916392543859649, Test Loss: 1.6424381097157796, Test Accuracy: 0.7395833333333334


100%|██████████| 57/57 [00:00<00:00, 95.02it/s] 


Epoch: 56, Train Loss: 1.3812132843753748, Train Accuracy: 0.9919133771929824, Test Loss: 1.6627338727315266, Test Accuracy: 0.71875


 11%|█         | 6/57 [00:00<00:01, 29.87it/s]


KeyboardInterrupt: 

In [186]:
torch.save({
    'model_config': {
        'n_class': n_class,
        'audio_feature_dim': audio_feature_dim,
        'imu_feature_dim': imu_feature_dim,
        'fusion_output_size': fusion_output_size,
        },
    'model_state_dict': multimodal_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    }, 
    './saved_models/multimodal_embed_model_{}-{}.pt'.format(datetime.datetime.now().date().month, datetime.datetime.now().date().day))

## Testing

In [30]:
checkpoint = torch.load('./saved_models/multimodal_embed_model_4-17.pt', map_location=device)

multimodal_model = AudioAndIMUFusion(
                               checkpoint['model_config']['n_class'],
                               checkpoint['model_config']['audio_feature_dim'],
                               checkpoint['model_config']['imu_feature_dim'],
                               checkpoint['model_config']['fusion_output_size']
                               )
                                     
multimodal_model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [26]:
criterion = nn.CrossEntropyLoss()
multimodal_model.to(device)

test_loss = []
test_acc = []

for i, (audio_embeds, imu_embeds, labels) in enumerate(multimodal_embed_test_loader):
    
    audio_embeds = audio_embeds.to(device)
    imu_embeds = imu_embeds.to(device)
    labels = labels.to(device)

    # Run the Net
    x = multimodal_model(audio_embeds, imu_embeds)
    # Optimize net
    loss = criterion(x, labels.long())
    test_loss.append(loss.item())

        # Calculat accuracy
    _, pred = x.data.topk(1, dim=1)
    pred = pred.view(pred.shape[:-1])
    acc = torch.sum(pred == labels)/x.shape[0]
    test_acc.append(acc.item())
    
print('Loss: {}, Accuracy: {}'.format(np.mean(test_loss), np.mean(test_acc)))

Loss: 1.645623509089152, Accuracy: 0.7151041666666667


# Generate and save fusion embeddings

In [32]:
embeddings1 = 0
for i, (audio_embeds, imu_embeds, labels) in enumerate(multimodal_embed_train_loader):
    audio_embeds = audio_embeds.to(device)
    imu_embeds = imu_embeds.to(device)
    labels = labels.to(device)

    embeddings1 = multimodal_model.extract_features(audio_embeds, imu_embeds).detach().cpu().numpy()
print(embeddings1.shape)

(8, 128)


In [33]:
def get_fusion_embed_dataset(multimodal_embed_loader, multimodal_model, device, overwrite=False):
    
    save_path = multimodal_embed_loader.dataset.db_path.replace('MultimodalDataset', 'fusion')
    config_file = './Multimodal/dataset_config.json'
    multimodal_model.eval()
    
    if not os.path.isfile(save_path) or overwrite:
        dataset = {}
        dataset['embeddings'] = []
        dataset['labels'] = []

        for i, (audio_embeds, imu_embeds, labels) in enumerate(multimodal_embed_loader):
            audio_embeds = audio_embeds.to(device)
            imu_embeds = imu_embeds.to(device)

            embeddings = multimodal_model.extract_features(audio_embeds, imu_embeds).detach().cpu().numpy()
            
            dataset['embeddings'].append(embeddings)
            dataset['labels'].append(labels.numpy())
            
        dataset['embeddings'] = np.concatenate(dataset['embeddings'], axis=0)
        dataset['labels'] = np.concatenate(dataset['labels'], axis=0)
        np.savez(save_path, **dataset)
        
    else:
        dataset = np.load(save_path, allow_pickle=True)
    
    with open(config_file, 'r') as f:
        dataset_config = json.load(f)
        
    return dataset, dataset_config


In [34]:
fusion_dataset, fusion_dataset_config = get_fusion_embed_dataset(multimodal_embed_train_loader, multimodal_model, device)
fusion_embed_train_set = FusionEmbed(fusion_dataset, fusion_dataset_config)

fusion_dataset, fusion_dataset_config = get_fusion_embed_dataset(multimodal_embed_test_loader, multimodal_model, device)
fusion_embed_test_set = FusionEmbed(fusion_dataset, fusion_dataset_config)

In [35]:
print(len(audio_train_set.sounds))
print(len(imu_train_set.imus))
print(fusion_embed_train_set.embeddings.shape)
print(len(fusion_embed_test_set.label_mapping))


480
12102
(900, 128)
9


In [8]:
from sklearn.model_selection import KFold

In [45]:
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True)

In [37]:
kf.split(multimodal_train_set)[0]

TypeError: 'generator' object is not subscriptable

In [46]:
kf

KFold(n_splits=5, random_state=None, shuffle=True)

In [50]:
print(len(multimodal_train_set))
folds = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(multimodal_train_set)):
    print(f"Fold {fold + 1}", (train_idx, valid_idx))
    folds.append((train_idx, valid_idx))
    


7200
Fold 1 (array([   0,    1,    2, ..., 7197, 7198, 7199]), array([   3,   32,   52, ..., 7190, 7194, 7196]))
Fold 2 (array([   1,    2,    3, ..., 7197, 7198, 7199]), array([   0,    4,    6, ..., 7177, 7181, 7192]))
Fold 3 (array([   0,    2,    3, ..., 7196, 7197, 7199]), array([   1,   16,   18, ..., 7179, 7186, 7198]))
Fold 4 (array([   0,    1,    3, ..., 7195, 7196, 7198]), array([   2,    9,   12, ..., 7193, 7197, 7199]))
Fold 5 (array([   0,    1,    2, ..., 7197, 7198, 7199]), array([   5,   17,   19, ..., 7185, 7187, 7195]))


In [62]:
kf = KFold(n_splits=k_folds, shuffle=True)

folds = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(multimodal_train_set)):
    print(f"Fold {fold + 1}", (train_idx, valid_idx))
    folds.append((train_idx, valid_idx))

for train_idx, valid_idx in folds:
    print(train_idx, valid_idx)
    


Fold 1 (array([   0,    2,    3, ..., 7196, 7198, 7199]), array([   1,    5,   11, ..., 7191, 7195, 7197]))
Fold 2 (array([   0,    1,    3, ..., 7197, 7198, 7199]), array([   2,    4,   18, ..., 7189, 7193, 7194]))
Fold 3 (array([   0,    1,    2, ..., 7197, 7198, 7199]), array([   3,    9,   10, ..., 7164, 7188, 7190]))
Fold 4 (array([   0,    1,    2, ..., 7195, 7197, 7198]), array([   6,    8,   12, ..., 7192, 7196, 7199]))
Fold 5 (array([   1,    2,    3, ..., 7196, 7197, 7199]), array([   0,    7,   21, ..., 7179, 7187, 7198]))
<class 'numpy.ndarray'> [   1    5   11 ... 7191 7195 7197]
<class 'numpy.ndarray'> [   2    4   18 ... 7189 7193 7194]
<class 'numpy.ndarray'> [   3    9   10 ... 7164 7188 7190]
<class 'numpy.ndarray'> [   6    8   12 ... 7192 7196 7199]
<class 'numpy.ndarray'> [   0    7   21 ... 7179 7187 7198]
