In [None]:
class ToTensor(object):
    """Turn an audio feature array into a double-precision torch tensor."""

    def __call__(self, audiofeature):
        """Return ``audiofeature`` as a float64 tensor with one extra leading axis.

        ``audiofeature`` is handed to ``torch.from_numpy``, so it must be a
        NumPy array. An input of shape ``(a, b)`` comes back as ``(1, a, b)``.
        """
        # from_numpy -> add a singleton axis at position 0 -> cast to float64.
        return torch.from_numpy(audiofeature).unsqueeze(0).double()

In [None]:
# Emotion code (third dash-separated field of the file name) -> class index.
_EMOTION_TO_LABEL = {
    "01": 4,  # neutral
    "02": 4,  # "neutral" (shares the class of code 01)
    "03": 3,  # happy
    "04": 5,  # sad
    "05": 0,  # angry
    "06": 2,  # fearful
    "07": 1,  # disgust
    "08": 6,  # surprised
}


def _emotion_label(filename):
    """Return the class index encoded in the third dash-separated field of ``filename``."""
    fields = filename.split("-")
    if len(fields) < 3 or fields[2] not in _EMOTION_TO_LABEL:
        raise ValueError(
            "cannot read an emotion code (01-08) from file name %r" % (filename,)
        )
    return _EMOTION_TO_LABEL[fields[2]]


def _fit_width(feature, width):
    """Return ``feature`` as a float64 array with exactly ``width`` columns (zero-padded or cut)."""
    fitted = np.zeros((feature.shape[0], width), dtype=np.float64)
    kept = min(feature.shape[1], width)
    fitted[:, :kept] = feature[:, :kept]
    return fitted


def audiocombination(directory, n_frames=282):
    """Load MFCC features and emotion labels for every audio file under ``directory``.

    ``directory`` is expected to hold one sub-directory per actor, each
    containing audio files named with dash-separated fields whose third field
    is the emotion code (``01`` .. ``08``).

    Args:
        directory: Root folder to scan; a trailing slash is optional.
        n_frames: Number of time frames each MFCC matrix is zero-padded or
            truncated to. The default of 282 is the width used by the original
            pipeline (it corresponds to 3 s at 48 kHz with librosa's default
            hop length of 512 -- confirm if those settings change).

    Returns:
        A tuple ``(features, labels, filenames_audio)`` of three parallel
        lists: float64 arrays of shape ``(n_mfcc, n_frames)``, integer class
        indices, and the bare file names.

    Raises:
        ValueError: If a file name does not carry a known emotion code.
    """
    features = []
    labels = []
    filenames_audio = []
    # os.listdir returns entries in arbitrary order; sort so the sample order
    # is deterministic (samples are later paired with images by position).
    for actor_number in sorted(os.listdir(directory)):
        actor_dir = os.path.join(directory, actor_number)
        for file in sorted(os.listdir(actor_dir)):
            # Parse the label first so a bad file name fails before the
            # expensive audio decode, and never reuses a stale label.
            label = _emotion_label(file)

            # Load at most the first 3 seconds, resampled to 48 kHz.
            y, _ = librosa.load(os.path.join(actor_dir, file), sr=48000, offset=0, duration=3)
            S = librosa.feature.melspectrogram(y=y, sr=48000, n_mels=128, fmax=8000)
            feature = librosa.feature.mfcc(S=librosa.power_to_db(S))

            features.append(_fit_width(feature, n_frames))
            labels.append(label)
            filenames_audio.append(file)

    return features, labels, filenames_audio

def imagecombination(directory, single):
    """Collect one randomly chosen frame per video folder under ``directory``.

    ``directory`` is expected to hold one sub-directory per actor, each
    containing one folder per video whose frames are stored as
    ``<k * 10>.jpg``.

    Args:
        directory: Root folder to scan; a trailing slash is optional.
        single: When true, one frame per video is drawn at random and loaded.
            When false nothing is loaded and both returned lists are empty.

    Returns:
        A tuple ``(features, filenames_image)`` of two parallel lists: the
        loaded images and the names of the video folders they came from.

    Raises:
        ValueError: If ``single`` is true and a video folder holds fewer than
            three entries, so no frame index can be drawn.
    """
    features = []
    filenames_image = []
    # os.listdir returns entries in arbitrary order; sort so the sample order
    # is deterministic (samples are later paired with audio by position).
    for actor_number in sorted(os.listdir(directory)):
        actor_dir = os.path.join(directory, actor_number)
        for videoes in sorted(os.listdir(actor_dir)):
            if not single:
                continue

            video_dir = os.path.join(actor_dir, videoes)
            numberoffile = len(os.listdir(video_dir))
            # The draw below picks k in [1, numberoffile - 2]; that range is
            # empty for fewer than three entries.
            if numberoffile < 3:
                raise ValueError(
                    "video folder %r holds %d file(s); at least 3 are needed "
                    "to draw a frame" % (video_dir, numberoffile)
                )
            index = random.randrange(1, numberoffile - 1)
            target_path = os.path.join(video_dir, str(index * 10) + ".jpg")
            features.append(io.imread(target_path))
            filenames_image.append(videoes)

    return features, filenames_image

In [None]:
class AudioImage_dataset(Dataset):
    """Dataset that pairs audio features with images by position.

    All audio features and images for the requested split are loaded eagerly
    in ``__init__``; item ``idx`` combines the ``idx``-th audio feature, the
    ``idx``-th image and the ``idx``-th label.
    """

    def __init__(self, image_path, audio_path, mode, single, image_transform, audio_transform):
        """Load every sample of the ``mode`` split into memory.

        Args:
            image_path: Root folder of the image data; ``mode + '/'`` is
                appended to it.
            audio_path: Root folder of the audio data; ``mode + '/'`` is
                appended to it.
            mode: Name of the split sub-folder (the callers in this file use
                ``'train'`` and ``'valid'``).
            single: Forwarded to ``imagecombination``.
            image_transform: Callable applied to each image in ``__getitem__``.
            audio_transform: Callable applied to each audio feature in
                ``__getitem__``.

        Raises:
            ValueError: If the number of loaded images differs from the number
                of audio samples, since items are paired by index.
        """
        super().__init__()

        self.image_path = image_path
        self.audio_path = audio_path

        self.mode = mode
        self.single = single
        self.image_transform = image_transform
        self.audio_transform = audio_transform

        # Train and validation data live in separate sub-folders named after the mode.
        self.audiofeatures, self.labels, self.filenames_audio = audiocombination(self.audio_path+self.mode+'/')
        self.imagefeatures, self.filenames_image = imagecombination(self.image_path+self.mode+'/', self.single)

        # Items are matched purely by position, so unequal lengths would either
        # raise IndexError later in __getitem__ or silently shift the pairs.
        if len(self.imagefeatures) != len(self.labels):
            raise ValueError(
                "audio/image count mismatch for mode %r: %d audio samples vs %d images"
                % (self.mode, len(self.labels), len(self.imagefeatures))
            )

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict.

        Keys: ``'mfcc'`` (transformed audio feature), ``'image'`` (transformed
        image), ``'label'`` (0-dim float64 tensor), ``'filenames_audio'`` and
        ``'filenames_image'``.
        """
        transformed_audio = self.audio_transform(self.audiofeatures[idx])
        transformed_image = self.image_transform(self.imagefeatures[idx])
        sample = {
            'mfcc': transformed_audio,
            'image': transformed_image,
            'label': torch.tensor(self.labels[idx]).double(),
            'filenames_audio': self.filenames_audio[idx],
            'filenames_image': self.filenames_image[idx],
        }
        return sample

In [None]:
class ImageAudio_dataloader():
    """Factory for the training and validation ``DataLoader`` objects."""

    def __init__(self, BATCH_SIZE, single, num_workers, image_path, audio_path, image_transform, audio_transform):
        """Store the settings shared by the training and validation loaders."""
        self.BATCH_SIZE = BATCH_SIZE
        self.single = single
        self.num_workers = num_workers
        self.image_path = image_path
        self.audio_path = audio_path
        self.image_transform = image_transform
        self.audio_transform = audio_transform

    def run(self):
        """Build both loaders.

        Returns:
            ``(train_loader, val_loader, dataset_size_train, dataset_size_valid)``.
        """
        train_loader, dataset_size_train = self.train()
        val_loader, dataset_size_valid = self.validate()

        return train_loader, val_loader, dataset_size_train, dataset_size_valid

    def _build(self, mode):
        """Create the dataset for ``mode`` and wrap it in a shuffling DataLoader.

        Returns ``(loader, number_of_samples)``.
        """
        split = AudioImage_dataset(
            image_path=self.image_path,
            audio_path=self.audio_path,
            mode=mode,
            single=self.single,
            image_transform=self.image_transform,
            audio_transform=self.audio_transform,
        )
        split_size = len(split)

        loader = DataLoader(
            dataset=split,
            batch_size=self.BATCH_SIZE,
            shuffle=True,
            num_workers=self.num_workers,
        )
        return loader, split_size

    def train(self):
        """Return ``(train_loader, dataset_size_train)`` for the ``'train'`` split."""
        return self._build('train')

    def validate(self):
        """Return ``(val_loader, dataset_size_valid)`` for the ``'valid'`` split.

        The validation loader shuffles as well, exactly like the training one.
        """
        return self._build('valid')

In [None]:
# Build the train and validation loaders for the paired image + audio data.
'''
change here for loading data for spatial loader, temporal loader
'''
data_loader = ImageAudio_dataloader(
    BATCH_SIZE=16,
    single=True,
    num_workers=0,
    image_path='./Multimodal-Emotion-Recognition/image_data/',  # path for image data
    audio_path='./Multimodal-Emotion-Recognition/audio_data/',  # path for audio data
    # Image pipeline: array -> PIL image -> random horizontal flip -> tensor -> per-channel normalisation.
    image_transform=transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ]),
    # Audio pipeline: the ToTensor class defined in this file (not transforms.ToTensor).
    audio_transform=transforms.Compose([ToTensor()]),
)
train_loader, valid_loader, dataset_size_train, dataset_size_valid = data_loader.run()

'''
appending train-loader and valid loader for training the model
'''
# Loaders and sample counts keyed by split name.
fullloader = {'train': train_loader, 'valid': valid_loader}
dataset_sizes = {'train': dataset_size_train, 'valid': dataset_size_valid}