In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os # interface with underlying OS that python is running on
# CREATE DIRECTORY OF AUDIO FILES 
path = "/content/drive/MyDrive/ravdess/ravdess/"
actor_folders = os.listdir(path) #list files in audio directory
actor_folders.sort() 
actor_folders[0:5]

['Actor_21', 'Actor_22', 'Actor_23', 'Actor_24', 'Images']

In [None]:
try:
    # creating a folder named Wave
    if not os.path.exists('/content/drive/MyDrive/ravdess/ravdess/Wave'): 
        os.makedirs('/content/drive/MyDrive/ravdess/ravdess/Wave') 
  
# if not created then raise error 
except OSError: 
    print ('Error: Creating directory of Wave') 
  

# Process Audio Files

## Extract Audio From Video

In [3]:
import os
from glob import glob
from tqdm import tqdm

try:
    # creating a folder named Wave
    if not os.path.exists('/content/drive/MyDrive/ravdess/ravdess/Wave'): 
        os.makedirs('/content/drive/MyDrive/ravdess/ravdess/Wave') 
  
# if not created then raise error 
except OSError: 
    print ('Error: Creating directory of Wave')

inputdir = '/content/drive/MyDrive/ravdess/ravdess/Actor_*' 
outputdir = '/content/drive/MyDrive/ravdess/ravdess/Wave'
#iterate over Actor folders
for i in actor_folders:
      print(i)
      # go through files in Actor folder
      for filename in os.listdir('/content/drive/MyDrive/ravdess/ravdess/' + i):
           actual_filename = filename[:-4]
           os.system('ffmpeg -i {}/{} -acodec pcm_s16le -ar 16000 {}/{}.wav'.format(inputdir, filename, outputdir, actual_filename))
          

Actor_21
Actor_22
Actor_23
Actor_24
Images
Wave


In [19]:
import shutil, sys 
shutil.rmtree('/content/drive/MyDrive/ravdess/ravdess/Wave')

In [None]:
pip install tqdm




## Extract Features And Do Some Process Operations

In [4]:
def extract_feature(file_name):
    """Function Extracts Features from WAV file"""
    X, sample_rate = librosa.load(file_name)
    stft=np.abs(librosa.stft(X))
    result=np.array([])
    mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    result=np.hstack((result, mfccs))
    chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    result=np.hstack((result, chroma))
    mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    result=np.hstack((result, mel))
    return result

In [5]:
emotions={
  '01':'neutral/calm',
  '02':'happy',
  '03':'sad',
  '04':'angry',
  '05':'fearful',
  '06':'disgust',
  '07':'surprised'
}

def gender(g):
    """Returns Gender Label"""
    if int(g[0:2]) % 2 == 0:
        return 'female'
    else:
        return 'male'

In [6]:
from tqdm import tqdm
import glob 
import pandas as pd
import numpy as np
import librosa
import librosa.display
  
x,y=[],[]
for file in tqdm(glob.glob("/content/drive/MyDrive/ravdess/ravdess/Wave/*.wav")):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[3]] + '_' + gender(file_name.split("-")[-1])
        feature=extract_feature(file)
        mean = np.mean(feature, axis=0)
        std = np.std(feature, axis=0)
        feature =(feature - mean)/std

        x.append(feature)
        y.append(emotion)

100%|██████████| 453/453 [01:52<00:00,  4.01it/s]


In [7]:
print(x)
print(y)

[array([-1.32529826e+01,  1.80034398e+00,  7.95289647e-02,  5.94898909e-01,
        1.19141786e-01,  1.10474789e-01,  7.09013729e-02,  1.49062920e-02,
       -1.36793007e-01,  8.61843924e-02,  9.86503041e-02, -1.84123744e-02,
        1.10268349e-01,  8.38261475e-02,  8.77421762e-02,  1.39689045e-02,
        6.23162500e-02,  4.87875366e-02,  9.63537574e-02,  3.70109374e-02,
        1.45853442e-02,  7.00071068e-02, -2.02334578e-04,  1.06274154e-02,
        1.25003479e-02,  3.91032035e-02,  4.20328874e-03,  3.97497293e-02,
        6.44163309e-02,  7.56374773e-02,  5.82302460e-02,  2.13742782e-02,
        3.72111005e-02,  2.67856606e-02,  5.19930279e-02,  3.19869956e-02,
        2.45804313e-02,  2.40483524e-03,  1.80888676e-02,  2.83622045e-02,
        7.79588193e-02,  7.84959187e-02,  7.93505097e-02,  7.92122868e-02,
        7.95605343e-02,  8.03745865e-02,  8.01339573e-02,  8.15845960e-02,
        8.14274364e-02,  8.04246936e-02,  7.98900319e-02,  7.88342884e-02,
        6.43882247e-02, 

In [7]:
print(y)

['neutral/calm_male', 'neutral/calm_male', 'neutral/calm_male', 'neutral/calm_male', 'neutral/calm_male', 'neutral/calm_male', 'happy_male', 'neutral/calm_male', 'neutral/calm_male', 'happy_male', 'happy_male', 'neutral/calm_male', 'happy_male', 'neutral/calm_male', 'neutral/calm_male', 'neutral/calm_male', 'happy_male', 'happy_male', 'happy_male', 'neutral/calm_male', 'happy_male', 'neutral/calm_male', 'neutral/calm_male', 'neutral/calm_male', 'happy_male', 'happy_male', 'happy_male', 'happy_male', 'neutral/calm_male', 'neutral/calm_male', 'neutral/calm_male', 'neutral/calm_male', 'happy_male', 'happy_male', 'happy_male', 'happy_male', 'neutral/calm_male', 'neutral/calm_male', 'neutral/calm_male', 'neutral/calm_male', 'happy_male', 'happy_male', 'happy_male', 'happy_male', 'neutral/calm_male', 'neutral/calm_male', 'neutral/calm_male', 'neutral/calm_male', 'happy_male', 'happy_male', 'happy_male', 'happy_male', 'neutral/calm_male', 'neutral/calm_male', 'neutral/calm_male', 'neutral/cal

# Extract Images From Videos

In [9]:
# Importing all necessary libraries 
import cv2 
import os 
Image="/content/drive/MyDrive/ravdess/ravdess/"
currentFrame = 0
for i in actor_folders:
    filename = os.listdir(Image + i) #iterate over Actor folders
    for file in filename: # go through files in Actor folder
       if file.endswith(".mp4"):
         path=os.path.join("/content/drive/MyDrive/ravdess/ravdess/" + i + '/', str(file))
         cam = cv2.VideoCapture(path) 
         #print(path)
         #images, frame_times = get_frames_from_video(file, STRIDE)
         #frame_capture(path)
         while(True): 
         # reading from frame 
           ret,frame = cam.read() 
           if currentFrame == 6:
             break
           else:
         # if video is still left continue creating images 
              name = '/content/drive/MyDrive/ravdess/ravdess/Images/frame' + str(currentFrame) + '.jpg'
              print ('Creating...' + name)
              cv2.imwrite(name,frame) 
           currentFrame += 1 
         # writing the extracted images 
             
         # increasing counter so that it will 
         # show how many frames are created 
             
     
  
# Release all space and windows once done 
cam.release() 
cv2.destroyAllWindows() 

Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame0.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame1.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame2.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame3.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame4.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame5.jpg


In [10]:
# Importing all necessary libraries 
import cv2 
import os 
  
# Read the video from specified path 
cam = cv2.VideoCapture("/content/drive/MyDrive/ravdess/ravdess/Actor_21/01-01-01-01-01-01-21.mp4") 
  
try: 
      
    # creating a folder named data 
    if not os.path.exists('/content/drive/MyDrive/ravdess/ravdess/Images'): 
        os.makedirs('/content/drive/MyDrive/ravdess/ravdess/Images') 
  
# if not created then raise error 
except OSError: 
    print ('Error: Creating directory of images') 
  
# frame 
currentframe = 0
  
while(True): 
      
    # reading from frame 
    ret,frame = cam.read() 
  
    if ret: 
        # if video is still left continue creating images 
        name = '/content/drive/MyDrive/ravdess/ravdess/Images/frame' + str(currentframe) + '.jpg'
        print ('Creating...' + name) 
  
        # writing the extracted images 
        cv2.imwrite(name, frame) 
  
        # increasing counter so that it will 
        # show how many frames are created 
        currentframe += 1
    else: 
        break
  
# Release all space and windows once done 
cam.release() 
cv2.destroyAllWindows() 

Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame0.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame1.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame2.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame3.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame4.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame5.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame6.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame7.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame8.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame9.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame10.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame11.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame12.jpg
Creating.../content/drive/MyDrive/ravdess/ravdess/Images/frame13.jpg
Creating.../content/drive/MyDrive/ravdess/ra

In [27]:
import shutil, sys 
shutil.rmtree('/content/drive/MyDrive/ravdess/ravdess/Images/')

In [11]:
import cv2
import numpy as np
import os

def frame_capture(file):
  # Playing video from file:
  cap = cv2.VideoCapture(file)

  try:
      if not os.path.exists('/content/drive/MyDrive/ravdess/ravdess/Images'):
          os.makedirs('/content/drive/MyDrive/ravdess/ravdess/Images')
  except OSError:
      print('Error: Creating directory of the Image')

  currentFrame = 0
  while(True):
      # Capture frame by frame
      ret, frame = cap.read()
      # Only take the first frame and tenth frame
      if currentFrame == 6:
          name = '/content/drive/MyDrive/ravdess/ravdess/Images/frame' + str(currentFrame) + '.jpg'
          print ('Creating...' + name)
          cv2.imwrite(name, frame)

      # To stop duplicate images
      currentFrame += 1

  cap.release()
  cv2.destroyAllWindows()

In [13]:
try: 
      
    # creating a folder named data 
    if not os.path.exists('/content/drive/MyDrive/ravdess/ravdess/Images/faces'): 
        os.makedirs('/content/drive/MyDrive/ravdess/ravdess/Images/faces') 
  
# if not created then raise error 
except OSError: 
    print ('Error: Creating directory of images') 
  

In [21]:
ImagePath= glob.glob("/content/drive/MyDrive/ravdess/ravdess/Images/*.jpg")
currentface = 0
for img in ImagePath:
          #p=path.joinpath(i)
         # print(img)
          #ret,frame = cam.read()
    image = cv2.imread(img)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faceCascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
    faces = faceCascade.detectMultiScale(
        gray,
        scaleFactor=1.3,
        minNeighbors=3,
        minSize=(30, 30))
    print("Found {0} Faces!".format(len(faces)))
    for (x, y, w, h) in faces:
         i=cv2.rectangle(image, (x, y), (x+w, y+h), (0, 255, 0), 2)
         path =  '/content/drive/MyDrive/ravdess/ravdess/Images/faces/face' + str(currentface) + '.jpg'
         currentface +=1
         cv2.imwrite(path, i)

Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Faces!
Found 1 Fa

In [None]:
try:
    # creating a folder named data 
    if not os.path.exists('/content/drive/MyDrive/ravdess/ravdess/Images/ResizedImage'): 
        os.makedirs('/content/drive/MyDrive/ravdess/ravdess/Images/ResizedImage') 
  
# if not created then raise error 
except OSError: 
    print ('Error: Creating directory of faces') 
  
path = "/content/drive/MyDrive/ravdess/ravdess/Images/faces/"
dirs = os.listdir(path)
for item in dirs:
            im = Image.open(path + item)
            f, e = os.path.splitext(path+item)
            imResize = im.resize((64,64), Image.ANTIALIAS)
            imResize.save('/content/drive/MyDrive/ravdess/ravdess/Images/ResizedImage/'+item, 'JPEG', quality=90)


In [55]:
import shutil, sys 
shutil.rmtree('/content/drive/MyDrive/ravdess/ravdess/Images/ResizedImage')

## Split The Dataset

In [None]:
#Important parameters and training/Test size
prediction_time = 1 
testdatasize = 8000
unroll_length = 50
testdatacut = testdatasize + unroll_length  + 1

#Training data
x_train = data_n[0:-prediction_time-testdatacut].values
y_train = data_n[prediction_time:-testdatacut  ][0].values

#Test data
x_test = data_n[0-testdatacut:-prediction_time].values
y_test = data_n[prediction_time-testdatacut:  ][0].values

## Building The LSTM Model

In [58]:
#Importing Required Libraries
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
sns.set(color_codes=True)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn
import matplotlib.dates as md
from sklearn import preprocessing
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.optimizers import Adam
import time
import datetime
from datetime import datetime

In [None]:
model = Sequential()

model.add(LSTM(100,input_dim=x_train.shape[-1], return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(100, return_sequences=False))
model.add(Dropout(0.2))

model.add(Dense(6))
model.add(Activation('linear'))

start = time.time()
model.compile(loss='mae', optimizer='Adam')
print('compilation time : {}'.format(time.time() - start))
model.summary()

In [None]:
if file.endswith('.jpg'):
            im = Image.open(file)
            f, e = os.path.splitext(file)
            imResize = im.resize((64,64), Image.ANTIALIAS)
            imResize.save(f + ' resized.jpg', 'JPEG', quality=90)
            print('done')

In [24]:
try:
    # creating a folder named data 
    if not os.path.exists('/content/drive/MyDrive/ravdess/ravdess/Images/faces'): 
        os.makedirs('/content/drive/MyDrive/ravdess/ravdess/Images/faces') 
  
# if not created then raise error 
except OSError: 
    print ('Error: Creating directory of faces') 
  

In [None]:

status = cv2.imwrite('faces_detected.jpg', image)

In [None]:
print(status)

True


In [None]:
pip install tqdm



In [None]:
#!/usr/bin/python
from PIL import Image
import os, sys

#path = "/root/Desktop/python/images/"
path = "/content/"
dirs = os.listdir( path )

def resize():
    for item in dirs:
        if os.path.isfile(path+item):
            im = Image.open(path+item)
            f, e = os.path.splitext(path+item)
            imResize = im.resize((64,64), Image.ANTIALIAS)
            imResize.save(f + ' resized.jpg', 'JPEG', quality=90)

resize()

In [None]:
print(images)

[]


In [60]:
def get_frames_from_video(video_file, stride=1.0):
    """
    video_file - path to file
    stride - i.e 1.0 - extract frame every second, 0.5 - extract every 0.5 seconds
    return: list of images, list of frame times in seconds
    """
    video = cv2.VideoCapture(video_file)
    fps = video.get(cv2.CAP_PROP_FPS)
    i = 0.
    images = []
    frame_times = []

    while video.isOpened():
        ret, frame = video.read()
        if ret:
            images.append(frame)
            frame_times.append(i)
            i += stride
            video.set(1, round(i * fps))
        else:
            video.release()
            break
    return images, frame_times


def resize_if_necessary(image, max_size=1024):
    """
    if any spatial shape of image is greater 
    than max_size, resize image such that max. spatial shape = max_size,
    otherwise return original image
    """
    if max_size is None:
        return image
    height, width = image.shape[:2]
    if max([height, width]) > max_size:
        ratio = float(max_size / max([height, width]))
        image = cv2.resize(image, (0, 0), fx=ratio, fy=ratio, interpolation=cv2.INTER_CUBIC)
    return image


In [None]:

STRIDE = 1.0
MAX_IMAGE_SIZE = 64
#images, frame_times = get_frames_from_video('/content/drive/MyDrive/ravdess/ravdess/Actor_21/01-01-01-01-01-01-21.mp4', STRIDE)
images = [resize_if_necessary(image, MAX_IMAGE_SIZE) for image in images]

In [None]:
import os
import cv2
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(20,10))
columns = 5
for i, (image, frame_time) in enumerate(zip(images, frame_times)):
    plt.subplot(len(images) / columns + 1, columns, i + 1).set_title("Frame time: " + str(frame_time))
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

<Figure size 1440x720 with 0 Axes>

In [None]:
print(images)

[]



# Assignment 2/C
**Disclaimer: Only for ADL/AML students!**

### General information
**You have to solve all tasks to pass!**

Grade is calculated by the day of the last submission, but you will only get it after you've succesfully presented it. 

*Deadlines and grades:* 
  * 2020.10.27 - 2020.11.24 ==> 5
  * 2020.11.25 - 2020.12.01 ==> 4
  * 2020.12.02 - 2020.12.08 ==> 3
  * 2020.12.09 - 2020.12.15 ==> 2
  * 2020.12.16 or later ==> 1

You can **use only these** 3rd party **packages:** `cv2, keras, matplotlib, numpy, pandas, sklearn, skimage, tensorflow, librosa`.

### Description
In this assignment you have to build a multimodal deep neural network for emotion detection using tf.keras. You have to work with the RAVDESS dataset, which contains short (~4 seconds long) video clip recordings of speakers, who are acting the different emotions through 2 sentences. We will extract and combine RGB frames with MFCCs and utilize both video and audio information sources to achieve a better prediction.

### Use GPU
Runtime -> Change runtime type

At Hardware accelerator select  GPU then save it.  

### Useful shortcuts
* Run selected cell: *Ctrl + Enter*
* Insert cell below: *Ctrl + M B*
* Insert cell above: *Ctrl + M A*
* Convert to text: *Ctrl + M M*
* Split at cursor: *Ctrl + M -*
* Autocomplete: *Ctrl + Space* or *Tab*
* Move selected cells up: *Ctrl + M J*
* Move selected cells down: *Ctrl + M K*
* Delete selected cells: *Ctrl + M D*

If you have any question, feel free to ask.
fodorad201@gmail.com

## Prepare dataset

* Download the RAVDESS dataset. Here you can find more information about the dataset: https://zenodo.org/record/1188976#.X5g53OLPw2w
The dataset is available here as well: http://nipg1.inf.elte.hu:8765
ravdess.zip contains all of the mp4 clips. The labels are in the file names. (classification task)

* Preprocess the data.
  * Remove the silence parts from the beginning and the end of video clips. (Tips: ffmpeg filters)
  * Audio representation: 
    * Extract the audio from the video. (Tips: ffmpeg)
    * Extract 24 Mel Frequency Cepstral Coefficients from the audio. (Tips: use librosa.)
    * Calculate the mean number of (spectral) frames in the dataset.
    * Standardize the MFCCs sample-wise. (Tips: zero mean and unit variance)
    * Use pre-padding (Note: with 0, which is also the mean after standardization) to unify the length of the samples.
    * Audio representation per sample is a tensor with shape (N,M,1) where N is the number of coefficients (e.g. 24) and M is the number of audio frames.
  * Visual representation:
    * Extract the faces from the images. (Tips: You can use the cv2.CascadeClassifier, or the DLIB package to determine facial keypoints, or MTCNN to predict bounding boxes.)
    * Resize the face images to 64x64. (Tips: You can use lower/higher resolution as well.)
    * Subsample the frames to reduce complexity (6 frames/video is enough).
    * Apply data augmentation, and scaling [0, 1].
    * Video representation per sample is a tensor with shape (F,H,W,3) where F is the number of frames (e.g. 6), H and W are the spatial dimensions (e.g. 64).
  * Ground truth labels:
    * There are 8 class labels. However, Class 1 (Neutral) and Class 2 (Calm) are almost the same. It is a commonly used practice to merge these two classes. Combine them to reduce complexity.
    * (Optional) Use one-hot-encoding with categorical_crossentropy loss later on, or keep them between [0, 6] and use sparse_categorical_crossentropy loss. It's up to you.

* Split the datasets into train-valid-test sets. Samples from the same speaker shouldn't appear in multiple sets. (Example split using speaker ids: 1-17: train set, 18-22: validation set, 23-24: test set)
* Create a generator, which iterates over the audio and visual representations. (Note: the generator should produce a tuple ([x0, x1], y), where x0 is the audio, x1 is the video representation, y is the ground truth.
* Print the size of each set, plot 3 samples: frames, MFCCs and their corresponding emotion class labels. (Tips: use librosa for plotting MFCCs)

Alternative considerations. They may require additional steps:
* You can use Mean (axis=1) MFCCs vectors to further reduce complexity. Input of the corresponding subnetwork should be modified to accept inputs with shape (N, 1).
* You can use log-melspectrograms as well. Note, that raw spectrograms are displaying power. Mel scale should be applied on the frequency axis, and log on the third dimension (decibels are expected). You can use librosa for that (librosa.feature.melspectrogram, librosa.power_to_db)
* A better evaluation procedure here is the LOO (Leave-One-Out) cross-validation, however it can be costy.

In [60]:
# New model
from keras.layers.convolutional import Conv1D
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, LSTM, merge
from keras.layers import BatchNormalization
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Conv1D, MaxPooling1D, SimpleRNN, Bidirectional, Input
from tensorflow.python.keras import Model, Sequential
def fcnet(input_size=(64, 64, 3)):
 model = Sequential()
 model.add(Conv1D(32, 3),input_shape=input_size,Activation('relu'))
 model.add(BatchNormalization())
 model.add(Activation('relu'))
 model.add(Conv1D(32, 3))
 model.add(BatchNormalization())
 model.add(Activation('relu'))
 model.add(Conv1D(64, 3))
 model.add(BatchNormalization())
 model.add(Activation('relu'))
 model.add(Dropout(0.25))
 model.add(Dense(128)) # Target class number
 model.add(Activation('relu'))
 model.summary()

SyntaxError: ignored

In [65]:
output_length = 32 

dropout = 0.5
lstm_dim = 256

In [66]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers import Bidirectional, TimeDistributed
from keras.callbacks import EarlyStopping

def fcnet(input_size=(64, 64, 3)):
 model = Sequential()
 model.add(Bidirectional(LSTM(64), reset_selective = True))
 model.add(tf.keras.layers.Dropout(0.5))
 model.add(tf.keras.layers.Bidirectional(LSTM(64)))
 model.add(tf.keras.layers.Dense(units = 128, activation = 'relu'))
 model.add(Dense(units = 1, activation = 'softmax'))
 print(model.summary())

model = fcnet(input_size=(64, 64, 3))
model.summary()

TypeError: ignored

In [67]:
token_num = 4000 
data_length =100
model = Sequential()
model.add(Embedding(output_dim = output_length, 
                    input_dim = token_num, 
                    input_length = data_length))
model.add(Dropout(dropout))

# using BLSTM (this will be better than LSTM, Avg acc is around 0.85
model.add(Bidirectional(LSTM(lstm_dim), merge_mode = 'sum'))
model.add(Dropout(dropout))


model.add(Dense(units = 256, activation = 'relu'))
model.add(Dropout(dropout))

model.add(Dense(units = 1, activation = 'sigmoid'))
print(model.summary())

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 32)           128000    
_________________________________________________________________
dropout (Dropout)            (None, 100, 32)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               591872    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 256)               65792     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                

## Create Model

* Create the audio subnetwork
  * Choose one of these:
    * BLSTM (64 units, return sequences) + Dropout 0.5 + BLSTM (64 units) + Dense (128 units, ReLU)
    * Conv1D (32 filters, 3x3) + BatchNorm + ReLU, Conv1D (32 filters, 3x3) + BatchNorm + ReLU, Conv1D (64 filters, 3x3) + BatchNorm + ReLU, LSTM (64 units) + Dropout 0.5 + Dense (128 units, ReLU)
    * Conv2D (32 filters, 3x3) + BatchNorm + ReLU, MaxPool2D, Conv2D (32 filters, 3x3) + BatchNorm + ReLU, MaxPool2D, Flatten, Dense (128 units, ReLU)
  * You can try other configurations, better submodels. Have a reason for your choice!
* Create the visual subnetwork
  * Choose a visual backbone, which is applied frame-wise (Tips: use TimeDistributed Layer for this):
    * VGG-like architecture (Conv2D + MaxPooling blocks)
    * ResNet / Inception architecture (Residual blocks, Inception cells)
  * You can try other configurations, better submodels (like 3D convolution nets). Have a reason for your choice!
  * Apply Max pooling over the time dimension to reduce complexity (or use GRU or LSTM for better temporal modelling)
* Model fusion:
  * Concatenate the final hidden representations of the audio and visual subnetwork.
  * Apply fully connected layers on it (256 units, ReLU), then an another dense layer (7 units, softmax).
  * You can feed multiple inputs to the Model using a list: 
  model = tf.keras.models.Model(inputs=[input_audio, input_video], outputs=output)

## Extra task (Optional)
Use the VGGFace2 model (and pretrained weights) in the visual subnetwork. It is trained on faces, so a much better representation can be obtained with it. Finetune the network for enhanced prediction.
(code: https://github.com/rcmalli/keras-vggface, but other implementation can be used as well)
Note, that this repository use the classic keras, while we are using tf.keras.
It may rise compatibility problems.

**If you can successfully use the VGGFace2 pretrained net, +1 is added to the final grade.**

## Additional notes

* Do the preprocessing steps offline, create pkl (or npy, hdf5, etc..) files, so you don't have to repeat most of the steps again. Then you can upload it, and train using colab without much struggling.
* Use Adam optimizer.
* Use learning rate scheduler.
* Check the training curve. If overfitting happens, add more regularization: weight decay (L2: 1e-3, 5e-4, etc...), Dropout

## Final steps, evaluation

* Plot the training / validation curve.
* Calculate accuracy, print a confusion matrix.