In [2]:
import glob
import os
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

import math

import tensorflow as tf
from keras.layers import *
from keras import Model
import keras.backend as K
from keras.optimizers import Adam
from keras.models import load_model
from keras.layers.core import Lambda
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, Callback, ReduceLROnPlateau, EarlyStopping, ReduceLROnPlateau
from callbacks import Metrics, learningratescheduler, earlystopping, reducelronplateau
from plotting import plot_loss_and_acc
# Raise TensorFlow's C++ log threshold so less native logging is printed.
# NOTE(review): this is set after `import tensorflow` above; the variable is
# usually exported before the import -- confirm it still takes effect here.
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

# Build a TF1 session from an explicit ConfigProto and register it as the
# session used by the Keras TensorFlow backend.
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
# Fraction of GPU memory this process may claim (1 == all of it).
config.gpu_options.per_process_gpu_memory_fraction = 1
set_session(tf.Session(config=config))

In [3]:
# Natural ("human") ordering for names that contain numbers, e.g. f2 before f10.
import re
numbers = re.compile(r'(\d+)')
def numericalSort(value):
    """Return a sort key that compares digit runs in ``value`` numerically.

    ``value`` is split on runs of decimal digits. Because the pattern has a
    capturing group, the digit runs are kept and always land at the odd
    positions of the split result; those are converted to ``int`` while the
    text between them is left as ``str``.

    Example: ``"img12_frame3.png" -> ["img", 12, "_frame", 3, ".png"]``.
    """
    tokens = numbers.split(value)
    return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]

In [4]:
def l2_loss(spect1, spect2):
    """Loss between two 4-D tensors, computed on channel 0 of the last axis only.

    Both arguments are indexed as ``[:, :, :, 0]``, so every other channel of
    the last axis is ignored. The result is ``tf.sqrt`` applied to
    ``tf.nn.l2_loss`` of the channel-0 difference, reduced over the whole batch.

    A variant over all channels would be ``tf.sqrt(tf.nn.l2_loss(spect1 - spect2))``.
    """
    channel0_diff = spect1[:, :, :, 0] - spect2[:, :, :, 0]
    return tf.sqrt(tf.nn.l2_loss(channel0_diff))

In [5]:
def LipNet(input_shape, pretrained=None, output_size = 28, absolute_max_string_len=32):
    """Build the LipNet-style video network as a Keras ``Model``.

    Architecture, in order:

    * three stages of ZeroPadding3D -> Conv3D -> BatchNormalization -> ReLU ->
      SpatialDropout3D(0.5) -> MaxPooling3D, with 32, 64 and 96 filters;
    * a time-distributed ``Flatten``;
    * two bidirectional GRU layers (256 units per direction, concatenated);
    * a ``Dense(output_size)`` applied at every time step, followed by softmax.

    Args:
        input_shape: Shape of one video sample without the batch axis, e.g.
            ``(frames, width, height, channels)`` for channels-last data.
        pretrained: Controls weight loading.
            ``None``/falsy -- keep the freshly initialised weights.
            ``True`` -- load the default weight file (hard-coded path below).
            ``str`` -- load weights from that file path instead.
        output_size: Number of units of the final per-time-step Dense/softmax.
        absolute_max_string_len: Unused; kept so existing callers that pass it
            keep working.

    Returns:
        keras.Model mapping the input named ``'the_input'`` to the softmax
        activations.
    """
    # Historical default location of the pretrained weights. It is
    # machine-specific; pass a path via `pretrained` on any other machine.
    default_weights = '/Users/manideepkolla/Downloads/unseen-weights178.h5'

    input_data = Input(name='the_input', shape=input_shape, dtype='float32')

    # (stage index, filters, kernel size, conv strides, zero padding)
    conv_stages = [
        (1, 32, (3, 5, 5), (1, 2, 2), (1, 2, 2)),
        (2, 64, (3, 5, 5), (1, 1, 1), (1, 2, 2)),
        (3, 96, (3, 3, 3), (1, 1, 1), (1, 1, 1)),
    ]

    features = input_data
    for idx, n_filters, kernel, strides, padding in conv_stages:
        # Layer names (zero1, conv1, batc1, actv1, max1, ...) are kept exactly
        # as before.
        features = ZeroPadding3D(padding=padding, name='zero%d' % idx)(features)
        features = Conv3D(n_filters, kernel, strides=strides, kernel_initializer='he_normal', name='conv%d' % idx)(features)
        features = BatchNormalization(name='batc%d' % idx)(features)
        features = Activation('relu', name='actv%d' % idx)(features)
        features = SpatialDropout3D(0.5)(features)
        # Pool size 1 on the first axis: only the two trailing spatial axes shrink.
        features = MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), name='max%d' % idx)(features)

    # Collapse everything except the time axis into one feature vector per step.
    resh1 = TimeDistributed(Flatten())(features)

    gru_1 = Bidirectional(GRU(256, return_sequences=True, kernel_initializer='Orthogonal', name='gru1'), merge_mode='concat')(resh1)
    gru_2 = Bidirectional(GRU(256, return_sequences=True, kernel_initializer='Orthogonal', name='gru2'), merge_mode='concat')(gru_1)

    # transforms RNN output to character activations:
    dense1 = Dense(output_size, kernel_initializer='he_normal', name='dense1')(gru_2)

    y_pred = Activation('softmax', name='softmax')(dense1)

    model = Model(inputs=input_data, outputs=y_pred)

    if isinstance(pretrained, str):
        # Explicit weight file supplied by the caller.
        model.load_weights(pretrained)
    elif pretrained == True:
        # `== True` (not `is True`) is kept on purpose so truthy flags such as
        # 1 or numpy booleans behave exactly as they did before.
        model.load_weights(default_weights)
    return model

In [6]:
class VideoModel():
    """Builder for an audio-visual network that masks a spectrogram magnitude.

    The audio input is a ``(time, freq, 2)`` tensor whose last axis is indexed
    as ``[magnitude, phase]``. ``FullModel`` predicts a sigmoid mask, multiplies
    it with the input magnitude and returns the masked magnitude stacked with
    the untouched input phase.

    Args:
        filters: Filter count of the dilated Conv2D layers ``conv1``..``conv6``.
            Those layers are created but not wired into ``FullModel``.
        filters_audio: Base filter count of the audio conv stack. The last audio
            conv uses ``filters_audio // 12`` filters, so values below 12 give
            zero filters.
        audio_ip_shape: Audio input shape; index 0 is the number of time steps
            and index 1 the number of frequency bins.
        video_ip_shape: Shape of one video sample, forwarded to ``LipNet`` as
            its ``input_shape``.
    """

    def __init__(self, filters,filters_audio, audio_ip_shape, video_ip_shape):

        self.filters = filters
        self.filters_audio=filters_audio
        self.audio_ip_shape = audio_ip_shape
        self.video_ip_shape = video_ip_shape

        # Dilated conv/batch-norm pairs, exposed as self.conv1..conv6 and
        # self.bn1..bn6. FullModel does not use them; they are still created,
        # in the original order, so the attributes and the auto-generated
        # layer names stay as before.
        dilated_specs = [(7, 1), (5, 1), (5, 2), (5, 4), (5, 8), (5, 16)]
        for idx, (kernel, dilation) in enumerate(dilated_specs, start=1):
            setattr(self, 'conv%d' % idx,
                    Conv2D(filters = filters, kernel_size = kernel, padding = "same",
                           dilation_rate = (dilation, dilation), activation = "relu"))
            setattr(self, 'bn%d' % idx, BatchNormalization(axis=-1))

        # Not convolutions despite the names: conv7 appends a trailing channel
        # axis, conv8 resizes the first spatial axis (video frames) to the
        # number of audio time steps so both streams can be concatenated.
        self.conv7 = Lambda(lambda x : tf.expand_dims(x, axis = -1))

        # Previously hard-coded to 298, which only matched audio_ip_shape[0] == 298.
        n_time_steps = int(audio_ip_shape[0])
        self.conv8 = Lambda(lambda x: tf.image.resize_nearest_neighbor(x, size = (n_time_steps, x.shape[-2])))

    def _audio_stream(self, spectrogram):
        """Run the spectrogram through nine Conv2D(relu) + BatchNormalization pairs."""
        base = self.filters_audio
        # (filters, square kernel size) for each conv, in order.
        stack = ([(base, 3)] * 3
                 + [(base * 2, 3)] * 2
                 + [(base * 3, 3), (base * 3, 5), (base * 3, 5)]
                 + [(base // 12, 5)])

        conv = spectrogram
        for n_filters, kernel in stack:
            conv = Conv2D(filters = n_filters, kernel_size = (kernel, kernel), strides = (1,1), padding = "same",
                          dilation_rate = (1,1), activation = "relu")(conv)
            print("conv ", conv.shape)
            conv = BatchNormalization(axis=-1)(conv)
        return conv

    def FullModel(self, lipnet_pretrained):
        """Assemble and return the two-input Keras model.

        Args:
            lipnet_pretrained: Forwarded to ``LipNet(pretrained=...)``.

        Returns:
            keras.Model with inputs ``[audio_spectrogram, lipnet_video_input]``
            and one output of shape ``(time, freq, 2)``: the masked magnitude
            concatenated with the input phase.
        """
        n_time, n_freq = self.audio_ip_shape[0], self.audio_ip_shape[1]

        ip = Input(shape = (n_time, n_freq, 2))

        # Split the stacked [magnitude, phase] input.
        ip_magnitude = Lambda(lambda x : x[:,:,:,0],name="ip_mag")(ip)
        ip_phase = Lambda(lambda x : tf.expand_dims(x[:,:,:,1], axis = -1),name="ip_phase")(ip)

        # ---- audio stream -------------------------------------------------
        audio_stream = self._audio_stream(ip)
        print('audio_stream', audio_stream.shape)

        # ---- video stream -------------------------------------------------
        # The LipNet input shape now follows the constructor argument instead
        # of a hard-coded (500, 50, 100, 3).
        lipnet_shape = tuple(int(dim) for dim in self.video_ip_shape)
        lipnet_model = LipNet(input_shape = lipnet_shape, pretrained=lipnet_pretrained)
        # LipNet's per-frame softmax output goes through two linear Dense layers.
        x = lipnet_model.output
        x = Dense(128, kernel_initializer='he_normal', name='dense2')(x)
        x = Dense(256, kernel_initializer='he_normal', name='dense3')(x)
        x = self.conv7(x)
        video_stream_1 = self.conv8(x)

        # ---- fusion -------------------------------------------------------
        audio_flatten = TimeDistributed(Flatten())(audio_stream)
        print(audio_flatten.shape)
        video_flatten_1 = TimeDistributed(Flatten())(video_stream_1)
        print(video_flatten_1.shape)

        concated = concatenate([audio_flatten, video_flatten_1], axis = 2)
        print("concat shape ", concated.shape)

        lstm = Bidirectional(LSTM(units = 64, return_sequences = True, activation = "tanh"))(concated)

        flatten = Flatten()(lstm)

        dense = Dense(100, activation = "relu")(flatten)

        # Two masks' worth of units are predicted; only the first is used below.
        dense = Dense(2 * n_time * n_freq, activation = "sigmoid")(dense)

        combo_mask = Reshape([2 , n_time, n_freq])(dense)
        mask_1 = Lambda(lambda x : x[:,0])(combo_mask)

        # ---- apply mask and re-attach phase -------------------------------
        output_mag_1 = Lambda(lambda x : tf.multiply(x[0], x[1]), name = "mask_multiply_1")([ip_magnitude, mask_1])

        output_mag_1 = Lambda(lambda x : tf.expand_dims(x, axis= -1), name= "expand_dim_1")(output_mag_1)

        output_final_1 = Lambda(lambda x : tf.concat(values=[x[0], x[1]], axis = -1),name="concat_mag_phase_1")([output_mag_1, ip_phase])

        model = Model([ip, lipnet_model.input], [output_final_1])

        return model

In [8]:
# Build the audio-visual model. Arguments, in order: filters (256),
# filters_audio (96), audio input shape (298, 257, 2) and video input shape
# (500, 50, 100, 3). lipnet_pretrained=None means no LipNet weights are loaded.
model = VideoModel(256,96,(298,257,2),(500,50,100,3)).FullModel(lipnet_pretrained = None)

conv  (?, 298, 257, 96)
conv  (?, 298, 257, 96)
conv  (?, 298, 257, 96)
conv  (?, 298, 257, 192)
conv  (?, 298, 257, 192)
conv  (?, 298, 257, 288)
conv  (?, 298, 257, 288)
conv  (?, 298, 257, 288)
conv  (?, 298, 257, 8)
audio_stream (?, 298, 257, 8)
(?, 298, 2056)
(?, 298, 256)
concat shape  (?, 298, 2312)


In [11]:
# Compile the model with Adam at a fixed initial learning rate and the custom
# l2_loss defined earlier in this notebook.
# NOTE(review): 'accuracy' is tracked alongside a regression-style loss --
# confirm it is meaningful here; the checkpoint cell monitors 'val_acc'.
lrate = 0.0001
model.compile(optimizer = Adam(lr=lrate), loss = l2_loss, metrics=['accuracy'])

In [12]:
# Every entry directly under the output folder, ordered with the numericalSort key.
# NOTE(review): absolute, machine-specific path; `folders_list` is not
# referenced by any later cell visible in this notebook.
folders_list = sorted(glob.glob('/Users/manideepkolla/Downloads/test_fold/output/*'), key=numericalSort)

In [13]:
# DataGenerator

def DataGenerator(lips_filelist, masks_filelist, spects_filelist, batch_size):
    """Endlessly yield batches built from three parallel lists of file paths.

    The lists are walked in order, ``batch_size`` items at a time (the last
    batch of a pass may be smaller), and the walk restarts from the beginning
    once the end is reached.

    Args:
        lips_filelist: Paths handed one by one to ``get_video_frames``. That
            function is not defined in this notebook's visible cells and must
            already exist in the kernel namespace.
        masks_filelist: Paths loaded with ``np.load``.
        spects_filelist: Paths loaded with ``np.load``.
        batch_size: Positive number of samples per batch.

    Yields:
        ``(X_lips, X_spect, X_mask)`` -- three arrays stacked along a new
        leading batch axis.

    Raises:
        ValueError: on the first ``next()`` if the lists are empty, differ in
            length, or ``batch_size`` is not positive. Without these checks the
            generator would spin forever without yielding, or pair up
            unrelated files.

    NOTE(review): Keras ``fit_generator`` reads a 3-tuple as
    ``(inputs, targets, sample_weights)``, while the model built in this
    notebook takes two inputs -- confirm the intended packing (possibly
    ``([X_spect, X_lips], target)``) before training.
    """
    # The original read `len(files)`, a name that is never defined.
    total = len(lips_filelist)

    if total == 0:
        raise ValueError("DataGenerator needs at least one sample")
    if len(masks_filelist) != total or len(spects_filelist) != total:
        raise ValueError("lips, masks and spects file lists must have the same length")
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    # The outer loop makes the generator infinite, which Keras requires.
    while True:
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)

            X_lips = np.asarray([get_video_frames(fname) for fname in lips_filelist[start:stop]])

            X_mask = np.asarray([np.load(fname) for fname in masks_filelist[start:stop]])

            X_spect = np.asarray([np.load(fname) for fname in spects_filelist[start:stop]])

            yield X_lips, X_spect, X_mask


In [14]:
# Callbacks
# The variables below deliberately keep the same names as the factories
# imported from `callbacks`, because the fit cell refers to them. Calling the
# factories through the module keeps this cell re-runnable: once the bare names
# hold callback instances, calling them again would call an instance instead
# of the factory.
import callbacks as callbacks_module

metrics = Metrics()
learningratescheduler = callbacks_module.learningratescheduler()
earlystopping = callbacks_module.earlystopping()
reducelronplateau = callbacks_module.reducelronplateau()

In [15]:
# Path to save model checkpoints

# Sub-directory (below the models root) for this run; '' means the root itself.
path = ''

# makedirs(exist_ok=True) tolerates an existing directory and also creates
# missing parents. Any other failure (e.g. permissions) now surfaces here
# instead of being swallowed and reappearing at the first checkpoint save.
checkpoint_dir = os.path.join('/home/manideep/models/', path)
os.makedirs(checkpoint_dir, exist_ok=True)

filepath = os.path.join(checkpoint_dir, 'weights-best.hdf5')
# Only the best epoch is kept, judged by the highest 'val_acc'.
# NOTE(review): confirm 'val_acc' is the right quantity to monitor for this loss.
checkpoint_save_weights = ModelCheckpoint(filepath, monitor='val_acc', save_best_only=True, mode='max')

In [None]:
# Fit Generator
# NOTE(review): lips_filelist / masks_filelist / spects_filelist and their
# *_val counterparts are not defined in any cell visible in this notebook;
# they must exist in the kernel before this cell runs.

batch_size = 16
epochs = 20

# Number of batches needed to cover each list once (the last batch may be
# partial). The original np.ceil(...) calls were missing their closing
# parenthesis, which made this cell a SyntaxError; the counts are also cast to
# int because Keras expects integer step counts.
steps_per_epoch = int(np.ceil(len(lips_filelist) / float(batch_size)))
validation_steps = int(np.ceil(len(lips_filelist_val) / float(batch_size)))

history = model.fit_generator(
    DataGenerator(lips_filelist, masks_filelist, spects_filelist, batch_size),
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    validation_data=DataGenerator(lips_filelist_val, masks_filelist_val, spects_filelist_val, batch_size),
    validation_steps=validation_steps,
    callbacks=[earlystopping, learningratescheduler, checkpoint_save_weights],
    verbose=1)
