<a href="https://colab.research.google.com/github/ngolla/video-captioning/blob/master/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
from google.colab import drive
from pathlib import Path
import numpy as np

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder of the mounted Drive; list its top-level entries as a quick sanity check.
drive_path = Path('/content/drive/My Drive')
[entry for entry in drive_path.glob('*')]

[PosixPath('/content/drive/My Drive/Colab Notebooks'),
 PosixPath('/content/drive/My Drive/YouTubeClips.tar'),
 PosixPath('/content/drive/My Drive/video_corpus.csv'),
 PosixPath('/content/drive/My Drive/.ipynb_checkpoints'),
 PosixPath('/content/drive/My Drive/VideoArrays'),
 PosixPath('/content/drive/My Drive/pickle_train.dat'),
 PosixPath('/content/drive/My Drive/pickle_test.dat')]

In [None]:
# Locations of the pickled splits inside the Drive folder.
# Note that the "validation" file is the one named pickle_test.dat.
train_file = drive_path / 'pickle_train.dat'
val_file = drive_path / 'pickle_test.dat'

In [None]:
# Each pickle holds a (videos, captions) pair.
# NOTE: pickle.load can execute arbitrary code -- only load files you created or trust.
# The `with` blocks guarantee the file handles are closed once loading finishes
# (the bare `open(...)` calls used previously were never closed).
with open(val_file, "rb") as val_fh:
    (vid_test, cap_test) = pickle.load(val_fh)
with open(train_file, "rb") as train_fh:
    (vid_train, cap_train) = pickle.load(train_fh)

In [None]:
# For each split (train first, then test): number of videos and the shape of the first one.
for split_videos in (vid_train, vid_test):
    print(len(split_videos), split_videos[0].shape)

64639 (80, 4096)
16160 (80, 4096)


In [None]:
# Keep only the first N_SAMPLES examples, stacked into dense arrays.
# The subset size lives in one named constant so both arrays always stay aligned.
# NOTE(review): the subset is taken from the *test* split (vid_test / cap_test) --
# confirm this is intended, since these arrays are what the model is later fit on.
N_SAMPLES = 500
vid = np.array(vid_test[:N_SAMPLES])
cap = np.array(cap_test[:N_SAMPLES])

In [None]:
# Rank and shape of the video-feature array, then of the caption array.
for subset in (vid, cap):
    print(subset.ndim, subset.shape)

3 (500, 80, 4096)
2 (500, 89)


In [None]:
# Sizes of axes 1 and 2 of `vid`, i.e. everything after the leading sample axis.
vid.shape[1], vid.shape[2]

(80, 4096)

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, LSTM, Dense

def basic_enc_dec(n_input, n_output, n_units):
    """Build an LSTM encoder-decoder plus its two inference-time sub-models.

    All three models share the same layer objects (encoder LSTM, decoder LSTM,
    softmax Dense), so weights learned through the training model are the ones
    used by the inference models.

    Args:
        n_input: size of the last axis of the encoder input sequence.
        n_output: size of the last axis of the decoder input sequence; also the
            number of units in the softmax output layer.
        n_units: number of units in both the encoder and the decoder LSTM.

    Returns:
        A tuple ``(model, encoder_model, decoder_model)``:
        ``model`` maps ``[encoder_input, decoder_input]`` to the per-step
        softmax output; ``encoder_model`` maps the encoder input to the
        ``[h, c]`` state pair; ``decoder_model`` maps
        ``[decoder_input, h, c]`` to ``[softmax_output, h, c]``.
    """
    # ---- training graph: encoder -------------------------------------------
    enc_in = Input(shape=(None, n_input))
    enc_rnn = LSTM(n_units, return_state=True)
    # The encoder's own output is discarded; only its final states are kept.
    _, enc_h, enc_c = enc_rnn(enc_in)
    enc_state = [enc_h, enc_c]

    # ---- training graph: decoder, seeded with the encoder states ------------
    dec_in = Input(shape=(None, n_output))
    dec_rnn = LSTM(n_units, return_sequences=True, return_state=True)
    train_seq, _, _ = dec_rnn(dec_in, initial_state=enc_state)
    project = Dense(n_output, activation='softmax')
    model = Model([enc_in, dec_in], project(train_seq))

    # ---- inference encoder: input sequence -> [h, c] -------------------------
    encoder_model = Model(enc_in, enc_state)

    # ---- inference decoder: states are fed in and returned explicitly --------
    state_in_h = Input(shape=(n_units,))
    state_in_c = Input(shape=(n_units,))
    state_in = [state_in_h, state_in_c]
    step_seq, step_h, step_c = dec_rnn(dec_in, initial_state=state_in)
    decoder_model = Model(
        [dec_in] + state_in,
        [project(step_seq)] + [step_h, step_c],
    )

    return model, encoder_model, decoder_model

In [None]:
# Number of classes used when one-hot encoding caption tokens.
vocab_size = 11161
# Embedding width; not referenced by any of the cells visible here.
dim_embedding = 64
# Passed to basic_enc_dec as its `n_units` argument when the model is built.
maxlen = 80

In [None]:
# Build the training model together with the inference encoder / decoder.
# NOTE(review): `maxlen` is supplied as `n_units`, i.e. it sets the LSTM state
# size rather than a sequence length -- confirm this is intended.
model, enc, dec = basic_enc_dec(n_input=4096, n_output=vocab_size, n_units=maxlen)

In [None]:
# Print the layer-by-layer summary of the training model.
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 4096)] 0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 11161) 0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 80), (None,  1336640     input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 80), ( 3597440     input_2[0][0]                    
                                                                 lstm[0][1]            

In [None]:
# Show a single encoded caption instead of dumping the whole 2-D caption array.
cap[0]

array([  2,  25,   4, 469, 528,   3,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [None]:

# Decoder input = captions shifted right by one step: prepend a column of zeros
# and drop the last column, so that x2[:, t] == cap[:, t - 1] and x2[:, 0] == 0.
# The row count and dtype are taken from `cap` itself, so this keeps working if
# the subset size changes and the token ids stay integers.
start_col = np.zeros((cap.shape[0], 1), dtype=cap.dtype)
x2 = np.hstack([start_col, cap])[:, :-1]

In [None]:
# One-hot encode the decoder inputs and the targets:
# (samples, timesteps) -> (samples, timesteps, vocab_size).
# Imported from tensorflow.keras to match the rest of the notebook; the old
# `keras.utils.np_utils` module path is private and newer Keras releases no
# longer provide it.
from tensorflow.keras.utils import to_categorical

x2_in = to_categorical(x2, num_classes=vocab_size)
outputs = to_categorical(cap, num_classes=vocab_size)
print(x2_in.shape, outputs.shape)

(500, 89, 11161) (500, 89, 11161)


In [None]:
from tensorflow.keras.optimizers import RMSprop

# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# newer Keras releases reject it.
model.compile(optimizer=RMSprop(learning_rate=7e-4), loss='categorical_crossentropy')
# Inputs: video features plus the right-shifted one-hot captions; targets: one-hot captions.
history = model.fit([vid, x2_in], outputs, epochs=1)

In [None]:
# Per-epoch values recorded by the `model.fit` call above, keyed by metric name.
history.history

{'loss': [4.78618049621582]}