In [1]:
import tensorflow as tf
import numpy as np
import os

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
from model_builder import ModelBuilder
from model_saver import ModelSaver, model_trainer_from_checkpoint, model_predictor_from_checkpoint
from model_trainer import ModelTrainer
from model_predictor import ModelPredictor

In [3]:
def reset_graph(seed=42):
    """Start from an empty default TensorFlow graph with seeded RNGs.

    Args:
        seed: Integer used to seed both NumPy's global random generator and
            TensorFlow's random seed. Defaults to 42.
    """
    # NumPy's global RNG is independent of the TensorFlow graph state.
    np.random.seed(seed)

    # Reset first, then seed: the TensorFlow seed is set after the default
    # graph has been replaced.
    tf.reset_default_graph()
    tf.set_random_seed(seed)

## Dataset building tools

We choose the local path corresponding to our execution environment:
- '.' for local execution
- '/content/drive/My Drive' for colab execution

In [4]:
# Root of the execution environment: '.' for a local run,
# '/content/drive/My Drive' when running on Colab.
local_path = '.'
# Temporary directory path; not referenced by any cell visible in this notebook excerpt.
tmp_path = '/tmp'

In [5]:
# --- Everything lives under <local_path>/data -------------------------------
data_path = os.path.join(local_path, 'data')
inception_model_path = os.path.join(data_path, 'inceptionv3.h5')
checkpoints_path_folder = os.path.join(data_path, 'checkpoints')

# --- MSR-VTT dataset files --------------------------------------------------
dataset_path = os.path.join(data_path, 'MSR-VTT')

# Videos: zipped archive and folder path.
video_path_zip = os.path.join(dataset_path, 'train-video.zip')
video_path_folder = os.path.join(dataset_path, 'train-video')

# Folder handed to VideoPreProcessor as the features location.
video_numpy_features_folder = os.path.join(dataset_path, 'features')

# Annotations: zipped archive and the JSON file name.
json_path_zip = os.path.join(dataset_path, 'train_2017.zip')
json_file_name = 'videodatainfo_2017.json'

## Preprocessing videos

Creates a MsrVttSelector and a MsrVttExtractor.

  - MsrVttSelector is used to select videos among categories and to retrieve their captions
  - MsrVttExtractor is used to extract frames from a video or to extract its features through the InceptionV3 model

In [6]:
import msr_vtt_extractor
import msr_vtt_selector

from msr_vtt_selector import MsrVttSelector, Category
from msr_vtt_extractor import MsrVttExtractor

# The selector is built from the zipped annotation archive (json_path_zip).
selector = MsrVttSelector(json_path_zip)
# The extractor is given the zipped videos and the path to the InceptionV3 model file.
extractor = MsrVttExtractor(video_path_zip, inception_model_path)

Instructions for updating:
Colocations handled automatically by placer.


#### n_frames ####

We have to choose the number of frames we will store for all videos. This number should be standardized, so we can easily train on our dataset without having to worry about categories.

For the whole dataset, we choose to select the mean number of frames of all videos. Small videos will be pre-padded with zeroes while big videos will be cropped randomly.

In [7]:
def get_n_frames_mean(selector, extractor):
    """Return the rounded mean frame count over all videos of the selector.

    Args:
        selector: Object exposing ``select_videos(limit, categories, random)``.
            It is queried with no limit, no category filter and
            ``random=False``.
        extractor: Object exposing ``get_n_frames(videos)``; it is called with
            the selector's result and must return one frame count per video.

    Returns:
        int: Mean of the frame counts, rounded with ``np.round`` (which rounds
        exact halves to the nearest even integer).

    Raises:
        ValueError: If no frame counts are returned. Without this check the
            mean of an empty sequence is NaN and the integer conversion fails
            with an unrelated-looking error.
    """
    all_videos = selector.select_videos(limit=None, categories=None, random=False)
    frame_counts = np.asarray(extractor.get_n_frames(all_videos))

    if frame_counts.size == 0:
        raise ValueError("Cannot compute the mean number of frames: no videos were selected")

    return int(np.round(frame_counts.mean()))

## Retrieving preprocessed data

Use `pre_processor.video_retriever_generator` to retrieve the preprocessed data.

In [8]:
from video_pre_processor import VideoPreProcessor

# Combines the selector and extractor with the features folder.
# NOTE(review): presumably per-video features are stored there as NumPy files
# (inferred from the variable name) — confirm in video_pre_processor.
pre_processor = VideoPreProcessor(selector, extractor, video_numpy_features_folder)

# Training a new model

### Data selection

In [8]:
# --- Selection settings -----------------------------------------------------
categories = [Category.SPORT]
n_captions_per_video = 2
test_ratio = 0.2

# Hard-coded frame count: skips the iteration over all the videos that
# get_n_frames_mean() would perform.
n_frames = 408

# --- Pick the videos --------------------------------------------------------
videos_names = selector.select_videos(limit=20, categories=categories)
n_videos = len(videos_names)

# --- Train / test split -----------------------------------------------------
# The train share is rounded to a whole number of videos; the rest is test.
n_train_videos = int(np.round((1.0 - test_ratio) * n_videos))
n_test_videos = n_videos - n_train_videos

# The first n_test_videos names form the test set, the remainder the train set.
test_videos_names, train_videos_names = (
    videos_names[:n_test_videos],
    videos_names[n_test_videos:],
)

### Defining model parameters

In [9]:
# Values forwarded to ModelBuilder.create_model in the model-creation cell.

# Recurrent network sizes.
enc_units = dec_units = 128
rnn_layers = 2

# Embedding size.
embedding_dims = 512

# Optimisation / regularisation.
learning_rate = 0.001
dropout_rate = 0.2

### Model creation

In [10]:
# Batch size forwarded to prepare_training below.
batch_size = 32

# Fresh default graph and fixed seeds before the model is built.
reset_graph()

# The builder receives the train/test video names, the captions-per-video and
# frame counts, the pre-processor's video generator, and the selector/extractor.
model_builder = ModelBuilder(train_videos_names, test_videos_names, 
                             n_captions_per_video, n_frames, 
                             pre_processor.video_retriever_generator,
                             selector, extractor)

# Build the model from the hyperparameters defined in the previous cell.
model_builder.create_model(enc_units, dec_units, rnn_layers, embedding_dims,
                           learning_rate, dropout_rate)

# NOTE(review): shuffle=False presumably keeps the sample order fixed across
# epochs — confirm in ModelBuilder.prepare_training.
model_builder.prepare_training(batch_size, shuffle=False)

Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, use
    tf.py_function, which takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, w

### Model training

In [14]:
# Prefix handed to ModelSaver.from_generated_filename; the recorded output
# shows a generated checkpoint name beginning with "test_".
test_prefix = "test"
model_saver = ModelSaver.from_generated_filename(model_builder, checkpoints_path_folder, test_prefix)

model_trainer = ModelTrainer(model_saver, model_builder)
# The recorded output reports progress as "Epoch k/10" for this call.
model_trainer.train(10)

File will be saved in file ./data/checkpoints/test_20_2_32_2019-09-03_16:00:19 at each progression
Epoch 1/10 ; Batch loss: 5.266677 ; Best loss: 5.266677 ; Batch accuracy: 00.00% ; Test accuracy: 14.58% ; Time: 7s
Epoch 2/10 ; Batch loss: 5.014540 ; Best loss: 5.014540 ; Batch accuracy: 09.18% ; Test accuracy: 18.75% ; Time: 5s
Epoch 3/10 ; Batch loss: 4.812503 ; Best loss: 4.812503 ; Batch accuracy: 11.72% ; Test accuracy: 14.58% ; Time: 5s
Epoch 4/10 ; Batch loss: 4.608752 ; Best loss: 4.608752 ; Batch accuracy: 11.72% ; Test accuracy: 16.67% ; Time: 5s
Epoch 5/10 ; Batch loss: 4.479976 ; Best loss: 4.479976 ; Batch accuracy: 12.50% ; Test accuracy: 16.67% ; Time: 5s
Epoch 6/10 ; Batch loss: 4.438438 ; Best loss: 4.438438 ; Batch accuracy: 12.30% ; Test accuracy: 16.67% ; Time: 5s
Epoch 7/10 ; Batch loss: 4.344067 ; Best loss: 4.344067 ; Batch accuracy: 12.30% ; Test accuracy: 16.67% ; Time: 5s
Epoch 8/10 ; Batch loss: 4.264196 ; Best loss: 4.264196 ; Batch accuracy: 12.30% ; Test a

# Recovering a model

In [9]:
# Checkpoint name as printed by the training run above
# ("File will be saved in file ./data/checkpoints/...").
checkpoint_name = "test_20_2_32_2019-09-03_16:00:19"
checkpoint_file = os.path.join(checkpoints_path_folder, checkpoint_name)

## Training

In [14]:
# Start from a clean graph before rebuilding the model from the checkpoint.
reset_graph()

# Recreate a trainer from the saved checkpoint file.
mt = model_trainer_from_checkpoint(checkpoint_file, pre_processor.video_retriever_generator, selector, extractor)

# NOTE(review): the recorded output resumes at "Epoch 11/20", so the argument
# appears to be the epoch to reach rather than a number of additional epochs —
# confirm in ModelTrainer.train.
mt.train(20)

INFO:tensorflow:Restoring parameters from ./data/checkpoints/test_20_2_32_2019-09-03_16:00:19
File will be saved in file ./data/checkpoints/test_20_2_32_2019-09-03_16:00:19 at each progression
Epoch 11/20 ; Batch loss: 4.073833 ; Best loss: 4.073833 ; Batch accuracy: 13.09% ; Test accuracy: 16.67% ; Time: 6s
Epoch 12/20 ; Batch loss: 4.005154 ; Best loss: 4.005154 ; Batch accuracy: 13.09% ; Test accuracy: 16.67% ; Time: 5s
Epoch 13/20 ; Batch loss: 3.938041 ; Best loss: 3.938041 ; Batch accuracy: 13.28% ; Test accuracy: 16.67% ; Time: 5s
Epoch 14/20 ; Batch loss: 3.862349 ; Best loss: 3.862349 ; Batch accuracy: 13.87% ; Test accuracy: 16.67% ; Time: 5s


KeyboardInterrupt: 

## Inference

In [11]:
# Start from a clean graph before rebuilding the model from the checkpoint.
reset_graph()

# Recreate a predictor from the saved checkpoint file.
mp = model_predictor_from_checkpoint(checkpoint_file, pre_processor.video_retriever_generator, selector, extractor)

# NOTE(review): the meaning of the positional boolean is not visible here; the
# recorded output prints an accuracy followed by truth/prediction pairs per
# video — confirm the flag's purpose in ModelPredictor.predict.
mp.predict(True)

INFO:tensorflow:Restoring parameters from ./data/checkpoints/test_20_2_32_2019-09-03_16:00:19
Accuracy 16.67%
---------------
video8
Truth:  a bald guy talks and swivels in a chair 
Predictions:  a a a a the the 
---------------
video13
Truth:  a crazy man interacts with abraham lincoln 
Predictions:  a a a a a a 
---------------
video19
Truth:  two men during a tenis match the commentator is speaking french 
Predictions:  a a a a a 
---------------
video21
Truth:  there is a man is talking about a product 
Predictions:  a a a running running running 
