# Hands-on

## Setup

In [1]:
# ffmpeg - works with video, audio, and streaming media files
!sudo apt install -y ffmpeg
# mediapy - dev dependency; displays videos/images in the notebook
!pip install -q mediapy
# Swap the installed opencv-python-headless for a release below 4.3.
# NOTE(review): the reason for the <4.3 pin is not stated in this notebook, and pip
# reports conflicts with albumentations / qudida (both require a 4.x release) -
# confirm the pin is still needed.
!pip uninstall -q -y opencv-python-headless
!pip install -q "opencv-python-headless<4.3"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 1.3.1 requires opencv-python-headless>=4.1.1, but you have opencv-python-headless 3.4.18.65 which is incompatible.
qudida 0.0.4 requires opencv-python-headless>=4.0.1, but you have opencv-python-headless 3.4.18.65 which is incompatible.[0m[31m
[0m

In [2]:
# Import libraries
import pathlib

import matplotlib as mpl
import matplotlib.pyplot as plt
import mediapy as media
import numpy as np
import PIL

import tensorflow as tf
import tensorflow_hub as hub
import tqdm

# Default font size for every matplotlib figure drawn in this notebook
mpl.rcParams['font.size'] = 10

In [3]:
# Fetch the Kinetics-600 label file (one label per line) and preview the first 20 labels
labels_path = pathlib.Path(
    tf.keras.utils.get_file(
        fname="labels.txt",
        origin="https://raw.githubusercontent.com/tensorflow/models/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/kinetics_600_labels.txt",
    )
)

# Strip surrounding whitespace from every line before building the lookup array
lines = labels_path.read_text().splitlines()
KINETICS_600_LABELS = np.array(list(map(str.strip, lines)))
KINETICS_600_LABELS[:20]

Downloading data from https://raw.githubusercontent.com/tensorflow/models/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/kinetics_600_labels.txt


array(['abseiling', 'acting in play', 'adjusting glasses', 'air drumming',
       'alligator wrestling', 'answering questions', 'applauding',
       'applying cream', 'archaeological excavation', 'archery',
       'arguing', 'arm wrestling', 'arranging flowers',
       'assembling bicycle', 'assembling computer',
       'attending conference', 'auctioning', 'backflip (human)',
       'baking cookies', 'bandaging'], dtype='<U49')

In [4]:
# Sample clip used throughout the notebook. cache_dir and cache_subdir are both '.',
# so the download is stored relative to the current working directory.
cartwheel_url = 'https://media1.giphy.com/media/v1.Y2lkPTc5MGI3NjExZHoxc3JlZjA3bTg0aTZzNzBmdnk0enNpdm1haTVxajZmMDRpNGN4cyZlcD12MV9pbnRlcm5hbF9naWZfYnlfaWQmY3Q9Zw/IhsEJ88BJ3MQl9UAoZ/giphy.gif'

cartwheel_path = tf.keras.utils.get_file(
    origin=cartwheel_url,
    fname='cartwheel.gif',
    cache_subdir='.',
    cache_dir='.',
)

Downloading data from https://media1.giphy.com/media/v1.Y2lkPTc5MGI3NjExZHoxc3JlZjA3bTg0aTZzNzBmdnk0enNpdm1haTVxajZmMDRpNGN4cyZlcD12MV9pbnRlcm5hbF9naWZfYnlfaWQmY3Q9Zw/IhsEJ88BJ3MQl9UAoZ/giphy.gif


In [5]:
# @title Loading a gif
def load_gif(file_path, image_size=(224, 224)):
    """Read a GIF file and return its frames as a float32 tensor.

    Args:
        file_path: Path of the GIF file to read.
        image_size: Target size handed to `tf.image.resize` for every frame.

    Returns:
        The decoded frames, resized to `image_size`, cast to float32 and
        divided by 255.
    """
    frames = tf.io.decode_gif(tf.io.read_file(file_path))
    resized = tf.image.resize(frames, image_size)
    # Hub models expect float32 images normalized to [0, 1]
    return tf.cast(resized, tf.float32) / 255

In [6]:
# Load the downloaded clip with load_gif and display the shape of the resulting tensor
cartwheel = load_gif(cartwheel_path)
cartwheel.shape

TensorShape([76, 224, 224, 3])

In [7]:
# Preview the clip inline, played back at 5 frames per second
media.show_video(cartwheel.numpy(), fps=5)

0
This browser does not support the video tag.


## How to use the model

### Base model

In [8]:
%%time
# Pick the MoViNet variant ('base' mode) and load it from TF Hub.
# NOTE: `id` shadows the Python builtin of the same name from here on.
id, mode, version = 'a2', 'base', '3'
hub_url = f'https://tfhub.dev/tensorflow/movinet/{id}/{mode}/kinetics-600/classification/{version}'
model = hub.load(hub_url)

CPU times: user 25.7 s, sys: 1.25 s, total: 27 s
Wall time: 35.9 s


In [9]:
# Look up the model's 'serving_default' signature and print its inputs and outputs
sig = model.signatures['serving_default']
print(sig.pretty_printed_signature())

Input Parameters:
  image (KEYWORD_ONLY): TensorSpec(shape=(None, None, None, None, 3), dtype=tf.float32, name='image')
Output Type:
  Dict[['classifier_head', TensorSpec(shape=(None, 600), dtype=tf.float32, name='classifier_head')]]
Captures:
  133628456388560: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133628625100240: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133628625099888: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133628625099184: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133628625099536: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133628456387856: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133628625098832: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133628625097776: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133628625098128: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133628625098480: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133628456388208: TensorSpec(shape=(), dtype=tf.resourc

In [10]:
# Warm-up: call the signature once on a single frame.
# tf.newaxis adds the outer batch dimension; :1 keeps only the first frame.
# NOTE(review): the returned logits are echoed as cell output and are not used by the visible cells.
sig(image = cartwheel[tf.newaxis, :1])

{'classifier_head': <tf.Tensor: shape=(1, 600), dtype=float32, numpy=
 array([[ 4.10832852e-01, -1.75833368e+00,  7.21597135e-01,
         -1.99860299e+00, -3.28061581e+00,  2.11836314e+00,
          4.08527613e+00,  2.53223634e+00,  2.31717277e+00,
         -1.90893590e+00,  1.77989674e+00,  1.36765051e+00,
         -1.55218279e+00,  6.24242961e-01, -1.41566801e+00,
          1.99393201e+00,  1.92371225e+00, -5.23871899e-01,
         -4.85216141e-01, -8.07527125e-01, -1.82610130e+00,
         -4.09451008e+00,  1.52345586e+00, -1.96385741e-01,
          2.55342931e-01, -3.34995836e-01, -1.60672709e-01,
         -4.49170637e+00,  1.37373209e+00, -3.67361474e+00,
          1.58103204e+00, -2.12662384e-01, -6.87839329e-01,
          1.62189794e+00, -1.63132775e+00, -2.17364120e+00,
          3.14513493e+00, -7.07009315e-01,  1.70156753e+00,
         -2.57661968e-01, -2.05417585e+00,  3.04886603e+00,
         -1.49553049e+00, -1.81551540e+00, -1.62883198e+00,
         -1.95249343e+00, -1.8

In [11]:
%%time
# Classify the whole clip in one call (batch axis added with tf.newaxis), then keep
# the 'classifier_head' logits of the single batch item.
logits = sig(image=cartwheel[tf.newaxis, ...])['classifier_head'][0]

# Show the logits shape followed by one blank line
print(logits.shape, end='\n\n')

(600,)

CPU times: user 3min 54s, sys: 2.32 s, total: 3min 57s
Wall time: 2min 29s


In [12]:
#@title Get top k probabilities from predictions
def get_top_k(probs, k=5, label_map=KINETICS_600_LABELS):
    """Return the k highest-probability classes as (label, probability) pairs.

    Args:
        probs: Class probabilities; the visible callers pass a 1-D tensor.
        k: Number of top entries to return.
        label_map: Class names indexed by class id.

    Returns:
        A tuple of (label, probability) pairs, highest probability first.
    """
    # Class indices ordered from most to least probable, truncated to k
    ranked = tf.argsort(probs, axis=-1, direction='DESCENDING')
    top_idx = ranked[:k]
    # tf.gather yields byte strings here, so decode each label back to str
    names = [
        raw.decode('utf-8')
        for raw in tf.gather(label_map, top_idx, axis=-1).numpy()
    ]
    scores = tf.gather(probs, top_idx, axis=-1).numpy()
    return tuple(zip(names, scores))


In [13]:
# Convert the logits to probabilities and print the top entries
# (get_top_k is called with its default k), labels centered in a 20-character column
probs = tf.nn.softmax(logits, axis=-1)
for label, p in get_top_k(probs):
    print(f'{label:^20s}: {p:.3f}')

    cartwheeling    : 0.841
gymnastics tumbling : 0.046
 standing on hands  : 0.021
   somersaulting    : 0.014
      capoeira      : 0.012


In [14]:
# Replay the same clip (identical call to the earlier preview cell)
media.show_video(cartwheel.numpy(), fps=5)

0
This browser does not support the video tag.


In [15]:
# NOTE(review): `temp` is not referenced by any later cell visible here - candidate for removal
temp = cartwheel[tf.newaxis, ...]
# Length of the clip tensor's first axis
len(cartwheel)

76

### Stream mode

In [16]:
%%time
# Same MoViNet variant as before, but in 'stream' mode; this rebinds `model`.
# NOTE: `id` shadows the Python builtin of the same name.
id, mode, version = 'a2', 'stream', '3'
hub_url = f'https://tfhub.dev/tensorflow/movinet/{id}/{mode}/kinetics-600/classification/{version}'
model = hub.load(hub_url)



CPU times: user 58.6 s, sys: 3.17 s, total: 1min 1s
Wall time: 1min 2s


In [17]:
# List the names of the signatures exposed by the streaming model
list(model.signatures.keys())

['call', 'init_states']

In [18]:
# Print the 'init_states' signature, one entry per line.
# NOTE(review): the ".\n" separator inserts a literal "." at every line break
# (visible in the output) - confirm this is intended rather than a plain "\n".
# NOTE(review): `lines` rebinds the name that earlier held the label-file lines.
lines = model.signatures['init_states'].pretty_printed_signature().splitlines()
print(".\n".join(lines))

Input Parameters:.
  input_shape (KEYWORD_ONLY): TensorSpec(shape=(5,), dtype=tf.int32, name='input_shape').
Output Type:.
  Dict[['state/b4/l5/pool_frame_count', TensorSpec(shape=(1,), dtype=tf.int32, name='state/b4/l5/pool_frame_count')], ['state/b2/l3/pool_buffer', TensorSpec(shape=(None, 1, 1, 1, 192), dtype=tf.float32, name='state/b2/l3/pool_buffer')], ['state/b1/l0/pool_frame_count', TensorSpec(shape=(1,), dtype=tf.int32, name='state/b1/l0/pool_frame_count')], ['state/b1/l3/stream_buffer', TensorSpec(shape=(None, 2, None, None, 96), dtype=tf.float32, name='state/b1/l3/stream_buffer')], ['state/b4/l2/pool_frame_count', TensorSpec(shape=(1,), dtype=tf.int32, name='state/b4/l2/pool_frame_count')], ['state/b2/l3/stream_buffer', TensorSpec(shape=(None, 2, None, None, 192), dtype=tf.float32, name='state/b2/l3/stream_buffer')], ['state/b3/l4/pool_buffer', TensorSpec(shape=(None, 1, 1, 1, 144), dtype=tf.float32, name='state/b3/l4/pool_buffer')], ['state/b4/l5/stream_buffer', TensorSpec(s

In [19]:
# Build the initial streaming state from the shape of the batched clip
initial_state = model.init_states(cartwheel[tf.newaxis, ...].shape)

In [20]:
# Inspect the container type returned by init_states
type(initial_state)

dict

In [21]:
# First 11 state names in alphabetical order (sorted() already returns a list)
sorted(initial_state)[:11]

['state/b0/l0/pool_buffer',
 'state/b0/l0/pool_frame_count',
 'state/b0/l1/pool_buffer',
 'state/b0/l1/pool_frame_count',
 'state/b0/l1/stream_buffer',
 'state/b0/l2/pool_buffer',
 'state/b0/l2/pool_frame_count',
 'state/b0/l2/stream_buffer',
 'state/b1/l0/pool_buffer',
 'state/b1/l0/pool_frame_count',
 'state/b1/l0/stream_buffer']

In [22]:
# Model input: a copy of every initial state entry plus the first frame
# (batch axis added) under the 'image' key
inputs = {**initial_state, 'image': cartwheel[tf.newaxis, :1, ...]}

In [23]:
# Warmup: call the streaming model once on the first-frame inputs.
# The returned value is echoed as cell output.
model(inputs)

(<tf.Tensor: shape=(1, 600), dtype=float32, numpy=
 array([[-5.4424059e-01, -1.4139214e+00, -9.4616842e-01,  3.0706948e-01,
         -3.0820560e+00, -1.2935945e+00, -1.8437758e-02,  7.9541296e-01,
         -2.7010844e+00, -3.9414942e-01, -1.7788147e+00,  3.4326550e-01,
          3.6524820e-01, -2.6189642e+00,  1.7339330e+00,  4.4522768e-01,
         -1.0959081e-01,  1.7278574e+00,  3.3872125e-01,  2.9968151e-01,
          1.5009127e+00,  9.3883801e-01, -8.0360019e-01,  1.0193111e+00,
          9.4160032e-01, -4.6122795e-01, -3.1735560e-01, -1.3849061e+00,
          6.3721669e-01,  1.7818847e+00, -2.1413667e+00, -3.1612298e+00,
         -5.5048782e-01,  9.2964059e-01,  2.1252563e+00, -1.4527906e-01,
         -1.7507973e+00,  2.1174765e-01, -3.9273128e-01, -3.6453104e+00,
          4.0273142e+00, -5.4673725e-01, -1.6316873e+00, -6.2158716e-01,
         -8.5247022e-01, -1.0556502e+00,  5.9268236e-01,  9.5503521e-01,
         -6.1570776e-01, -8.4954834e-01, -4.4800627e-01,  5.4065812e-01,


In [24]:
# Predictions with probabilities after first frame.
# The model call returns the logits together with an updated state.
logits, new_state = model(inputs)
# Keep the logits of the single batch item
logits = logits[0]
probs = tf.nn.softmax(logits, axis=-1)

# Print the top entries (get_top_k default k), then a blank line
for label, prob in get_top_k(probs):
    print(f'{label:^20s}: {prob:.3f}')
print()

    riding camel    : 0.160
    clam digging    : 0.130
    cartwheeling    : 0.125
building sandcastle : 0.114
  walking the dog   : 0.087



In [25]:
# Running in loop passing new frames and updated states
%%time
state = initial_state
all_logits = [] # History

frames = tf.split(cartwheel[tf.newaxis], cartwheel.shape[0], axis=1)

for frame in tqdm.tqdm(frames):
    logits, state = model({**state, 'image': frame})
    all_logits.append(logits)

probabilities = tf.nn.softmax(all_logits, axis=-1)

100%|██████████| 76/76 [00:07<00:00, 10.39it/s]

CPU times: user 9.11 s, sys: 524 ms, total: 9.63 s
Wall time: 7.38 s





In [26]:
# Top predictions after the final frame: [-1] selects the last frame's
# probabilities, [0] the single batch item. Labels are right-aligned to 20 characters.
for label, prob in get_top_k(probabilities[-1][0]):
    print(f'{label:>20s}: {prob:.3f}')

        cartwheeling: 0.400
 gymnastics tumbling: 0.357
          contorting: 0.026
       somersaulting: 0.022
   standing on hands: 0.018
