https://www.tensorflow.org/hub/tutorials/movinet


!sudo apt install -y ffmpeg

!pip install -q mediapy

!pip uninstall -q -y opencv-python-headless

!pip install -q "opencv-python-headless<4.3"

In [2]:
# Import libraries
import pathlib
import cv2

import matplotlib as mpl
import matplotlib.pyplot as plt
import mediapy as media
import numpy as np
import PIL

import tensorflow as tf
import tensorflow_hub as hub
import tqdm

mpl.rcParams.update({
    'font.size': 10,
})

In [4]:
labels_path = tf.keras.utils.get_file(
    fname='labels.txt',
    origin='https://raw.githubusercontent.com/tensorflow/models/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/kinetics_600_labels.txt'
)
labels_path = pathlib.Path(labels_path)

lines = labels_path.read_text().splitlines()
KINETICS_600_LABELS = np.array([line.strip() for line in lines])
KINETICS_600_LABELS[:20]

array(['abseiling', 'acting in play', 'adjusting glasses', 'air drumming',
       'alligator wrestling', 'answering questions', 'applauding',
       'applying cream', 'archaeological excavation', 'archery',
       'arguing', 'arm wrestling', 'arranging flowers',
       'assembling bicycle', 'assembling computer',
       'attending conference', 'auctioning', 'backflip (human)',
       'baking cookies', 'bandaging'], dtype='<U49')

In [5]:
jumpingjack_url = 'https://github.com/tensorflow/models/raw/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/jumpingjack.gif'
jumpingjack_path = tf.keras.utils.get_file(
    fname='jumpingjack.gif',
    origin=jumpingjack_url,
    cache_dir='.', cache_subdir='.',
)

In [19]:
# Read and process a video
def load_gif(file_path, image_size=(224, 224)):
  """Loads a gif file into a TF tensor.

  Use images resized to match what's expected by your model.
  The model pages say the "A2" models expect 224 x 224 images at 5 fps

  Args:
    file_path: path to the location of a gif file.
    image_size: a tuple of target size.

  Returns:
    a video of the gif file
  """
  # Load a gif file, convert it to a TF tensor
  raw = tf.io.read_file(file_path)
  video = tf.io.decode_gif(raw)
  # Resize the video
  video = tf.image.resize(video, image_size)
  # change dtype to a float32
  # Hub models always want images normalized to [0,1]
  # ref: https://www.tensorflow.org/hub/common_signatures/images#input
  video = tf.cast(video, tf.float32) / 255.
  return video

In [55]:
import json
with open('bite_frame_indexes.json') as f:
    bite_frame_indexes = json.load(f)
bite_frame_indexes

[508, 947, 1201, 1578, 1959, 2492, 2881, 3136, 3377, 3727, 5149, 6310, 6848]

In [60]:
from extract_face import extract_face

def preprocess_frame(frame):
    # Resize the frame to the input size expected by the MoViNet model
    # frame = cv2.resize(frame, (224, 224))  # Adjust size if your model uses different dimensions
    # # Normalize the frame (0-255 to 0-1)
    try:
        frame = extract_face(frame)
        frame = frame / 255.0
        # frame = frame[0:720, 280:1000]
        frame = cv2.resize(frame, (224, 224))
        return 1 - frame
    except:
        return np.zeros((224, 224, 3))

def load_bite_clip():
    cap = cv2.VideoCapture('raw_session.mp4')
    frames = {}
    frame_number = -1

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_number += 1
        if frame_number % 500 == 0:
            print(frame_number)
        if frame_number % 6 != 0:  # Expects 5fps, source is 30, skip every 6
            continue
        # if frame_number < 1050 or frame_number % 6 != 0:
        #     continue
        # if frame_number > 1700:
        #     break

        frame = cv2.flip(frame, 1)
        processed_frame = preprocess_frame(frame)
        frames[frame_number] = processed_frame

    cap.release()

    return frames

bite_clip_all_frames = load_bite_clip()

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500


In [61]:
frames_per_clip = 6
frame_step = 6

bites = []
non_bites = []
current_clip = []
for i, frame in bite_clip_all_frames.items():
    if len(current_clip) < frames_per_clip:
        current_clip.append(frame)  # Collect frames for the current clip
    else:  # If the current clip is full, add it to the list of clips and start a new clip
        first_index = i - (frames_per_clip * frame_step)
        last_index = i - 1
        is_bite = False
        for bite_frame_index in bite_frame_indexes:
            if first_index <= bite_frame_index <= last_index:
                is_bite = True
                break
        if is_bite:
            bites.append(tf.cast(current_clip, tf.float32))
        else:
            non_bites.append(tf.cast(current_clip, tf.float32))
        current_clip = [frame]

print('Bites:', len(bites))
print('Non bites:', len(non_bites))

train_bites = bites[:int(len(bites) * 0.8)]
train_non_bites = non_bites[:int(len(non_bites) * 0.8)]
test_bites = bites[int(len(bites) * 0.8):]
test_non_bites = non_bites[int(len(non_bites) * 0.8):]

print('Train bites:', len(train_bites))
print('Train non bites:', len(train_non_bites))
print('Test bites:', len(test_bites))
print('Test non bites:', len(test_non_bites))

Bites: 13
Non bites: 206
Train bites: 10
Train non bites: 164
Test bites: 3
Test non bites: 42


In [63]:
train_bites[0].numpy().dtype

dtype('float32')

In [40]:
# jumpingjack=load_gif(jumpingjack_path)
# jumpingjack.shape
bite_clip.shape

TensorShape([50, 224, 224, 3])

In [9]:
# @title
# Get top_k labels and probabilities
def get_top_k(probs, k=5, label_map=KINETICS_600_LABELS):
  """Outputs the top k model labels and probabilities on the given video.

  Args:
    probs: probability tensor of shape (num_frames, num_classes) that represents
      the probability of each class on each frame.
    k: the number of top predictions to select.
    label_map: a list of labels to map logit indices to label strings.

  Returns:
    a tuple of the top-k labels and probabilities.
  """
  # Sort predictions to find top_k
  top_predictions = tf.argsort(probs, axis=-1, direction='DESCENDING')[:k]
  # collect the labels of top_k predictions
  top_labels = tf.gather(label_map, top_predictions, axis=-1)
  # decode lablels
  top_labels = [label.decode('utf8') for label in top_labels.numpy()]
  # top_k probabilities of the predictions
  top_probs = tf.gather(probs, top_predictions, axis=-1).numpy()
  return tuple(zip(top_labels, top_probs))

# Base Model

In [85]:
# @title
id = 'a0'
mode = 'base'
version = '3'
hub_url = f'https://tfhub.dev/tensorflow/movinet/{id}/{mode}/kinetics-600/classification/{version}'
model = hub.load(hub_url)

In [86]:
# @title
sig = model.signatures['serving_default']
print(sig.pretty_printed_signature())

signature_wrapper(*, image)
  Args:
    image: float32 Tensor, shape=(None, None, None, None, 3)
  Returns:
    {'classifier_head': <1>}
      <1>: float32 Tensor, shape=(None, 600)


In [87]:
# @title
sig(image = train_bites[0][tf.newaxis, :1])

{'classifier_head': <tf.Tensor: shape=(1, 600), dtype=float32, numpy=
 array([[-1.92758405e+00, -5.88330507e+00, -2.53679007e-01,
         -2.52303123e-01, -5.84325075e+00,  2.47122097e+00,
          5.24489880e+00,  5.84276104e+00,  2.61101818e+00,
          7.57387924e+00, -1.57510221e+00, -3.12703443e+00,
         -6.11972761e+00,  8.09002972e+00,  5.55060744e-01,
         -4.27211332e+00, -6.54242945e+00, -3.32864523e+00,
          1.90738440e+00,  4.52020693e+00, -1.74083507e+00,
         -1.75236440e+00,  2.11266088e+00, -6.73124909e-01,
          3.03988719e+00,  2.37972069e+00,  4.97288179e+00,
          1.02900493e+00,  1.52709949e+00,  3.64214563e+00,
          4.56970453e+00, -3.86290699e-01, -3.78517151e+00,
         -2.77088547e+00,  1.74946249e-01, -3.30913949e+00,
         -3.13091326e+00, -4.54574251e+00, -1.93967581e+00,
         -8.58452320e+00, -2.43540955e+00, -3.12130404e+00,
         -1.21681368e+00,  2.59994054e+00,  5.90099573e-01,
         -5.32514381e+00,  1.2

In [88]:
# @title
logits = sig(image = train_bites[1][tf.newaxis, ...])
logits = logits['classifier_head'][0]

print(logits.shape)
print()

(600,)



In [89]:
# @title
probs = tf.nn.softmax(logits, axis=-1)
for label, p in get_top_k(probs):
  print(f'{label:20s}: {p:.3f}')

blowing bubble gum  : 0.234
beatboxing          : 0.221
chewing gum         : 0.127
sticking tongue out : 0.034
scrambling eggs     : 0.011


# Stream Model

In [50]:
%%time
id = 'a2'
mode = 'stream'
version = '3'
hub_url = f'https://tfhub.dev/tensorflow/movinet/{id}/{mode}/kinetics-600/classification/{version}'
model = hub.load(hub_url)

CPU times: total: 36.7 s
Wall time: 3min 7s


In [11]:
list(model.signatures.keys())

['call', 'init_states']

In [12]:
lines = model.signatures['init_states'].pretty_printed_signature().splitlines()
print('.\n'.join(lines))

signature_wrapper(*, input_shape).
  Args:.
    input_shape: int32 Tensor, shape=(5,).
  Returns:.
    {'state/b0/l0/pool_buffer': <1>, 'state/b0/l0/pool_frame_count': <2>, 'state/b0/l1/pool_buffer': <3>, 'state/b0/l1/pool_frame_count': <4>, 'state/b0/l1/stream_buffer': <5>, 'state/b0/l2/pool_buffer': <6>, 'state/b0/l2/pool_frame_count': <7>, 'state/b0/l2/stream_buffer': <8>, 'state/b1/l0/pool_buffer': <9>, 'state/b1/l0/pool_frame_count': <10>, 'state/b1/l0/stream_buffer': <11>, 'state/b1/l1/pool_buffer': <12>, 'state/b1/l1/pool_frame_count': <13>, 'state/b1/l1/stream_buffer': <14>, 'state/b1/l2/pool_buffer': <15>, 'state/b1/l2/pool_frame_count': <16>, 'state/b1/l2/stream_buffer': <17>, 'state/b1/l3/pool_buffer': <18>, 'state/b1/l3/pool_frame_count': <19>, 'state/b1/l3/stream_buffer': <20>, 'state/b1/l4/pool_buffer': <21>, 'state/b1/l4/pool_frame_count': <22>, 'state/b1/l4/stream_buffer': <23>, 'state/b2/l0/pool_buffer': <24>, 'state/b2/l0/pool_frame_count': <25>, 'state/b2/l0/stream_b

In [26]:
# Get top_k labels and probabilities predicted using MoViNets streaming model
# def get_top_k_streaming_labels(probs, k=5, label_map=KINETICS_600_LABELS):
#   """Returns the top-k labels over an entire video sequence.

#   Args:
#     probs: probability tensor of shape (num_frames, num_classes) that represents
#       the probability of each class on each frame.
#     k: the number of top predictions to select.
#     label_map: a list of labels to map logit indices to label strings.

#   Returns:
#     a tuple of the top-k probabilities, labels, and logit indices
#   """
#   top_categories_last = tf.argsort(probs, -1, 'DESCENDING')[-1, :1]
#   # Sort predictions to find top_k
#   categories = tf.argsort(probs, -1, 'DESCENDING')[:, :k]
#   categories = tf.reshape(categories, [-1])

#   counts = sorted([
#       (i.numpy(), tf.reduce_sum(tf.cast(categories == i, tf.int32)).numpy())
#       for i in tf.unique(categories)[0]
#   ], key=lambda x: x[1], reverse=True)

#   top_probs_idx = tf.constant([i for i, _ in counts[:k]])
#   top_probs_idx = tf.concat([top_categories_last, top_probs_idx], 0)
#   # find unique indices of categories
#   top_probs_idx = tf.unique(top_probs_idx)[0][:k+1]
#   # top_k probabilities of the predictions
#   top_probs = tf.gather(probs, top_probs_idx, axis=-1)
#   top_probs = tf.transpose(top_probs, perm=(1, 0))
#   # collect the labels of top_k predictions
#   top_labels = tf.gather(label_map, top_probs_idx, axis=0)
#   # decode the top_k labels
#   top_labels = [label.decode('utf8') for label in top_labels.numpy()]

#   return top_probs, top_labels, top_probs_idx

def get_top_k_labels(probs, frame_i, k=5, label_map=KINETICS_600_LABELS):
  """Returns the top-k labels over an entire video sequence.

  Args:
    probs: probability tensor of shape (num_frames, num_classes) that represents
      the probability of each class on each frame.
    k: the number of top predictions to select.
    label_map: a list of labels to map logit indices to label strings.

  Returns:
    a tuple of the top-k probabilities, labels, and logit indices
  """
  top_categories_last = tf.argsort(probs, -1, 'DESCENDING')[frame_i, :1]
  # Sort predictions to find top_k
  categories = tf.argsort(probs, -1, 'DESCENDING')[:, :k]
  categories = tf.reshape(categories, [-1])

  counts = sorted([
      (i.numpy(), tf.reduce_sum(tf.cast(categories == i, tf.int32)).numpy())
      for i in tf.unique(categories)[0]
  ], key=lambda x: x[1], reverse=True)

  top_probs_idx = tf.constant([i for i, _ in counts[:k]])
  top_probs_idx = tf.concat([top_categories_last, top_probs_idx], 0)
  # find unique indices of categories
  top_probs_idx = tf.unique(top_probs_idx)[0][:k+1]
  # top_k probabilities of the predictions
  top_probs = tf.gather(probs, top_probs_idx, axis=-1)
  top_probs = tf.transpose(top_probs, perm=(1, 0))
  # collect the labels of top_k predictions
  top_labels = tf.gather(label_map, top_probs_idx, axis=0)
  # decode the top_k labels
  top_labels = [label.decode('utf8') for label in top_labels.numpy()]

  return top_probs, top_labels


# Plot top_k predictions at a given time step
def plot_streaming_top_preds_at_step(
    step=None,
    image=None,
    legend_loc='lower left',
    duration_seconds=10,
    figure_height=500,
    playhead_scale=0.8,
    grid_alpha=0.3):
  """Generates a plot of the top video model predictions at a given time step.

  Args:
    top_probs: a tensor of shape (k, num_frames) representing the top-k
      probabilities over all frames.
    top_labels: a list of length k that represents the top-k label strings.
    step: the current time step in the range [0, num_frames].
    image: the image frame to display at the current time step.
    legend_loc: the placement location of the legend.
    duration_seconds: the total duration of the video.
    figure_height: the output figure height.
    playhead_scale: scale value for the playhead.
    grid_alpha: alpha value for the gridlines.

  Returns:
    A tuple of the output numpy image, figure, and axes.
  """

#   top_probs, top_labels = get_top_k_labels(probs, step)

  # find number of top_k labels and frames in the video
  # Visualize frames and top_k probabilities of streaming video
  fig = plt.figure(figsize=(6.5, 7), dpi=300)
  gs = mpl.gridspec.GridSpec(8, 1)
  ax2 = plt.subplot(gs[:-3, :])
  ax = plt.subplot(gs[-3:, :])
  # display the frame
  if image is not None:
    ax2.imshow(image, interpolation='nearest')
    ax2.axis('off')
  # x-axis (frame number)
#   preview_line_x = tf.linspace(0., duration_seconds, num_frames)
  # y-axis (top_k probabilities)
#   preview_line_y = top_probs

#   line_x = preview_line_x[:step+1]
#   line_y = preview_line_y[:, :step+1]

#   for i in range(num_labels):
#     ax.plot(preview_line_x, preview_line_y[i], label=None, linewidth='1.5',
#             linestyle=':', color='gray')
#     ax.plot(line_x, line_y[i], label=top_labels[i], linewidth='2.0')


#   ax.grid(which='major', linestyle=':', linewidth='1.0', alpha=grid_alpha)
#   ax.grid(which='minor', linestyle=':', linewidth='0.5', alpha=grid_alpha)

#   min_height = tf.reduce_min(top_probs) * playhead_scale
#   max_height = tf.reduce_max(top_probs)
#   ax.vlines(preview_line_x[step], min_height, max_height, colors='red')
#   ax.scatter(preview_line_x[step], max_height, color='red')

#   ax.legend(loc=legend_loc)

#   plt.xlim(0, duration_seconds)
#   plt.ylabel('Probability')
#   plt.xlabel('Time (s)')
#   plt.yscale('log')

  final_probs = probs[step]
  y = 0.8
  for label, p in get_top_k(final_probs):
    # print(f'{label:20s}: {p:.3f}')
    # print(f'{label:20s}', end=', ')
    plt.text(0, y, f'{label:20s}', fontsize=35, color='red')
    y -= 0.2
  plt.text(0, y, f'Frame: {step}', fontsize=35, color='red')

#   print()

#   plt.text(0, 0.8, "Hello, World 1!", fontsize=35, color='red')
#   plt.text(0, 0.6, "Hello, World 2!", fontsize=35, color='red')
#   plt.text(0, 0.4, "Hello, World 3!", fontsize=35, color='red')
#   plt.text(0, 0.2, "Hello, World 4!", fontsize=35, color='red')
#   plt.text(0, 0, "Hello, World!5", fontsize=35, color='red')


  fig.tight_layout()
  fig.canvas.draw()

  data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
  data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
  plt.close()

  figure_width = int(figure_height * data.shape[1] / data.shape[0])
  image = PIL.Image.fromarray(data).resize([figure_width, figure_height])
  image = np.array(image)

  return image

# Plotting top_k predictions from MoViNets streaming model
def plot_streaming_top_preds(
    probs,
    video,
    top_k=5,
    video_fps=25.,
    figure_height=500,
    use_progbar=True):
  """Generates a video plot of the top video model predictions.

  Args:
    probs: probability tensor of shape (num_frames, num_classes) that represents
      the probability of each class on each frame.
    video: the video to display in the plot.
    top_k: the number of top predictions to select.
    video_fps: the input video fps.
    figure_fps: the output video fps.
    figure_height: the height of the output video.
    use_progbar: display a progress bar.

  Returns:
    A numpy array representing the output video.
  """
  # select number of frames per second
  video_fps = 8.
  # select height of the image
  figure_height = 500
  # number of time steps of the given video
  steps = video.shape[0]
  # estimate duration of the video (in seconds)
  duration = steps / video_fps
  # estimate top_k probabilities and corresponding labels
#   top_probs, top_labels, _ = get_top_k_streaming_labels(probs, k=top_k)

  images = []
  step_generator = tqdm.trange(steps) if use_progbar else range(steps)
  for i in step_generator:
    image = plot_streaming_top_preds_at_step(
        step=i,
        image=video[i],
        duration_seconds=duration,
        figure_height=figure_height,
    )
    images.append(image)

  return np.array(images)

# Fine Tuning

In [67]:
import random

batch_size = 8
num_frames = 6

class ClipGenerator:
    def __init__(self, training):
        self.training = training

    def __call__(self):
        if self.training:
            all_clips = [(clip, 0) for clip in train_non_bites] + [(clip, 1) for clip in train_bites]
        else:
            all_clips = [(clip, 0) for clip in test_non_bites] + [(clip, 1) for clip in test_bites]
        random.shuffle(all_clips)
        for clip, label in all_clips:
            yield clip, label


output_signature = (tf.TensorSpec(shape = (None, None, None, 3), dtype = tf.float32),
                    tf.TensorSpec(shape = (), dtype = tf.int16))

train_ds = tf.data.Dataset.from_generator(ClipGenerator(True), output_signature = output_signature)
train_ds = train_ds.batch(batch_size)

test_ds = tf.data.Dataset.from_generator(ClipGenerator(False), output_signature = output_signature)
test_ds = test_ds.batch(batch_size)

In [68]:
for frames, labels in train_ds.take(10):
  print(labels)
print(f"Shape: {frames.shape}")
print(f"Label: {labels.shape}")

tf.Tensor([0 0 0 0 1 0 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 0 0 0 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 0 0 0 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 0 0 0 1 0], shape=(8,), dtype=int16)
tf.Tensor([1 0 0 0 0 1 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 0 0 0 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 0 0 0 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 1 0 0 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 0 0 0 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 1 0 1 0 0 0], shape=(8,), dtype=int16)
Shape: (8, 6, 224, 224, 3)
Label: (8,)


In [69]:

tf.keras.backend.clear_session()

# Skip to here

In [51]:
init_states = model.init_states(bite_clip[tf.newaxis].shape)

In [28]:
# Insert your video clip here
video = 1 - bite_clip
# video = tf.gather(video, indices=[2, 1, 0], axis=-1)
images = tf.split(video[tf.newaxis], video.shape[0], axis=1)

all_logits = []

# To run on a video, pass in one frame at a time
states = init_states
for i, image in enumerate(images):
  print(f'{i}/{len(images)}')
  # predictions for each frame
  if i > 0 and i % 20 == 0:
    states = init_states
  logits, states = model({**states, 'image': image})

  for label, p in get_top_k(logits[0]):
    print(label, end=', ')
  print()

  all_logits.append(logits)

# concatenating all the logits
logits = tf.concat(all_logits, 0)
# estimating probabilities
probs = tf.nn.softmax(logits, axis=-1)

0/75
playing squash or racquetball, scrubbing face, applying cream, lunge, zumba, 
1/75
answering questions, yoga, news anchoring, raising eyebrows, attending conference, 
2/75
answering questions, chewing gum, yoga, news anchoring, burping, 
3/75
chewing gum, answering questions, eating burger, tasting food, eating chips, 
4/75
chewing gum, answering questions, eating burger, tasting food, eating chips, 
5/75
chewing gum, answering questions, eating burger, eating watermelon, tasting food, 
6/75
answering questions, chewing gum, eating burger, eating watermelon, attending conference, 
7/75
answering questions, chewing gum, singing, attending conference, eating watermelon, 
8/75
answering questions, eating watermelon, attending conference, chewing gum, tasting food, 
9/75
eating watermelon, tasting food, sucking lolly, eating chips, eating carrots, 
10/75
tasting food, sucking lolly, eating watermelon, eating carrots, eating ice cream, 
11/75
tasting food, sucking lolly, eating ice cre

In [54]:
video = 1 - bite_clip
images = tf.split(video[tf.newaxis], video.shape[0], axis=1)

all_logits = []

# To run on a video, pass in one frame at a time
states = init_states
for i, image in enumerate(images):
    print(f'{i}/{len(images)}')
    # predictions for each frame
    if i > 0 and i % 5 == 0:
        states = init_states
    logits, states = model({**states, 'image': image})

    for label, p in get_top_k(logits[0]):
        print(label, p, end=', ')
    print()

    all_logits.append(logits)

# concatenating all the logits
logits = tf.concat(all_logits, 0)
# estimating probabilities
probs = tf.nn.softmax(logits, axis=-1)


0/109
staring 5.196523, raising eyebrows 4.741813, lunge 4.3045793, spray painting 4.0851145, playing squash or racquetball 3.7177255, 
1/109
chewing gum 9.992247, eating chips 9.763977, eating hotdog 9.054871, blowing bubble gum 8.813029, staring 8.161706, 
2/109
chewing gum 21.292713, eating chips 20.507023, eating hotdog 18.845299, blowing bubble gum 16.13454, eating burger 15.714641, 
3/109
chewing gum 22.756895, eating chips 22.327326, eating hotdog 20.309887, eating burger 16.99799, blowing bubble gum 16.773079, 
4/109
chewing gum 22.73225, eating chips 22.248854, eating hotdog 20.13458, beatboxing 17.16296, eating burger 16.556318, 
5/109
staring 5.590745, whistling 5.4148397, lunge 4.755303, raising eyebrows 4.2304707, crossing eyes 3.829576, 
6/109
raising eyebrows 7.7490196, staring 7.7115245, chewing gum 7.644267, whistling 6.5392447, eating hotdog 6.45524, 
7/109
chewing gum 10.532325, eating hotdog 9.142977, eating chips 9.1263685, blowing bubble gum 8.567889, eating burge

KeyboardInterrupt: 

In [16]:
final_probs = probs[-1]
print('Top_k predictions and their probablities\n')
for label, p in get_top_k(final_probs):
  print(f'{label:20s}: {p:.3f}')

Top_k predictions and their probablities

chewing gum         : 0.118
eating chips        : 0.058
blowing bubble gum  : 0.052
answering questions : 0.039
staring             : 0.035


In [15]:
# Generate a plot and output to a video tensor
plot_video = plot_streaming_top_preds(probs, video, video_fps=8., top_k=15)

  0%|          | 0/75 [00:00<?, ?it/s]

  data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
100%|██████████| 75/75 [00:37<00:00,  1.97it/s]


In [41]:
import cv2

def save_images(images, filename, fps=20.0):
    # Define the codec and create VideoWriter object
    # 'XVID' is a commonly used codec. You can replace it with 'MJPG', 'X264', etc.
    # Use 'mp4v' codec for MP4 files
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    output_file = f'{filename}.mp4'  # Output video file name
    height, width, channels = images[0].shape

    # Initialize the video writer
    video_writer = cv2.VideoWriter(output_file, fourcc, fps, (width, height))

    # Loop through each image and write it to the video
    for image in images:
        video_writer.write(image)

    # Release the video writer object
    video_writer.release()

In [145]:
bite_clip = np.fromfile('bite_clip.raw', dtype=np.float32) * 255
bite_clip = bite_clip.astype(np.uint8).reshape((8, 224, 224, 3))
bite_clip.shape

(8, 224, 224, 3)

In [None]:
# @title
# Get top_k labels and probabilities
def get_top_k(probs, k=5, label_map=KINETICS_600_LABELS):
  """Outputs the top k model labels and probabilities on the given video.

  Args:
    probs: probability tensor of shape (num_frames, num_classes) that represents
      the probability of each class on each frame.
    k: the number of top predictions to select.
    label_map: a list of labels to map logit indices to label strings.

  Returns:
    a tuple of the top-k labels and probabilities.
  """
  # Sort predictions to find top_k
  top_predictions = tf.argsort(probs, axis=-1, direction='DESCENDING')[:k]
  # collect the labels of top_k predictions
  top_labels = tf.gather(label_map, top_predictions, axis=-1)
  # decode lablels
  top_labels = [label.decode('utf8') for label in top_labels.numpy()]
  # top_k probabilities of the predictions
  top_probs = tf.gather(probs, top_predictions, axis=-1).numpy()
  return tuple(zip(top_labels, top_probs))

In [66]:
# For gif format, set codec='gif'
# save_images(plot_video, 'biteclip_probabilities_v4', fps=8)
save_images(1 - (train_bites[3].numpy() * 255).astype(np.uint8), 'bite_clip', fps=5)
# save_images((jumpingjack.numpy() * 255).astype(np.uint8), 'jjack', fps=3)
# media.show_video(plot_video, fps=3)