In [19]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os

# Preparing train data

In [None]:
df = pd.read_csv('tag.csv',encoding_errors='ignore')

In [21]:
rooms = []
for i in range(len(df)):
    item_folder = "./train/" + df.loc[i]['item'] + "/"
    natural_bool=df.loc[i]['natural']
    rooms.append((natural_bool, item_folder))

In [22]:
train_df = pd.DataFrame(data=rooms, columns=['tag', 'video_folder'])
print(train_df.head())
print(train_df.tail())

         tag           video_folder
0    natural          ./train/bear/
1    natural  ./train/bike-packing/
2    natural     ./train/blackswan/
3  unnatural     ./train/bmx-bumps/
4    natural     ./train/bmx-trees/
          tag           video_folder
84  unnatural         ./train/train/
85  unnatural       ./train/tuk-tuk/
86  unnatural   ./train/upside-down/
87  unnatural  ./train/varanus-cage/
88  unnatural       ./train/walking/


# Preparing test data

In [23]:
df = pd.read_csv('test_tag.csv',encoding_errors='ignore')

In [24]:
rooms = []
for i in range(len(df)):
    item_folder = "./test/" + df.loc[i]['item'] + "/"
    natural_bool=df.loc[i]['natural']
    rooms.append((natural_bool, item_folder))

In [25]:
test_df = pd.DataFrame(data=rooms, columns=['tag', 'video_folder'])
print(test_df.head())

         tag            video_folder
0  unnatural             ./test/bus/
1    natural           ./test/camel/
2    natural  ./test/car-roundabout/
3    natural      ./test/car-shadow/


In [26]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

In [27]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
  except RuntimeError as e:
    print(e)

# Data prepartion

In [28]:
print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

Total videos for training: 89
Total videos for testing: 4


# feed the videos to a network

In [29]:
import natsort
import glob
# make frame array
# path = "./train/blackswan/"
def load_frame(path):
    if os.path.isdir(path):
        images = natsort.natsorted(glob.glob(os.path.join(path,"*jpg")))
        images_read = [cv2.imread(frame) for frame in images]
#         print(images_read[0].shape)
#         print(np.array(images_read))
        return np.array(images_read)
# load_frame(path)

## feature extraction

In [30]:
def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(480, 641, 3), #choose later
    )
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((480, 641, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()    
    

## label encoding

In [31]:
label_processor = keras.layers.StringLookup(num_oov_indices=0, vocabulary=np.unique(train_df["tag"]))

In [32]:
print(label_processor.get_vocabulary())

labels = train_df["tag"].values
labels = label_processor(labels[..., None]).numpy()
labels

['natural', 'unnatural']


array([[0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
    

In [33]:
# define hyperparameters

IMG_WIDTH = 641
IMG_HEIGHT = 480
BATCH_SIZE = 64
EPOCHS = 100

MAX_SEQ_LENGTH = 200
NUM_FEATURES = 2048

In [34]:
train_df

Unnamed: 0,tag,video_folder
0,natural,./train/bear/
1,natural,./train/bike-packing/
2,natural,./train/blackswan/
3,unnatural,./train/bmx-bumps/
4,natural,./train/bmx-trees/
...,...,...
84,unnatural,./train/train/
85,unnatural,./train/tuk-tuk/
86,unnatural,./train/upside-down/
87,unnatural,./train/varanus-cage/


In [35]:
import pdb
def prepare_all_videos(df):
    num_samples = len(df)
    video_paths = df['video_folder'].tolist()
    labels = df["tag"].values
    
    labels = label_processor(labels[..., None]).numpy()
    
    #storing
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool") # 145,20
    frame_features = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32") #145,20,2048
    
    for idx, path in enumerate(video_paths):
        print(path)
        frames = load_frame(path)
        frames = frames [None, ...] #??
        temp_frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )

        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            print(length)
            for j in range(length):
                temp_frame_features[i, j, :] = feature_extractor.predict(
                    batch[None, j, :]
                )
            temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

        frame_features[idx,] = temp_frame_features.squeeze()
        frame_masks[idx,] = temp_frame_mask.squeeze()
    return (frame_features, frame_masks), labels



In [36]:
import time
start = time.time()
train_data, train_labels = prepare_all_videos(train_df)
test_data, test_labels = prepare_all_videos(test_df)

# test_data, test_labels = prepare_all_videos(test_df, "test")

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")



print(f"train_labels in train set: {train_labels.shape}")

print(f"test_labels in train set: {test_labels.shape}")
print("time :", time.time() - start)

# print(f"test_labels in train set: {test_labels.shape}")


./train/bear/
82
./train/bike-packing/
69
./train/blackswan/
50


./train/bmx-bumps/
90
./train/bmx-trees/
80


./train/boat/
73
./train/boxing-fisheye/
65


./train/breakdance/
43
./train/breakdance-flare/
70


./train/bus/
64
./train/camel/
43
./train/car-roundabout/
40


./train/car-shadow/
40
./train/car-turn/
80
./train/cat-girl/
60


./train/classic-car/
63
./train/color-run/
82


./train/cows/
60
./train/crossing/
52
./train/dance-jump/
50


./train/dance-twirl/
80
./train/dancing/
50


./train/disc-jockey/
76
./train/dog/
52
./train/dog-agility/
25


./train/dog-gooses/
86
./train/dogs-jump/
60


./train/dogs-scale/
70
./train/drift-chicane/
46
./train/drift-straight/
50


./train/drift-turn/
64
./train/drone/
50
./train/elephant/
68


./train/flamingo/
80
./train/goat/
80


./train/gold-fish/
66
./train/hike/
80


./train/hockey/
75
./train/horsejump-high/
47


./train/horsejump-low/
60
./train/india/
81


./train/judo/
34
./train/kid-football/
50
./train/kite-surf/
48
./train/kite-walk/
80


./train/koala/
25
./train/lab-coat/
47
./train/lady-running/
65


./train/libby/
49
./train/lindy-hop/
73


./train/loading/
50
./train/longboard/
52
./train/lucia/
70


./train/mallard-fly/
70
./train/mallard-water/
52


./train/mbike-trick/
75
./train/miami-surf/
59
./train/motocross-bumps/
60


./train/motocross-jump/
40
./train/motorbike/
43
./train/night-race/
46


./train/paragliding/
70
./train/paragliding-launch/
80


./train/parkour/
75
./train/pigs/
79


./train/planes-water/
38
./train/rallye/
50
./train/rhino/
71


./train/rollerblade/
35
./train/schoolgirls/
38
./train/scooter-black/
43
./train/scooter-board/
91


./train/scooter-gray/
68
./train/sheep/
68


./train/skate-park/
80
./train/snowboard/
66


./train/soapbox/
80
./train/soccerball/
48


./train/stroller/
79
./train/stunt/
71


./train/surf/
35
./train/swing/
60
./train/tennis/
50


./train/tractor-sand/
76
./train/train/
80


./train/tuk-tuk/
59
./train/upside-down/
65
./train/varanus-cage/
49


./train/walking/
40
./test/bus/
64
./test/camel/
43


./test/car-roundabout/
40
./test/car-shadow/
40
Frame features in train set: (89, 200, 2048)
Frame masks in train set: (89, 200)
train_labels in train set: (89, 1)
test_labels in train set: (4, 1)
time : 3287.4905433654785


# The sequence model

In [38]:
# Utility for our sequence model.
def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Refer to the following tutorial to understand the significance of using `mask`:
    # https://keras.io/api/layers/recurrent_layers/gru/
    x = keras.layers.GRU(16, return_sequences=True)(frame_features_input, mask=mask_input)
    x = keras.layers.GRU(8)(x)
    x = keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(8, activation="relu")(x)
    output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)

    rnn_model = keras.Model([frame_features_input, mask_input], output)

    rnn_model.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    return rnn_model

EPOCHS = 30
# Utility for running experiments.
def run_experiment():
    filepath = "./checkpoint/"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    seq_model = get_sequence_model()
    pdb.set_trace()
    history = seq_model.fit(
        [train_data[0], train_data[1]],
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[checkpoint],
    )

    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model


_, sequence_model = run_experiment()

> [1;32mc:\users\vml_sub\appdata\local\temp\ipykernel_568\2344175082.py[0m(33)[0;36mrun_experiment[1;34m()[0m

ipdb> seq_model
<keras.engine.functional.Functional object at 0x000001740A2848E0>
ipdb> history = seq_model.fit(         [train_data[0], train_data[1]],         train_labels,         validation_split=0.3,         epochs=EPOCHS,         callbacks=[checkpoint],     )
Epoch 1/30
Epoch 1: val_loss improved from inf to 0.69062, saving model to .\checkpoint
*** UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbe in position 86: invalid start byte
--KeyboardInterrupt--

KeyboardInterrupt: Interrupted by user
Epoch 1/30
Epoch 1: val_loss improved from 0.69062 to 0.66692, saving model to .\checkpoint


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbe in position 86: invalid start byte

# Inference

In [23]:
def prepare_single_video(frames):
    frames = frames[None, ...]
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask


def sequence_prediction(path):
    class_vocab = label_processor.get_vocabulary()

    frames = load_frameameameameamerameramerame(path)
    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]

    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames

test_video = "./test/camel"
print(f"Test video path: {test_video}")

test_frames = sequence_prediction(test_video)


Test video path: dataset/test/dancing/dancing (23.mp4
  dancing: 33.42%
  exercise: 33.42%
  yoga: 33.16%


In [24]:
from IPython.display import HTML

HTML("""
    <video alt="test" width="520" height="440" controls>
        <source src="dataset/test/dancing/dancing (23.mp4" type="video/mp4" style="height:300px;width:300px">
    </video>
""")
