# TransNet: A deep network for fast detection of common shot transitions
This repository contains the code for the paper *TransNet: A deep network for fast detection of common shot transitions*.

If you use it in your work, please cite:


    @article{soucek2019transnet,
        title={TransNet: A deep network for fast detection of common shot transitions},
        author={Sou{\v{c}}ek, Tom{\'a}{\v{s}} and Moravec, Jaroslav and Loko{\v{c}}, Jakub},
        journal={arXiv preprint arXiv:1906.03363},
        year={2019}
    }

In [1]:
import ffmpeg
import numpy as np
import pandas as pd
import os
import tensorflow as tf
tf.compat.v1.disable_v2_behavior()

from transnet import TransNetParams, TransNet
from transnet_utils import draw_video_with_predictions, scenes_from_predictions

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# initialize the network
# Start from a fresh TransNetParams() and override only the checkpoint location.
params = TransNetParams()
# Relative path, so it is resolved against the notebook's working directory.
params.CHECKPOINT_PATH = "./model/transnet_model-F16_L3_S2_D256"

# Build the network from the configured params; `net` is reused later for net.predict_video(...).
net = TransNet(params)

[TransNet] Creating ops.
           Input (?, ?, 27, 48, 3)
           SDDCNN_1
Instructions for updating:
Colocations handled automatically by placer.
           > DDCNN_1 (?, ?, 27, 48, 64)
           > DDCNN_2 (?, ?, 27, 48, 64)
           MaxPool (?, ?, 13, 24, 64)
           SDDCNN_2
           > DDCNN_1 (?, ?, 13, 24, 128)
           > DDCNN_2 (?, ?, 13, 24, 128)
           MaxPool (?, ?, 6, 12, 128)
           SDDCNN_3
           > DDCNN_1 (?, ?, 6, 12, 256)
           > DDCNN_2 (?, ?, 6, 12, 256)
           MaxPool (?, ?, 3, 6, 256)
           Flatten (?, ?, 4608)
           Dense (?, ?, 256)
           Logits (?, ?, 2)
           Predictions (?, ?)
[TransNet] Network built.
[TransNet] Found 4614850 trainable parameters.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./model/transnet_model-F16_L3_S2_D256
[TransNet] Parameters restored from 'transnet_model-F16_L3_S2_D256'.


In [3]:
# Collect the names of all regular files in the video directory, sorted by name.

dir_path = '/media/madziegielewska/Seagate Expansion Drive/MAGISTERKA/diploma-project/videos'

# Only bare file names are kept (not full paths); entries that are not regular
# files, e.g. sub-directories, are skipped.
files = sorted(
    entry
    for entry in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, entry))
)

# Displayed as the cell result: how many videos were found.
len(files)

897

In [5]:
# Project root on the external drive. Later cells build their paths by direct
# concatenation (f"{dir}videos/..."), so the trailing slash must stay.
# NOTE(review): the name shadows the builtin `dir()`; it is kept because other
# cells reference it. A plain string is used since there is nothing to interpolate.
dir = "/media/madziegielewska/Seagate Expansion Drive/MAGISTERKA/diploma-project/"

In [13]:
# Run TransNet over every video in `files`. For each video the raw predictions are
# saved to a text file and the index of the highest prediction is recorded as the
# video's detected event-change frame.

# Per-video prediction files go here. np.savetxt does not create missing
# directories, so make sure the target exists before the (long) loop starts.
predictions_dir = f"{dir}Event-Boundary-Detection/transnet1_results/predictions"
os.makedirs(predictions_dir, exist_ok=True)

# One record per video. The DataFrame is built once after the loop instead of
# being grown with pd.concat on every iteration (which copies all rows each time).
results = []

for count, file in enumerate(files):
    print(f"{len(files)-count} left")
    file_path = f"{dir}videos/{file}"

    filename, file_extension = os.path.splitext(file)
    output_path_preds = f"{predictions_dir}/{filename}"

    # Decode the whole video into raw RGB24 bytes, scaled to the network's input size.
    video_stream, err = (
        ffmpeg
        .input(file_path)
        .output('pipe:', format='rawvideo', pix_fmt='rgb24', s='{}x{}'.format(params.INPUT_WIDTH, params.INPUT_HEIGHT))
        .run(capture_stdout=True)
    )
    # Reinterpret the byte stream as an array of frames: (n_frames, height, width, 3).
    video = np.frombuffer(video_stream, np.uint8).reshape([-1, params.INPUT_HEIGHT, params.INPUT_WIDTH, 3])

    # predict transitions using the neural network
    predictions = net.predict_video(video)
    np.savetxt(output_path_preds + "_preds.txt", predictions, fmt="%.6f")

    # Position of the single highest prediction value; stored as a plain int
    # (the CSV text is the same as when it was stored as a string).
    event_change_pred = int(np.array(predictions).argmax())
    print(event_change_pred)

    results.append({"video": file, "frame": event_change_pred})

# Explicit columns keep the header intact even when `files` is empty.
event_change_test = pd.DataFrame(results, columns=["video", "frame"])

# save test preprocessed data to csv
# Written into the Event-Boundary-Detection folder, which is where the
# "read test data" cell loads test_data_transnet1.csv from.
event_change_test.to_csv(f"{dir}Event-Boundary-Detection/test_data_transnet1.csv", index=False)

897 left
[TransNet] Processing video frames 1042/1042
488
896 left
[TransNet] Processing video frames 1129/1129
532
895 left
[TransNet] Processing video frames 1373/1373
786
894 left
[TransNet] Processing video frames 919/919
905
893 left
[TransNet] Processing video frames 1110/1110
547
892 left
[TransNet] Processing video frames 1107/1107
69
891 left
[TransNet] Processing video frames 1410/1410
558
890 left
[TransNet] Processing video frames 916/916
384
889 left
[TransNet] Processing video frames 838/838
199
888 left
[TransNet] Processing video frames 1162/1162
683
887 left
[TransNet] Processing video frames 1146/1146
475
886 left
[TransNet] Processing video frames 1070/1070
461
885 left
[TransNet] Processing video frames 637/637
295
884 left
[TransNet] Processing video frames 863/863
423
883 left
[TransNet] Processing video frames 787/787
349
882 left
[TransNet] Processing video frames 687/687
628
881 left
[TransNet] Processing video frames 936/936
418
880 left
[TransNet] Processing 

In [14]:
# Load the reference table (data.csv) that the predictions are compared against.
df = pd.read_csv(f'{dir}Event-Boundary-Detection/data.csv')

# Pull the two columns out as plain Python lists; row order is preserved.
videos, event_change_true = (df[column].tolist() for column in ('video', 'frame'))

In [18]:
# Load the saved TransNet results (test_data_transnet1.csv).
df = pd.read_csv(f'{dir}Event-Boundary-Detection/test_data_transnet1.csv')

# NOTE: `videos` is rebound here to the names listed in this file.
videos, event_change_test = (df[column].tolist() for column in ('video', 'frame'))

In [19]:
# compare results
#
# A prediction counts as correct when it equals the reference frame, or lies
# strictly inside (true - lower_tolerance, true + upper_tolerance).
# NOTE(review): pairs are matched purely by position, so this assumes both lists
# describe the same videos in the same order — confirm against the two CSV files.

# include measurement uncertainty (same units as the frame values);
# constant for the whole run, so defined once instead of inside the loop
lower_tolerance = 10
upper_tolerance = 10

total = len(event_change_true)

# zip() silently drops the unmatched tail, which would skew the accuracy below,
# so refuse to compare lists of different length.
if len(event_change_test) != total:
    raise ValueError(
        f"ground truth has {total} entries but predictions have {len(event_change_test)}"
    )
# Avoid a bare ZeroDivisionError in the accuracy computation.
if total == 0:
    raise ValueError("no ground-truth entries to evaluate")

correct_detection = sum(
    1
    for true_change, test_change in zip(event_change_true, event_change_test)
    if true_change == test_change
    or true_change - lower_tolerance < test_change < true_change + upper_tolerance
)


print("correct detection: ", correct_detection)
print("incorrect detection: ", total - correct_detection)
print(f"accuracy: {round(correct_detection/total*100, 2)}%")

correct detection:  344
incorrect detection:  553
accuracy: 38.35%
