In [None]:
 !python3 -m pip install --upgrade --user ortools

In [None]:
from os import PathLike
from pathlib import Path
from typing import List
import cv2
import numpy as np
import torch
from PIL import Image
from numpy import linalg
from torch import nn
from torchvision import transforms, models
import csv
import os
import h5py
import json
from google.colab import drive
from typing import Iterable, List
from ortools.algorithms.python import knapsack_solver
import numpy as np
import h5py
import json
# Mount the user's Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): presumably the video / annotation / output paths used by later cells
# live under this mount point — they are not defined in the visible cells; confirm.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Expected length of a single frame feature vector; FeatureExtractor.run asserts that
# the extracted feature has shape (feat_size,).
feat_size = 1024
# Small constant added to the feature's norm before dividing by it in
# FeatureExtractor.run, so the denominator is never exactly zero.
normalization_rate = 1e-10

In [None]:
# Code for Feature Extraction

class FeatureExtractor(object):
    """Map one RGB frame to a normalised GoogLeNet feature vector.

    The network is torchvision's GoogLeNet with its default pretrained weights,
    with the last two child modules dropped so that the output is a feature
    vector instead of class scores. The model runs on the GPU when CUDA is
    available and on the CPU otherwise.
    """

    def __init__(self):
        # pytorch recommended parameters
        self.preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        # Pick the device once; falling back to the CPU keeps the extractor
        # usable on runtimes without a GPU instead of crashing in `.cuda()`.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # use googlenet model with imagenet weights
        backbone = models.googlenet(weights='DEFAULT')
        # remove last 2 layers for feature extraction
        self.model = nn.Sequential(*list(backbone.children())[:-2])
        self.model = self.model.to(self.device).eval()

    def run(self, img: np.ndarray) -> np.ndarray:
        """Extract the feature vector of a single frame.

        Args:
            img: Image array accepted by ``PIL.Image.fromarray``; the caller in
                this notebook passes frames already converted to RGB.

        Returns:
            1-D array of length ``feat_size``, divided by its own norm
            (plus ``normalization_rate``).

        Raises:
            AssertionError: if the network output does not squeeze to
                ``(feat_size,)``.
        """
        # process frames to extract features
        pil_img = Image.fromarray(img)
        batch = self.preprocess(pil_img).unsqueeze(0).to(self.device)
        with torch.no_grad():
            feat = self.model(batch)
        feat = feat.squeeze().cpu().numpy()

        assert feat.shape == (feat_size,), f'Invalid feature shape {feat.shape}: expected {feat_size}'
        # normalize frame features
        feat /= linalg.norm(feat) + normalization_rate
        return feat


In [None]:
class VideoPreprocessor(object):
    """Read a video, extract features from sub-sampled frames and segment it.

    Every ``sample_rate``-th frame is passed through ``FeatureExtractor``; the
    resulting feature sequence is then split into segments with ``cpd_auto``.
    """

    # initialize feature extractor and sample rate for downsampling sequence
    def __init__(self, sample_rate: int) -> None:
        self.model = FeatureExtractor()
        self.sample_rate = sample_rate

    # function to read video and downsample the video
    def get_features(self, video_path: PathLike):
        """Decode ``video_path`` and extract features of the sampled frames.

        Args:
            video_path: Path of the video file to read.

        Returns:
            ``(n_frames, features)`` where ``n_frames`` is the number of frames
            decoded and ``features`` stacks one feature vector per sampled
            frame (frames 0, sample_rate, 2 * sample_rate, ...).

        Raises:
            OSError: if OpenCV cannot open the video.
        """
        video_path = Path(video_path)
        cap = cv2.VideoCapture(str(video_path))
        # VideoCapture always returns an object, even for a missing or
        # unreadable file, so the open state has to be queried explicitly.
        if not cap.isOpened():
            cap.release()
            raise OSError(f'Cannot open video: {video_path}')

        features = []
        n_frames = 0
        try:
            # downsample the video
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                if n_frames % self.sample_rate == 0:
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    features.append(self.model.run(frame))

                n_frames += 1
        finally:
            # Release the capture even if feature extraction raises.
            cap.release()

        features = np.array(features)
        return n_frames, features

    # extract change points and other features
    def kts(self, n_frames, features):
        """Segment the video from its sampled-frame features.

        Args:
            n_frames: Total number of frames in the original video.
            features: Feature matrix with one row per sampled frame.

        Returns:
            ``(change_points, n_frame_per_seg, picks)``:
            ``change_points`` has one ``[first, last]`` row per segment (both
            inclusive, in original frame indices), ``n_frame_per_seg`` holds
            the length of each segment in frames, and ``picks`` holds the
            original frame index of every sampled frame.
        """
        seq_len = len(features)
        picks = np.arange(0, seq_len) * self.sample_rate

        # compute change points using KTS
        # NOTE(review): cpd_auto is not defined in the visible cells; it is
        # assumed to return (change_points, scores) in sampled-frame indices.
        kernel = np.matmul(features, features.T)
        change_points, _ = cpd_auto(kernel, seq_len - 1, 1, verbose=False)
        # Convert from sampled-frame indices back to original frame indices.
        change_points *= self.sample_rate
        # Add the video start and end as segment boundaries.
        change_points = np.hstack((0, change_points, n_frames))
        begin_frames = change_points[:-1]
        end_frames = change_points[1:]
        change_points = np.vstack((begin_frames, end_frames - 1)).T

        n_frame_per_seg = end_frames - begin_frames
        return change_points, n_frame_per_seg, picks

    # run the following functions for feature extraction
    def run(self, video_path: PathLike):
        """Run ``get_features`` then ``kts`` on one video.

        Returns:
            ``(n_frames, features, change_points, n_frame_per_seg, picks)``.
        """
        n_frames, features = self.get_features(video_path)
        cps, nfps, picks = self.kts(n_frames, features)
        return n_frames, features, cps, nfps, picks

In [None]:
# Collect the video ids to process from the tab-separated ordering file: the second
# column of every row is kept, except for the header row whose value is "video_id".
# NOTE(review): `order_path` is not defined in any visible cell — it must be set
# before this cell runs.
video_titles = []
# newline='' lets the csv module handle line endings itself, as its documentation requires.
with open(order_path, newline='') as fd:
    reader = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in reader:
        if not row:
            # Blank line (e.g. at the end of the file): nothing to read.
            continue
        if row[1] != "video_id":
            video_titles.append(row[1])


In [None]:
# define knapsack and the conversion of keyframe scores to keyshot summaries
def knapsack(values: Iterable[int],
             weights: Iterable[int],
             capacity: int
             ) -> List[int]:
    """Solve a single-constraint 0/1 knapsack with OR-Tools.

    Args:
        values: Value of each item.
        weights: Weight of each item, in the same order as ``values``.
        capacity: Maximum total weight; converted with ``int()``.

    Returns:
        Indices of the items the solver reports as part of its best solution,
        in increasing order.
    """
    dp_solver = knapsack_solver.KnapsackSolver(
        knapsack_solver.SolverType.KNAPSACK_DYNAMIC_PROGRAMMING_SOLVER, 'test'
    )

    # Materialise the iterables: the solver takes one list of values and one
    # weight list / capacity per constraint (a single constraint here).
    item_values = list(values)
    item_weights = list(weights)
    max_weight = int(capacity)

    dp_solver.init(item_values, [item_weights], [max_weight])
    dp_solver.solve()

    chosen = []
    for item_idx in range(len(item_weights)):
        if dp_solver.best_solution_contains(item_idx):
            chosen.append(item_idx)
    return chosen

def get_keyshot_summ(frame_scores: np.ndarray,
                     cps: np.ndarray,
                     n_frames: int,
                     nfps: np.ndarray,
                     picks: np.ndarray,
                     proportion: float = 0.2
                     ) -> np.ndarray:
    """Turn per-frame scores into a boolean key-shot summary.

    Args:
        frame_scores: Score of each frame, indexed by the frame indices in ``cps``.
        cps: One ``(first, last)`` pair per shot, both inclusive.
        n_frames: Length of the returned mask.
        nfps: Number of frames of each shot, used as knapsack weights.
        picks: Not used by this function.
        proportion: Fraction of ``n_frames`` used as the knapsack capacity.

    Returns:
        Boolean array of length ``n_frames`` that is True on every frame of a
        selected shot.
    """
    # Each shot is valued at 1000x the mean score of its frames, truncated to int.
    shot_values = np.zeros(len(cps), dtype=np.int32)
    for shot_idx in range(len(cps)):
        start, stop = cps[shot_idx]
        shot_values[shot_idx] = int(1000 * frame_scores[start:stop + 1].mean())

    # Choose the shots with the knapsack solver under the frame budget.
    budget = int(n_frames * proportion)
    selected = knapsack(shot_values, nfps, budget)

    # Mark every frame belonging to a selected shot.
    keyshot_mask = np.zeros(n_frames, dtype=np.bool_)
    for shot_idx in selected:
        start, stop = cps[shot_idx]
        keyshot_mask[start:stop + 1] = True

    return keyshot_mask

In [None]:
# Build the HDF5 dataset: for every video, store its frame features, change points and
# the annotation-derived ground truth under the group `video_<index>`.
# NOTE(review): `file_name`, `videos_path` and `anno_path` are not defined in any visible
# cell — they must be set before this cell runs.

# One frame out of every SAMPLE_RATE is passed to the feature extractor.
SAMPLE_RATE = 15

#initialize preprocessor for video frames
video_proc = VideoPreprocessor(SAMPLE_RATE)

#create dataset file and write features of the dataset
with h5py.File(file_name, 'w') as h5out:
    for idx, video_title in enumerate(video_titles):

        video_key = f'video_{idx+1}'
        print("working on video ", video_title, " ", video_key, " of ", len(video_titles))

        video_path = os.path.join(videos_path, video_title + ".mp4")
        n_frames, features, cps, nfps, picks = video_proc.run(video_path)

        label_path = os.path.join(anno_path, video_title +".json")

        #get video labels (annotations)
        with open(label_path, 'r') as file:
            json_data = json.load(file)
        user_summary = np.array(json_data["user_summary"], dtype=np.float32)

        # Ensure Equal Shape: the annotations must have exactly one column per video frame.
        _, label_n_frames = user_summary.shape

        print(user_summary.shape)
        if (label_n_frames < n_frames):
            print(f'Invalid label of size {label_n_frames}: expected {n_frames}')
            # Too few labels: pad the missing trailing frames with the value 1.
            # NOTE(review): presumably 1 is the lowest annotation score — confirm.
            user_summary = np.pad(user_summary, ((0, 0), (0,  n_frames - label_n_frames)), mode='constant', constant_values=1)
        elif (label_n_frames > n_frames):
            print(f'Invalid label of size {label_n_frames}: expected {n_frames}')
            # Too many labels: drop the columns beyond the last frame.
            user_summary = user_summary[:, :n_frames]

        # Ground-truth score of each sampled frame: mean over the rows of user_summary.
        # Indexing with `picks` (the frame indices the features were extracted from)
        # keeps gtscore aligned with `features` for any sample rate, instead of
        # repeating the sampling step as a literal stride.
        gtscore = np.mean(user_summary[:, picks], axis=0)

        # convert annotations into summaries (one key-shot mask per row of user_summary)
        bool_summary = []
        for summary in user_summary:
            bool_summary.append(get_keyshot_summ(summary, cps, n_frames, nfps, picks))
        bool_summary = np.array(bool_summary, dtype=np.float32)

        #write dataset to h5 file
        h5out.create_dataset(f'{video_key}/features', data=features)
        h5out.create_dataset(f'{video_key}/gtscore', data=gtscore)
        h5out.create_dataset(f'{video_key}/user_summary', data=bool_summary)
        h5out.create_dataset(f'{video_key}/change_points', data=cps)
        h5out.create_dataset(f'{video_key}/n_frame_per_seg', data=nfps)
        h5out.create_dataset(f'{video_key}/n_frames', data=n_frames)
        h5out.create_dataset(f'{video_key}/picks', data=picks)
        h5out.create_dataset(f'{video_key}/video_name', data=video_title)

    print(f'Dataset saved to {file_name}')


Downloading: "https://download.pytorch.org/models/googlenet-1378be20.pth" to /root/.cache/torch/hub/checkpoints/googlenet-1378be20.pth
100%|██████████| 49.7M/49.7M [00:00<00:00, 159MB/s]


working on video  AwmHb44_ouw   video_1  of  50
(20, 10597)
working on video  98MoyGZKHXc   video_2  of  50
(20, 4688)
working on video  J0nA4VgnoCo   video_3  of  50
(20, 14019)
working on video  gzDbaEs1Rlg   video_4  of  50
(20, 7210)
working on video  XzYM3PfTM4w   video_5  of  50
(20, 3327)
working on video  HT5vyqe0Xaw   video_6  of  50
(20, 9671)
working on video  sTEELN-vY30   video_7  of  50
(20, 4468)
working on video  vdmoEJ5YbrQ   video_8  of  50
(20, 9870)
working on video  xwqBXPGE9pQ   video_9  of  50
(20, 7010)
working on video  akI8YFjEmUw   video_10  of  50
(20, 3995)
working on video  i3wAGJaaktw   video_11  of  50
(20, 4700)
working on video  Bhxk-O1Y7Ho   video_12  of  50
(20, 13511)
working on video  0tmA_C6XwfM   video_13  of  50
(20, 3532)
working on video  3eYKfiOEJNs   video_14  of  50
(20, 4853)
working on video  xxdtq8mxegs   video_15  of  50
(20, 4324)
working on video  WG0MBPpPC6I   video_16  of  50
(20, 9534)
Invalid label of size 9534: expected 9535
work