In [1]:
import json
import logging
import os
import yaml

import hydra
import openai
import pandas as pd
import whisper
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

from video_summarization.src.asr import get_text_recognition
from video_summarization.src.ic import get_image_caption
from video_summarization.src.io import save_audio, get_images
from video_summarization.src.llm import create_prompt, transform_predictions, fill_preds, merge_results, \
    create_batch_prompt
from video_summarization.src.metric import eval_metric_dummy, eval_metric_f1_canonical

In [2]:
with open("../confs/predict.yaml") as f:
    cfg = yaml.safe_load(f.read())
cfg

{'dataset_path': '/home/mark/PycharmProjects/video_summarization/datasets',
 'work_path': '/home/mark/PycharmProjects/video_summarization/workspace',
 'open_ai_env_name': 'OPENAI_APIKEY',
 'video_path': 'videos/Bhxk-O1Y7Ho.mp4',
 'image_caption': {'fps': 1,
  'image_size': [224, 224],
  'model_name': 'nlpconnect/vit-gpt2-image-captioning',
  'feature_extractor': 'nlpconnect/vit-gpt2-image-captioning',
  'tokenizer': 'nlpconnect/vit-gpt2-image-captioning',
  'kwargs': {'max_length': 10, 'num_beams': 2}},
 'llm': {'model': 'text-davinci-003',
  'temperature': 0.7,
  'max_tokens': 256,
  'top_p': 1,
  'frequency_penalty': 0,
  'presence_penalty': 0},
 'asr': 'tiny.en',
 'annotation_path': 'annotation.parquet',
 'max_tokens_in_batch': 2000}

In [3]:
%%time
openai.api_key = os.getenv(cfg["open_ai_env_name"])

vit_model = VisionEncoderDecoderModel.from_pretrained(cfg["image_caption"]["model_name"])
vit_feature_extractor = ViTImageProcessor.from_pretrained(cfg["image_caption"]["feature_extractor"])
vit_tokenizer = AutoTokenizer.from_pretrained(cfg["image_caption"]["tokenizer"])

asr_model = whisper.load_model(cfg["asr"])

labels_df = pd.read_parquet(os.path.join(cfg["dataset_path"], cfg["annotation_path"]))
print("Load labels")

video_path = os.path.join(cfg["dataset_path"], cfg["video_path"])
ic_kwargs = cfg["image_caption"]["kwargs"]

audio_path = save_audio(video_path, target_path=cfg["work_path"])
print("Converted video to image")
images = get_images(video_path, cfg["image_caption"]["fps"], cfg["image_caption"]["image_size"])
ic_result = get_image_caption(images, vit_feature_extractor, vit_tokenizer, vit_model, **ic_kwargs)
print("Got image caption")

asr_result = get_text_recognition(asr_model, audio_path)

all_texts = merge_results(ic_result, asr_result, cfg['max_tokens_in_batch'])

Load labels
MoviePy - Writing audio in /home/mark/PycharmProjects/video_summarization/workspace/Bhxk-O1Y7Ho.mp3


                                                                                                                                                                                                           

MoviePy - Done.
Converted video to image
Got image caption
CPU times: user 19min 14s, sys: 2min 20s, total: 21min 35s
Wall time: 3min 54s


In [4]:
all_texts

Unnamed: 0,second,text_ic,wc_ic,cwc_ic,index_ic,index_asr,start_second,end_second,text_asr,wc_asr,cwc_asr,wc_all,batch
0,0,a person is taking a picture of themselves,8,8,0.0,0.0,0.0,2.0,I totally jacked up his bum.,6.0,6.0,14.0,0
1,1,a woman in a white dress is holding a,9,17,0.0,0.0,0.0,2.0,I totally jacked up his bum.,6.0,6.0,29.0,0
2,2,a white cat is laying on a bed,8,25,0.0,0.0,0.0,2.0,I totally jacked up his bum.,6.0,6.0,43.0,0
3,3,a black and white photo of a clock on,9,34,,,,,,,,52.0,0
4,4,a city at night with a lot of traffic,9,43,,,,,,,,61.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
461,461,a white dog with a black collar sitting on,9,4006,,,,,,,,8693.0,4
462,462,a white dog wearing a white bow tie,8,4014,,,,,,,,8701.0,4
463,463,a white polar bear laying on the ground,8,4022,,,,,,,,8709.0,4
464,464,a white dog with a black and white stripe,9,4031,,,,,,,,8718.0,4


In [51]:
FPS = 29
filename = os.path.basename(video_path).split(".")[0]
annotation = labels_df[labels_df['video_id'] == filename].annotation.values[0]

def create_batch_annotation_prompt(annotation, start, end):
    
    return """
Second Importance 
{}
""".format("\n".join([f"{num} {annotation[i]}" for i, num in enumerate(range(start, end+1))]))


prompts = []
answers = []
for batch_id in range(all_texts['batch'].nunique()):
    batch_df = all_texts[all_texts['batch'] == batch_id]
    batch_df = batch_df[['second', 'text_ic', 'text_asr']].fillna("")
    batch_df = batch_df.rename({'text_ic': 'image description', 'text_asr': 'speech recognition'})
    print("Batch: start {}, end {}".format(batch_df['second'].min(), batch_df['second'].max()))
    batch_prompt = create_batch_prompt(batch_df)
    
    start = batch_df['second'].min() * FPS
    end = batch_df['second'].max() * FPS
    batch_ann = annotation[start:end+1:FPS]
    print("Batch size", batch_df.shape[0], "ann size", batch_ann.shape[0])
    
    answers.append(create_batch_annotation_prompt(batch_ann, batch_df['second'].min(), batch_df['second'].max()))
    prompts.append(batch_prompt)

Batch: start 0, end 97
Batch size 98 ann size 98
Batch: start 98, end 198
Batch size 101 ann size 101
Batch: start 199, end 321
Batch size 123 ann size 123
Batch: start 322, end 418
Batch size 97 ann size 97
Batch: start 419, end 465
Batch size 47 ann size 47


In [52]:
print(batch_prompt)


        Summarize the video by description data

         second                                          text_ic                                                                                      text_asr
    419  a woman is looking at the camera while brushing                               And I want his head to grow out and I don't know. I have a plan
    420           a woman is sitting at a counter with a                                 But I'm just too tired to talk about it. So I'm gonna say bye
    421             a woman is looking at her cell phone                                 But I'm just too tired to talk about it. So I'm gonna say bye
    422           a woman is looking at her phone in the                                 But I'm just too tired to talk about it. So I'm gonna say bye
    423        a woman sitting at a counter in a kitchen                                                                                              
    424           a woman is looking

In [53]:
print(answers[-1])


Second Importance 
419 0.24
420 0.24
421 0.24
422 0.22999999999999998
423 0.22999999999999998
424 0.25
425 0.25
426 0.22999999999999998
427 0.22999999999999998
428 0.22999999999999998
429 0.22999999999999998
430 0.22000000000000003
431 0.22000000000000003
432 0.24
433 0.24
434 0.24
435 0.26
436 0.26
437 0.25
438 0.25
439 0.27999999999999997
440 0.27999999999999997
441 0.51
442 0.51
443 0.5
444 0.5
445 0.48
446 0.48
447 0.45999999999999996
448 0.45999999999999996
449 0.6
450 0.6
451 0.55
452 0.55
453 0.63
454 0.63
455 0.35
456 0.35
457 0.64
458 0.64
459 0.55
460 0.55
461 0.61
462 0.61
463 0.5599999999999999
464 0.5599999999999999
465 0.5599999999999999



In [56]:
instructions = [{'instruction': p, 'output': a} for p,a in zip(prompts, answers)]
instructions

[{'instruction': "\n        Summarize the video by description data\n\n         second                                             text_ic                                                                                                       text_asr\n      0          a person is taking a picture of themselves                                                                                   I totally jacked up his bum.\n      1               a woman in a white dress is holding a                                                                                   I totally jacked up his bum.\n      2                      a white cat is laying on a bed                                                                                   I totally jacked up his bum.\n      3               a black and white photo of a clock on                                                                                                               \n      4               a city at night with a lot of traffic  

In [64]:
with open("instructions.json", "w") as f:
    json.dump(instructions, f)