In [56]:
import os
import yaml

import hydra
import openai
import pandas as pd
import numpy as np
import whisper
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

from video_summarization.src.asr import get_text_recognition
from video_summarization.src.ic import get_image_caption
from video_summarization.src.io import save_audio, get_images
from video_summarization.src.llm import create_prompt, transform_predictions, fill_preds
from video_summarization.src.metric import eval_metric

In [4]:
!pwd

/home/mark/PycharmProjects/video_summarization/notebooks


In [11]:
# Parse the prediction config (YAML) into a plain dict; the path is resolved against the
# current working directory.
with open("../confs/predict.yaml") as config_file:
    cfg = yaml.safe_load(config_file)
cfg

{'dataset_path': '/home/mark/PycharmProjects/video_summarization/datasets',
 'work_path': '/home/mark/PycharmProjects/video_summarization/workspace',
 'open_ai_env_name': 'OPENAI_APIKEY',
 'video_path': 'videos/Bhxk-O1Y7Ho.mp4',
 'image_caption': {'fps': 1,
  'image_size': [224, 224],
  'model_name': 'nlpconnect/vit-gpt2-image-captioning',
  'feature_extractor': 'nlpconnect/vit-gpt2-image-captioning',
  'tokenizer': 'nlpconnect/vit-gpt2-image-captioning',
  'kwargs': {'max_length': 10, 'num_beams': 2}},
 'llm': {'model': 'text-davinci-003',
  'temperature': 0.7,
  'max_tokens': 256,
  'top_p': 1,
  'frequency_penalty': 0,
  'presence_penalty': 0},
 'asr': 'tiny.en',
 'annotation_path': 'annotation.parquet'}

In [12]:
# The OpenAI key is read from the environment variable whose *name* is stored in the config;
# os.getenv returns None when that variable is not set.
openai.api_key = os.getenv(cfg["open_ai_env_name"])

# Image-captioning model, its image processor and tokenizer, all loaded by the names in the config.
vit_model = VisionEncoderDecoderModel.from_pretrained(cfg["image_caption"]["model_name"])
vit_feature_extractor = ViTImageProcessor.from_pretrained(cfg["image_caption"]["feature_extractor"])
vit_tokenizer = AutoTokenizer.from_pretrained(cfg["image_caption"]["tokenizer"])

# Whisper speech-recognition model selected by cfg["asr"].
asr_model = whisper.load_model(cfg["asr"])

# Annotation table read from <dataset_path>/<annotation_path>.
labels_df = pd.read_parquet(os.path.join(cfg["dataset_path"], cfg["annotation_path"]))
print("Load labels")

video_path = os.path.join(cfg["dataset_path"], cfg["video_path"])
# Extra keyword arguments forwarded unchanged to get_image_caption below.
ic_kwargs = cfg["image_caption"]["kwargs"]

# Audio track of the video, written under the work directory (project helper save_audio).
# NOTE(review): the message below is printed right after the audio is saved and *before*
# get_images runs, so its wording ("video to image") looks misleading — confirm the intent.
audio_path = save_audio(video_path, target_path=cfg["work_path"])
print("Converted video to image")
# Frames sampled with the configured fps and image size, then captioned.
images = get_images(video_path, cfg["image_caption"]["fps"], cfg["image_caption"]["image_size"])
ic_result = get_image_caption(images, vit_feature_extractor, vit_tokenizer, vit_model, **ic_kwargs)
print("Got image caption")

# Speech recognition on the extracted audio, then a single prompt built from both sources.
asr_result = get_text_recognition(asr_model, audio_path)
prompt = create_prompt(asr_result, ic_result)
print(prompt)

Load labels
MoviePy - Writing audio in /home/mark/PycharmProjects/video_summarization/workspace/Bhxk-O1Y7Ho.mp3


                                                                                                                                                                                                                               

MoviePy - Done.
Converted video to image
Got image caption

        Summarize the video by description data
        Decription of each image per second:
         second                                                text
      0          a person is taking a picture of themselves
      1               a woman in a white dress is holding a
      2                      a white cat is laying on a bed
      3               a black and white photo of a clock on
      4               a city at night with a lot of traffic
      5               a city at night with a lot of traffic
      6               a city at night with a lot of traffic
      7               a city at night with a lot of traffic
      8            a woman in a black shirt and black pants
      9        a woman is brushing her teeth in the kitchen
     10              a woman is looking at her phone in the
     11              a woman is looking at her phone in the
     12            a woman in a white shirt and black pants

In [27]:
# Words per caption (whitespace-separated) and their running total, stored on ic_result in place.
caption_word_counts = ic_result['text'].str.split().apply(len)
ic_result['wc'] = caption_word_counts
ic_result['cwc'] = caption_word_counts.cumsum()
ic_result

Unnamed: 0,second,text,wc,cwc
0,0,a person is taking a picture of themselves,8,8
1,1,a woman in a white dress is holding a,9,17
2,2,a white cat is laying on a bed,8,25
3,3,a black and white photo of a clock on,9,34
4,4,a city at night with a lot of traffic,9,43
...,...,...,...,...
461,461,a white dog with a black collar sitting on,9,4006
462,462,a white dog wearing a white bow tie,8,4014
463,463,a white polar bear laying on the ground,8,4022
464,464,a white dog with a black and white stripe,9,4031


In [31]:
# Words per recognized speech segment (whitespace-separated) and their running total,
# stored on asr_result in place.
speech_word_counts = asr_result['text'].str.split().apply(len)
asr_result['wc'] = speech_word_counts
asr_result['cwc'] = speech_word_counts.cumsum()
asr_result

Unnamed: 0,start_second,end_second,text,wc,cwc
0,0.00,2.00,I totally jacked up his bum.,6,6
1,7.00,10.60,Good morning. Happy Saturday. I do not look m...,12,18
2,10.60,14.82,I think I feel a little bit better or I'm jus...,14,32
3,15.76,18.12,Jugs that it doesn't hurt,5,37
4,20.44,25.84,"But yeah, I'm gonna give me a bath and a trim...",21,58
...,...,...,...,...,...
92,427.18,429.18,To all your friends,4,1006
93,430.22,437.18,Not bad guys. It was crazy. I got a bath a bl...,18,1024
94,437.82,443.74,"Sprout because it's a knot, but I don't have ...",19,1043
95,445.02,447.02,You can't give them the kisses my bobs,8,1051


In [91]:
def find_ind(x, asr_result):
    """Locate the speech segment whose time span contains ``x``.

    Args:
        x: Point in time, compared against the segment bounds.
        asr_result: DataFrame with ``start_second`` and ``end_second`` columns.

    Returns:
        The positional (0-based) row number of the first segment with
        ``start_second <= x <= end_second`` (both bounds inclusive), or
        ``np.nan`` when no segment contains ``x``.
    """
    seg_starts = asr_result['start_second'].values
    seg_ends = asr_result['end_second'].values
    matches = np.flatnonzero((x >= seg_starts) & (seg_ends >= x))
    if matches.shape[0] == 0:
        return np.nan
    # Segments may touch at a boundary; the earliest matching one wins.
    return matches[0]


# For every captioned second, store the row number of the speech segment covering it
# (NaN when nothing is being said at that second).
ic_result['index'] = ic_result['second'].apply(find_ind, args=(asr_result,))
ic_result.head(30)

Unnamed: 0,second,text,wc,cwc,merge_i,index
0,0,a person is taking a picture of themselves,8,8,0.0,0.0
1,1,a woman in a white dress is holding a,9,17,0.0,0.0
2,2,a white cat is laying on a bed,8,25,,0.0
3,3,a black and white photo of a clock on,9,34,,
4,4,a city at night with a lot of traffic,9,43,,
5,5,a city at night with a lot of traffic,9,52,,
6,6,a city at night with a lot of traffic,9,61,,
7,7,a city at night with a lot of traffic,9,70,1.0,1.0
8,8,a woman in a black shirt and black pants,9,79,1.0,1.0
9,9,a woman is brushing her teeth in the kitchen,9,88,1.0,1.0


In [144]:
# Size budget of one LLM request.
# NOTE(review): despite the name, this is compared against whitespace-separated word counts
# below, not against model tokens — confirm that the margin is sufficient.
max_tokens = 2000

# Left-join every caption row with the speech segment referenced by its 'index' column;
# rows without a segment keep NaN in the speech columns. Clashing column names get the
# _ic / _asr suffixes.
asr_with_row_number = asr_result.reset_index()
all_texts = ic_result.join(asr_with_row_number, on='index', how='left', lsuffix='_ic', rsuffix='_asr')

# Running total of caption words plus speech words (a missing segment counts as 0 words);
# the batch id is how many whole budgets have been consumed so far.
words_per_row = all_texts['wc_ic'] + all_texts['wc_asr'].fillna(0)
all_texts['wc_all'] = words_per_row.cumsum()
all_texts['batch'] = (all_texts['wc_all'] // max_tokens).astype(int)
all_texts

Unnamed: 0,second,text_ic,wc_ic,cwc_ic,merge_i,index_ic,index_asr,start_second,end_second,text_asr,wc_asr,cwc_asr,wc_all,batch
0,0,a person is taking a picture of themselves,8,8,0.0,0.0,0.0,0.0,2.0,I totally jacked up his bum.,6.0,6.0,14.0,0
1,1,a woman in a white dress is holding a,9,17,0.0,0.0,0.0,0.0,2.0,I totally jacked up his bum.,6.0,6.0,29.0,0
2,2,a white cat is laying on a bed,8,25,,0.0,0.0,0.0,2.0,I totally jacked up his bum.,6.0,6.0,43.0,0
3,3,a black and white photo of a clock on,9,34,,,,,,,,,52.0,0
4,4,a city at night with a lot of traffic,9,43,,,,,,,,,61.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461,461,a white dog with a black collar sitting on,9,4006,,,,,,,,,8596.0,4
462,462,a white dog wearing a white bow tie,8,4014,,,,,,,,,8604.0,4
463,463,a white polar bear laying on the ground,8,4022,,,,,,,,,8612.0,4
464,464,a white dog with a black and white stripe,9,4031,,,,,,,,,8621.0,4


In [149]:
# Number of rows of all_texts assigned to each batch id.
all_texts['batch'].value_counts()

batch
2    120
1    109
0     98
3     98
4     41
Name: count, dtype: int64

In [159]:
# Testing inference on batch
def create_batch_prompt(batch):
    """Build the summarization prompt for one batch of descriptions.

    Args:
        batch: DataFrame that is rendered as a plain-text table (without its
            index column) and embedded into the prompt.

    Returns:
        The prompt string: an instruction line, the rendered table, and the
        description of the expected ``Second Importance`` answer format.
    """
    template = """
        Summarize the video by description data
        
        {batch}
        
        Return summarization as table with columns: second, importance. importance should be from 0 to 1.
        Summarization should has following structure:
        Second Importance
        <start_second>  <importance>
        ....
        <end_second>   <importance>
        """
    rendered_table = batch.to_string(index=False)
    return template.format(batch=rendered_table)

# Query the LLM once per batch and collect the parsed prediction tables.
batch_tdfs = []
# groupby yields the batch ids that actually occur (in sorted order), so nothing is skipped
# or requested empty if the ids are ever not exactly 0..n-1.
for _, batch_rows in all_texts.groupby('batch'):
    batch_df = batch_rows[['second', 'text_ic', 'text_asr']].fillna("")
    # `columns=` is required: a bare mapping passed to rename() is applied to the index labels,
    # which left the column headers in the prompt unchanged.
    batch_df = batch_df.rename(columns={'text_ic': 'image description', 'text_asr': 'speech recognition'})
    print("Batch: start {}, end {}".format(batch_df['second'].min(), batch_df['second'].max()))
    batch_prompt = create_batch_prompt(batch_df)

    # Completion parameters (model, temperature, ...) come straight from the config.
    response = openai.Completion.create(prompt=batch_prompt, **cfg["llm"])
    response_text = response['choices'][0]['text']
    print("Response from gpt:", response_text)
    batch_tdfs.append(transform_predictions(response_text))

# One table with the predictions of all batches; each batch keeps its own row labels.
tdf = pd.concat(batch_tdfs)
tdf

Batch: start 0, end 97
Response from gpt: 
        0   1
        7   0.9
        16  0.8
        21  0.7
        25  0.6
        28  0.5
        31  0.4
        37  0.3
        41  0.2
        49  0.1
        59  0
        92  0
Batch: start 98, end 206
Response from gpt: 
        98   1.00
        199  0.50
        206  0.00
Batch: start 207, end 326
Response from gpt: 
        207  1.0
        215  0.8
        236  0.6
        241  0.4
        250  0.2
        288  0.0
Batch: start 327, end 424
Response from gpt: 
        327 0.7
        358 0.7
        385 0.7
        402 0.7
        421 0.7
        424 0.7
Batch: start 425, end 465
Response from gpt: 
        425 0.8
        435 0.8
        437 0.7
        438 0.7
        439 0.7
        440 0.7
        441 0.7
        442 0.7
        443 0.7
        446 0.7
        456 0.7
        457 0.7
        459 0.7
        460 0.7
        462 0.7
        463 0.7
        464 0.7
        465 0.7


Unnamed: 0,second,prob
0,0.0,1.0
1,7.0,0.9
2,16.0,0.8
3,21.0,0.7
4,25.0,0.6
5,28.0,0.5
6,31.0,0.4
7,37.0,0.3
8,41.0,0.2
9,49.0,0.1


In [162]:
# Combine the captioned seconds with the LLM predictions in `tdf` via the project helper fill_preds.
# NOTE(review): presumably this yields one probability per second of ic_result — confirm in fill_preds.
preds = fill_preds(ic_result, tdf)
preds

Unnamed: 0,second,prob
0,0,1.0
1,1,1.0
2,2,1.0
3,3,1.0
4,4,1.0
...,...,...
461,461,0.7
462,462,0.7
463,463,0.7
464,464,0.7


In [163]:
# Display the concatenated LLM predictions, i.e. the table that was passed to fill_preds.
tdf

Unnamed: 0,second,prob
0,0.0,1.0
1,7.0,0.9
2,16.0,0.8
3,21.0,0.7
4,25.0,0.6
5,28.0,0.5
6,31.0,0.4
7,37.0,0.3
8,41.0,0.2
9,49.0,0.1


In [197]:
# Video id = file name without its extension. splitext removes only the final extension,
# so a file stem that itself contains dots is kept intact (split(".")[0] would truncate it).
filename = os.path.splitext(os.path.basename(video_path))[0]

# Fail with a readable message when the video has no annotation row. KeyError is a
# LookupError, like the IndexError that `.values[0]` would raise on an empty selection.
video_labels = labels_df[labels_df['video_id'] == filename]
if video_labels.empty:
    raise KeyError("No annotation found for video_id {!r}".format(filename))
annotation = video_labels['annotation'].values[0]

# Project metric evaluated at threshold t=0.1.
result_metric = eval_metric(preds, annotation, t=0.1)
result_metric

0.9073632283369051

In [208]:
import numpy as np
from sklearn.metrics import f1_score


def spread_preds(preds, annotation):
    """Repeat every prediction row so the result can be compared with ``annotation``.

    Args:
        preds: DataFrame with ``second`` and ``prob`` columns; it is not modified.
        annotation: Array-like exposing ``shape``; only its length is used.

    Returns:
        NumPy array of ``prob`` values in which every row of ``preds`` appears
        ``len(annotation) // preds['second'].max()`` times, in row order.
    """
    # NOTE(review): the divisor is the largest second, not the number of rows; for
    # 0-based seconds that is one less than the row count — confirm this is intended.
    reps_per_row = annotation.shape[0] // preds['second'].max()
    with_reps = preds.assign(reps=reps_per_row)
    repeated_labels = with_reps.index.repeat(with_reps['reps'])
    return with_reps.loc[repeated_labels, 'prob'].values


def eval_metric_dummy(preds, annotation, t=0.5):
    """F1 score between thresholded annotation and thresholded predictions.

    Args:
        preds: DataFrame accepted by ``spread_preds``.
        annotation: Array of reference scores.
        t: Threshold; a value counts as positive when it is ``>= t``.

    Returns:
        ``f1_score`` (default settings) computed over the common prefix of the
        two sequences.
    """
    frame_probs = spread_preds(preds, annotation)
    # The two sequences may differ in length; only the shared prefix is scored.
    n_common = min(frame_probs.shape[0], annotation.shape[0])

    def binarize(scores):
        return np.where(scores[:n_common] >= t, 1, 0)

    return f1_score(binarize(annotation), binarize(frame_probs))

eval_metric_dummy(preds, annotation, t=0.1)

0.9073632283369051

In [231]:
def eval_metric_f1_canonical(preds, annotation, labels_count=5):
    """Micro-averaged F1 after bucketing annotation and predictions into discrete labels.

    Every score is mapped to an integer label in ``0..labels_count``: the number
    of thresholds ``1/labels_count, 2/labels_count, ..., 1`` that it reaches.

    labels_count must be specified separately (it is not inferred from the data).

    Args:
        preds: DataFrame accepted by ``spread_preds``.
        annotation: Array of reference scores.
        labels_count: Number of equally spaced thresholds in ``(0, 1]``.

    Returns:
        ``f1_score(..., average='micro')`` over the common prefix of both sequences.
    """
    labels = np.linspace(0, 1, labels_count + 1)[1:]
    post_preds = spread_preds(preds, annotation)

    same_len = min(post_preds.shape[0], annotation.shape[0])

    def to_label(scores):
        # Compare every score with every threshold at once instead of calling a Python
        # lambda per element through np.vectorize; this also works for empty input,
        # on which np.vectorize raises.
        return (np.asarray(scores)[..., np.newaxis] >= labels).sum(axis=-1)

    y_true = to_label(annotation[:same_len])
    y_pred = to_label(post_preds[:same_len])
    return f1_score(y_true, y_pred, average='micro')


eval_metric_f1_canonical(preds, annotation)

0.09695803419436015