In [1]:
import glob
import os
from itertools import chain
import pandas as pd
import csv


# Merge the KeyMap

In [2]:
# Glob pattern(s) matching the per-video keyframe-map CSV files.
map_keyframe_list = ['map-keyframes/*.csv']
file_list_list = [glob.glob(folder_path) for folder_path in map_keyframe_list]
file_list = list(chain(*file_list_list))

# Keep only the file name. os.path.basename works for any directory depth and
# for the platform's path separator, whereas splitting on '/' and taking
# index 1 fails on backslash-separated paths and on nested folders.
csv_files = [os.path.basename(file) for file in file_list]
print(csv_files[:3])
print(len(csv_files))

# Counts noted during exploration:
# 738 files in total
# Batch 1: 299
# Batch 2: 439
# TODO: open question about the frame index (noted as "Frame index ???")

['L04_V023.csv', 'L05_V023.csv', 'L19_V026.csv']
738


In [5]:
# Scratch check: the ':04' format spec zero-pads an integer to width 4.
f'{5:04}'

'0005'

In [7]:
# Scratch check: build a keyframe image path with a zero-padded 4-digit number.
"keyframes/L01_V001/{:04}.jpg".format(12)

'keyframes/L01_V001/0012.jpg'

In [13]:
# Collect one record per row of every keyframe-map CSV.
merged_data = []
for csv_file in csv_files:
    # The file name without its extension is used as the video name.
    video_name = os.path.splitext(csv_file)[0]
    csv_file_path = os.path.join('map-keyframes', csv_file)

    # newline='' leaves newline handling to the csv module, as its
    # documentation asks for any file object handed to a reader.
    with open(csv_file_path, 'r', newline='') as file:
        for row in csv.DictReader(file):
            # 'n' is zero-padded to 4 digits to form the image file name
            # under keyframes/<video_name>/.
            n = int(row['n'])
            merged_data.append({
                'video_name': video_name,
                'frame_idx': row['frame_idx'],  # kept as the raw CSV string
                'n': n,
                'path': "keyframes/{}/{:04}.jpg".format(video_name, n),
            })

In [14]:
# Persist every collected keyframe record to a single CSV file.
output_csv_path = 'merged_keyframes.csv'
fieldnames = ['video_name', 'frame_idx', 'n', 'path']

with open(output_csv_path, 'w', newline='') as output_file:
    csv_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
    csv_writer.writeheader()
    # One output row per collected record, in collection order.
    for record in merged_data:
        csv_writer.writerow(record)

print(f'Merged CSV file saved to {output_csv_path}')

Merged CSV file saved to merged_keyframes.csv


In [6]:
df = pd.read_csv('merged_keyframes.csv')
# Keep only the rows whose video name starts with 'L01_'. A literal prefix
# test is used instead of str.contains, which interprets the pattern as a
# regular expression and matches it anywhere in the name.
filtered_df = df[df['video_name'].str.startswith('L01_')]
len(filtered_df)
# Optional export of the subset (disabled):
# filtered_df.to_csv('merged_keyframes1.csv', index=False)

7658

In [15]:
# Reload the merged table, order it by video name and frame index, and
# write it back over the same file.
df = pd.read_csv('merged_keyframes.csv')
print(df.shape[0])

sort_keys = ['video_name', 'frame_idx']
df = df.sort_values(by=sort_keys)
print(df.head())

df.to_csv('merged_keyframes.csv', index=False)

# 100 / 202148 = low
# 202,148 rows in total

202148
    video_name  frame_idx  n                         path
961   L01_V001          0  1  keyframes/L01_V001/0001.jpg
962   L01_V001        100  2  keyframes/L01_V001/0002.jpg
963   L01_V001        271  3  keyframes/L01_V001/0003.jpg
964   L01_V001        335  4  keyframes/L01_V001/0004.jpg
965   L01_V001        346  5  keyframes/L01_V001/0005.jpg


In [7]:
# Back-of-envelope scaling: 400.96 (presumably a measured duration in seconds
# for the 7658 'L01_' rows; TODO confirm) extrapolated to all 202148 rows.
400.962482213974/7658*202148

10584.194809949127

In [8]:
# Convert the previous cell's result (assumed to be seconds) to hours.
10584.194809949127 / 60 / 60

2.9400541138747576

# Random Solution

In [10]:
df = pd.read_csv('merged_keyframes.csv')

# Random baseline: one file of 100 randomly chosen keyframes per query index.
# The seed is offset by the query index so the run stays reproducible while
# each query gets its own sample. With a fixed random_state=42 inside the
# loop, every file received the identical 100 rows.
for i in range(10, 12):
    query_df = df.sample(n=100, random_state=42 + i)
    query_df.to_csv(f'query-p1-{i}.csv', index=False, header=False)

# More advanced approach

In [3]:
# Load the merged keyframe table and preview its first five rows.
df = pd.read_csv('merged_keyframes.csv')
df.head()

Unnamed: 0,video_name,frame_idx,n,path
0,L01_V001,0,1,keyframes/L01_V001/0001.jpg
1,L01_V001,100,2,keyframes/L01_V001/0002.jpg
2,L01_V001,271,3,keyframes/L01_V001/0003.jpg
3,L01_V001,335,4,keyframes/L01_V001/0004.jpg
4,L01_V001,346,5,keyframes/L01_V001/0005.jpg


In [None]:
# 236
# 249

In [4]:
import torch
from PIL import Image

from lavis.models import load_model_and_preprocess
from lavis.processors import load_processor

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Load the "large" BLIP image-text matching model with is_eval=True, together
# with its image and text preprocessors.
model, vis_processors, text_processors = load_model_and_preprocess(
    "blip_image_text_matching",
    "large",
    device=device,
    is_eval=True,
)
# Alternative model configurations (disabled):
# model, vis_processors, text_processors = load_model_and_preprocess("blip_image_text_matching", "base", device=device, is_eval=True)
# model, vis_processors, text_processors = load_model_and_preprocess("blip2_image_text_matching", "pretrain", device=device, is_eval=True)
# model, vis_processors, text_processors = load_model_and_preprocess("blip2_image_text_matching", "coco", device=device, is_eval=True)

def get_score(path: str, caption: str) -> float:
    """Score how well an image matches a caption with the loaded ITM model.

    Args:
        path: Path of the image file to open.
        caption: Text to match against the image.

    Returns:
        The softmax probability at index 1 of the model's "itm" head output,
        as a Python float (presumably the "match" class; confirm against the
        LAVIS documentation).
    """
    # Close the file handle as soon as the pixels are converted; convert()
    # returns a new image that no longer depends on the open file.
    with Image.open(path) as image_file:
        raw_image = image_file.convert("RGB")
    img = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
    txt = text_processors["eval"](caption)

    # Inference only: disabling autograd avoids building a graph for every
    # image, which matters when this is applied to the whole keyframe table.
    with torch.no_grad():
        itm_output = model({"image": img, "text_input": txt}, match_head="itm")
        itm_scores = torch.nn.functional.softmax(itm_output, dim=1)

    return itm_scores[:, 1].item()

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
caption = "The video clip shows a woman wearing a yellow shirt disposing of trash into a garbage bin. The garbage bin is dark green, and its lid is red. The trash being placed into the bin appears to be 1 kilogram of baby spinach."
# Score every keyframe image against the caption (one model call per row).
df['score'] = df['path'].apply(lambda path: get_score(path, caption))

# Best-matching keyframes first.
result = df.sort_values(by='score', ascending=False)
print(result.head(10))

# to_csv needs a file path: 'submission/' names a directory, so the write
# failed, and both outputs targeted that same path. The file names below are
# placeholders derived from the query number; adjust them to the required
# submission naming.
os.makedirs('submission', exist_ok=True)
result.to_csv('submission/query-1-scores.csv', index=False)
result[['video_name', 'frame_idx']].to_csv('submission/query-1.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['score'] = tmp['path'].apply(lambda path: get_score(path, caption))


In [None]:
import argparse

parser = argparse.ArgumentParser(description="Read caption from file")
parser.add_argument("--caption", help="Path argument of caption")
# parse_known_args ignores arguments this parser does not declare. Inside a
# Jupyter kernel sys.argv carries the kernel's own flags (for example
# "-f <connection file>"), which make parse_args() exit with an error.
args, _unknown_args = parser.parse_known_args()

# None when --caption was not supplied on the command line.
caption_path = args.caption

In [None]:
# 1 folder queries

In [12]:
# Read the query text. The encoding is stated explicitly because the query
# contains Vietnamese characters and the platform default encoding is not
# always UTF-8 (assumes the file itself is stored as UTF-8).
with open('queries/query-1.txt', "r", encoding="utf-8") as file:
    file_contents = file.read()

In [13]:
# Display the query text that was read from the file.
file_contents

'Đoạn video về một người phụ nữ mặc áo màu vàng đang bỏ rác vào thùng rác. Thùng rác màu xanh lá đậm và nắp thùng màu đỏ. Rác đang bỏ vào thùng cho biết đó là 1kg baby spinach.'