In [4]:
import os
import re
from typing import Optional

import numpy as np
import pandas as pd
import psycopg2
from PIL import Image
from pymilvus import MilvusClient
import torch
from transformers import AutoProcessor as HF_AutoProcessor, BlipForQuestionAnswering

# ================== Config ==================
# PostgreSQL connection settings. Each value can be overridden with a
# VIDEO_DB_* environment variable so real credentials do not have to be
# written into the notebook; the fallbacks are the local development values.
DB_PARAMS = {
    "dbname": os.environ.get("VIDEO_DB_NAME", "video_frame"),
    "user": os.environ.get("VIDEO_DB_USER", "postgres"),
    "password": os.environ.get("VIDEO_DB_PASSWORD", "123"),
    "host": os.environ.get("VIDEO_DB_HOST", "localhost"),
    "port": os.environ.get("VIDEO_DB_PORT", "5432"),
}

# Data locations on the local machine.
CSV_DIR   = r"D:\Big_project_2025\Video_Similarity_Search\data\csv"
FRAME_DIR = r"D:\Big_project_2025\Video_Similarity_Search\data\key_frame"
VIDEO_DIR = r"D:\Big_project_2025\Video_Similarity_Search\data\video"
MODEL_DIR = r"D:\Big_project_2025\huggingface_cache"

# ================== DB + Milvus connections ==================
# `conn` / `cur` are shared module-level handles used by the query helpers.
conn = psycopg2.connect(**DB_PARAMS)
cur = conn.cursor()
client = MilvusClient(uri="http://localhost:19530")


In [5]:
def hhmmss_to_seconds(s: str) -> Optional[float]:
    """Convert a time expression to a number of seconds.

    Accepted forms (case-insensitive, surrounding whitespace ignored):

    * ``MM:SS`` or ``HH:MM:SS`` -- each field may be fractional.
    * A number followed by a seconds unit (``s``, ``sec``, ``secs``,
      ``second``, ``seconds`` or ``giây``), e.g. ``"90s"`` or ``"12.5 giây"``.

    Args:
        s: Text containing the time expression.

    Returns:
        The time in seconds, or ``None`` when no supported form is found.
    """
    s = s.strip().lower()
    if ":" in s:
        try:
            parts = [float(x) for x in s.split(":")]
        except ValueError:
            # A malformed field ("12:", "a:b") is not a timestamp; fall
            # through to the unit-based form instead of raising.
            parts = []
        if len(parts) == 2:
            return parts[0] * 60 + parts[1]
        if len(parts) == 3:
            return parts[0] * 3600 + parts[1] * 60 + parts[2]
    # The trailing \b makes the unit a whole word, so "5 students" is not
    # read as "5 s". Longer unit spellings are listed before the bare "s".
    m = re.search(r"(\d+(?:\.\d+)?)\s*(?:seconds?|secs?|giây|s)\b", s)
    if m:
        return float(m.group(1))
    return None

def extract_time_from_question(q: str) -> Optional[float]:
    """Find a time reference inside a free-text question.

    A clock-style timestamp (``M:SS``, ``MM:SS`` or ``HH:MM:SS``) takes
    precedence and is converted with ``hhmmss_to_seconds``. Otherwise the
    first number followed by a seconds unit (``s``, ``sec``, ``secs``,
    ``second``, ``seconds`` or ``giây``) is used.

    Args:
        q: The question text.

    Returns:
        The referenced time in seconds, or ``None`` if the question does not
        mention a time in a supported form.
    """
    q_low = q.lower()
    m_ts = re.search(r"(\d{1,2}:\d{2}(:\d{2})?)", q_low)
    if m_ts:
        return hhmmss_to_seconds(m_ts.group(1))
    # The trailing \b requires the unit to be a whole word; without it any
    # number followed by a word starting with "s" ("5 students", "3 sáng")
    # was misread as a number of seconds.
    m_s = re.search(r"(\d+(?:\.\d+)?)\s*(?:seconds?|secs?|giây|s)\b", q_low)
    if m_s:
        return float(m_s.group(1))
    return None

def load_video_csv(video_name: str):
    """Read ``<CSV_DIR>/<video_name>.csv`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(CSV_DIR, f"{video_name}.csv"))

def seconds_to_frame_idx(video_name: str, seconds_value: float) -> int:
    """Map a timestamp in seconds to a frame index for ``video_name``.

    The video's CSV is loaded, ordered by ``pts_time``, and the frame index
    is linearly interpolated between the listed rows, then rounded to an
    integer.
    """
    table = load_video_csv(video_name)
    table = table.sort_values("pts_time").reset_index(drop=True)
    interpolated = np.interp(seconds_value, table["pts_time"], table["frame_idx"])
    return int(round(interpolated))

def frame_idx_to_seconds(video_name: str, frame_idx: int) -> float:
    """Map a frame index to a timestamp in seconds for ``video_name``.

    The video's CSV is loaded, ordered by ``frame_idx``, and ``pts_time`` is
    linearly interpolated between the listed rows.
    """
    table = load_video_csv(video_name)
    table = table.sort_values("frame_idx").reset_index(drop=True)
    interpolated = np.interp(frame_idx, table["frame_idx"], table["pts_time"])
    return float(interpolated)

def get_frame_image_path(video_name: str, frame_idx: int):
    """Return the stored frame image path closest in time to ``frame_idx``.

    The frame index is first converted to a timestamp with
    ``frame_idx_to_seconds``. ``frame_mappings`` (joined to ``videos``) is then
    searched for the row whose ``pts_time`` is nearest to that timestamp among
    videos whose ``video_path`` contains ``video_name``.

    Args:
        video_name: Text that must occur literally in ``videos.video_path``.
        frame_idx: Frame index within the video.

    Returns:
        The ``frame_path`` of the nearest row, or ``None`` if no row matches.
    """
    target_sec = frame_idx_to_seconds(video_name, frame_idx)
    # Escape LIKE metacharacters so the name is matched literally: otherwise
    # the "_" in a name such as "L01_V001" matches any single character and
    # "%" matches anything. Backslash is PostgreSQL's default LIKE escape
    # character, and it is escaped first so the added backslashes survive.
    literal_name = (
        video_name.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    cur.execute("""
        SELECT fm.frame_path, ABS(fm.pts_time - %s) AS diff
        FROM frame_mappings fm
        JOIN videos v ON fm.video_id = v.id
        WHERE v.video_path LIKE %s
        ORDER BY diff ASC
        LIMIT 1
    """, (target_sec, f"%{literal_name}%"))
    row = cur.fetchone()
    if row:
        return row[0]
    return None


In [7]:
# ========== Cell 3: Load BLIP VQA Base (CPU tối ưu) ==========
import torch
from transformers import BlipForQuestionAnswering, AutoProcessor
from PIL import Image

# Always run on the CPU on this machine.
device = "cpu"
print(f"👉 Đang chạy trên: {device.upper()}")

# Hugging Face model id and the local cache directory for its files.
MODEL_NAME = "Salesforce/blip-vqa-base"
MODEL_DIR = r"D:\Big_project_2025\huggingface_cache"

# Processor used to prepare inputs and decode generated answers.
vqa_processor = AutoProcessor.from_pretrained(MODEL_NAME, cache_dir=MODEL_DIR)

# Model, loaded with memory-saving options:
#   - float32 is the recommended dtype on CPU
#   - low_cpu_mem_usage reduces memory use while loading
vqa_model = BlipForQuestionAnswering.from_pretrained(
    MODEL_NAME,
    cache_dir=MODEL_DIR,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
)
vqa_model = vqa_model.to(device)

print("✅ Load thành công BLIP-VQA-BASE (CPU mode)")

# ========== VQA inference helper ==========
def run_vqa(image_path: str, question: str) -> str:
    """Answer a question about an image with the loaded BLIP VQA model.

    Args:
        image_path: Path to the image (frame) file.
        question: The question to ask about the image.

    Returns:
        The generated answer, decoded with special tokens removed.
    """
    # Image.open keeps the file handle open until the image is closed.
    # convert() loads the pixels into a new image, so the handle can be
    # released as soon as the conversion is done.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    inputs = vqa_processor(image, question, return_tensors="pt").to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        out = vqa_model.generate(**inputs)
    answer = vqa_processor.decode(out[0], skip_special_tokens=True)

    return answer


👉 Đang chạy trên: CPU


OSError: The paging file is too small for this operation to complete. (os error 1455)