In [34]:
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams, Distance
import re
from typing import List, Dict


# Load environment variables
load_dotenv()

QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "srt_subtitles")



In [35]:
# Function to parse .srt file
def parse_srt(file_path: str, movie_name: str, idx:int=0) -> List[Dict[str, str]]:
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    pattern = re.compile(r'(\d+)\s+(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\s+(.+?)(?=\n\d+\n|\Z)', re.DOTALL)
    matches = pattern.findall(content)
    subtitles = []
    for _, start, end, text in matches:
        subtitles.append({
            "index": int(idx),
            "start": start,
            "end": end,
            "movie_name": movie_name,
            "text": text.replace('\n', ' ').strip()
        })
        idx += 1
    return subtitles


SRT_PATH = "/Users/enkhbat_1/projects/ai-video-ge/movie-reels/srt_files/"
raw_srt_file = parse_srt(f"{SRT_PATH}Shawshank_Redemption_1.srt",movie_name="Shawshank Redemption")
print(raw_srt_file[1:10])  

[{'index': 1, 'start': '00:02:08,080', 'end': '00:02:12,240', 'movie_name': 'Shawshank Redemption', 'text': '...the confrontation you had with your wife the night she was murdered.'}, {'index': 2, 'start': '00:02:16,720', 'end': '00:02:18,320', 'movie_name': 'Shawshank Redemption', 'text': 'It was very bitter.'}, {'index': 3, 'start': '00:02:18,640', 'end': '00:02:22,760', 'movie_name': 'Shawshank Redemption', 'text': 'She said she was glad I knew, that she hated all the sneaking around.'}, {'index': 4, 'start': '00:02:24,280', 'end': '00:02:27,600', 'movie_name': 'Shawshank Redemption', 'text': 'And she said that she wanted a divorce in Reno.'}, {'index': 5, 'start': '00:02:27,840', 'end': '00:02:31,360', 'movie_name': 'Shawshank Redemption', 'text': '-What was your response? -I told her I would not grant one.'}, {'index': 6, 'start': '00:02:31,560', 'end': '00:02:34,600', 'movie_name': 'Shawshank Redemption', 'text': '"I\'ll see you in hell before I see you in Reno."'}, {'index': 7, 

In [36]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

def prepare_documents(raw_srt_file: List[dict]) -> List[Document]:
    """
    Convert raw SRT file data to LangChain Document objects.
    
    Args:
        raw_srt_file (list): List of dictionaries containing subtitle data.
        
    Returns:
        list: List of LangChain Document objects.
    """

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=100, chunk_overlap=10, add_start_index=True
    )
    return text_splitter.split_documents([
        Document(
            page_content=sub["text"],
            metadata={
                "index": sub["index"],
                "start": sub["start"],
                "end": sub["end"],
                "movie_name": sub["movie_name"]
            }
        )
        for sub in raw_srt_file
    ])

# Convert raw_srt_file to LangChain Document objects
all_splits = prepare_documents(raw_srt_file)

len(all_splits)  # Total number of splits

930

In [37]:
from langchain_huggingface import HuggingFaceEmbeddings



embeddings = HuggingFaceEmbeddings(model_name="Qwen/Qwen3-Embedding-0.6B",
                                   cache_folder= "/Users/enkhbat_1/projects/ai-video-ge/cache-models/")


In [38]:
len(embeddings.embed_query(all_splits[2].page_content))  # Example embedding for the first split

1024

In [39]:

client = QdrantClient(
    url=QDRANT_URL,  # Use QDRANT_URL from environment variables
    api_key=QDRANT_API_KEY
)

In [None]:
from langchain_qdrant import QdrantVectorStore
from langchain_qdrant import RetrievalMode
vector_store = QdrantVectorStore.from_documents(
    all_splits,embeddings,
    url = QDRANT_URL,
    api_key = QDRANT_API_KEY,
    collection_name="srt_subtitles",
    distance=Distance.COSINE , # or Distance.EUCLID
    retrieval_mode=RetrievalMode.DENSE
)

In [None]:
from pprint import pprint

retrieve = client.query_points(
    collection_name="srt_subtitles",
    query=embeddings.embed_query("And for the briefest of moments..."),
    limit=3
)
pprint(retrieve.dict())

{'points': [{'id': '0028713f-62cc-4215-8bed-c97f2d004727',
             'order_value': None,
             'payload': {'metadata': {'end': '01:07:14,840',
                                      'index': 882,
                                      'movie_name': 'Shawshank Redemption',
                                      'start': '01:07:12,720',
                                      'start_index': 0},
                         'page_content': 'And for the briefest of moments...'},
             'score': 1.0,
             'shard_key': None,
             'vector': None,
             'version': 28},
            {'id': '5efa9f07-0c09-4b23-8bd0-192b0201ca8a',
             'order_value': None,
             'payload': {'metadata': {'end': '01:07:14,840',
                                      'index': 882,
                                      'movie_name': 'Shawshank Redemption',
                                      'start': '01:07:12,720',
                                      'start_index': 0},

/var/folders/3y/3k3hgv3j1nd15tryq2p3kdgh0000gn/T/ipykernel_56614/3378288144.py:8: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  pprint(retrieve.dict())


In [None]:

# Perform a similarity search

query = "And for the briefest of moments..."
found_docs = vector_store.similarity_search(query, k=3)
pprint(found_docs)

[Document(metadata={'index': 882, 'start': '01:07:12,720', 'end': '01:07:14,840', 'movie_name': 'Shawshank Redemption', 'start_index': 0, '_id': '0028713f-62cc-4215-8bed-c97f2d004727', '_collection_name': 'srt_subtitles'}, page_content='And for the briefest of moments...'),
 Document(metadata={'index': 882, 'start': '01:07:12,720', 'end': '01:07:14,840', 'movie_name': 'Shawshank Redemption', 'start_index': 0, '_id': '5efa9f07-0c09-4b23-8bd0-192b0201ca8a', '_collection_name': 'srt_subtitles'}, page_content='And for the briefest of moments...'),
 Document(metadata={'index': 497, 'start': '00:37:18,560', 'end': '00:37:20,640', 'movie_name': 'Shawshank Redemption', 'start_index': 0, '_id': '6edd0830-af42-4f2a-88b5-ec33e42820f4', '_collection_name': 'srt_subtitles'}, page_content='...if only for a short while.')]


In [None]:
### Chat with Gemini model
import os

from langchain.chat_models import init_chat_model

# Use Gemini instead of Gemma for system prompt support
model = init_chat_model(model="gemini-1.5-flash",
                         model_provider="google_genai", 
                         api_key=os.environ["GEMINI_API_KEY"])


from langchain_core.messages import HumanMessage

# For Gemma models, combine system instruction with human message
messages = [
    HumanMessage("Translate the following from English into Italian: hi!"),
]

model.invoke(messages).content
