In [8]:
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from langchain_core.documents import Document
from qdrant_client.http.models import PointStruct, VectorParams, Distance
import re
from pprint import pprint
from typing import List, Dict


# Load environment variables
load_dotenv()

QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "srt_subtitles")



In [9]:
# Function to parse .srt file
def parse_srt(file_path: str, movie_name: str, idx:int=0) -> List[Dict[str, str]]:
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    pattern = re.compile(r'(\d+)\s+(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\s+(.+?)(?=\n\d+\n|\Z)', re.DOTALL)
    matches = pattern.findall(content)
    subtitles = []
    for _, start, end, text in matches:
        subtitles.append({
            "index": int(idx),
            "start": start,
            "end": end,
            "movie_name": movie_name,
            "text": text.replace('\n', ' ').strip()
        })
        idx += 1
    return subtitles


SRT_PATH = "/Users/enkhbat_1/projects/ai-video-ge/movie-reels/srt_files/"
raw_srt_file = parse_srt(f"{SRT_PATH}Shawshank_Redemption_1.srt",movie_name="Shawshank Redemption")
print(raw_srt_file[1:10])  

[{'index': 1, 'start': '00:02:08,080', 'end': '00:02:12,240', 'movie_name': 'Shawshank Redemption', 'text': '...the confrontation you had with your wife the night she was murdered.'}, {'index': 2, 'start': '00:02:16,720', 'end': '00:02:18,320', 'movie_name': 'Shawshank Redemption', 'text': 'It was very bitter.'}, {'index': 3, 'start': '00:02:18,640', 'end': '00:02:22,760', 'movie_name': 'Shawshank Redemption', 'text': 'She said she was glad I knew, that she hated all the sneaking around.'}, {'index': 4, 'start': '00:02:24,280', 'end': '00:02:27,600', 'movie_name': 'Shawshank Redemption', 'text': 'And she said that she wanted a divorce in Reno.'}, {'index': 5, 'start': '00:02:27,840', 'end': '00:02:31,360', 'movie_name': 'Shawshank Redemption', 'text': '-What was your response? -I told her I would not grant one.'}, {'index': 6, 'start': '00:02:31,560', 'end': '00:02:34,600', 'movie_name': 'Shawshank Redemption', 'text': '"I\'ll see you in hell before I see you in Reno."'}, {'index': 7, 

In [None]:
def concatenate_subtitles_by_chunks(subtitles, chunk_size=3):
    """
    Concatenate subtitles into chunks of specified size
    
    Args:
        subtitles: List of subtitle dictionaries
        chunk_size: Number of subtitles to combine in each chunk
    
    Returns:
        List of concatenated subtitle chunks
    """
    def time_to_seconds(time_str):
        """Convert 'HH:MM:SS,mmm' format to seconds"""
        time_part, ms = time_str.split(',')
        h, m, s = map(int, time_part.split(':'))
        return h * 3600 + m * 60 + s + int(ms) / 1000
    
    def seconds_to_time(seconds):
        """Convert seconds back to 'HH:MM:SS,mmm' format"""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        ms = int((seconds % 1) * 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
    
    concatenated_subtitles = []
    
    # Process subtitles in chunks
    for i in range(0, len(subtitles), chunk_size):
        chunk = subtitles[i:i + chunk_size]
        
        if not chunk:
            continue
            
        # Combine text from all subtitles in chunk
        combined_text = ' '.join(subtitle['text'] for subtitle in chunk)
        
        # Get start and end times
        start_time = chunk[0]['start']
        end_time = chunk[-1]['end']
        
        # Calculate duration
        start_seconds = time_to_seconds(start_time)
        end_seconds = time_to_seconds(end_time)
        total_duration = end_seconds - start_seconds
        
        # Create combined subtitle entry
        combined_subtitle = {
            'index': len(concatenated_subtitles),
            'start': start_time,
            'end': end_time,
            'movie_name': chunk[0]['movie_name'],
            'text': combined_text,
            'duration_seconds': round(total_duration, 3),
            'duration_minutes': round(total_duration / 60, 3),
            'original_count': len(chunk)
        }
        
        concatenated_subtitles.append(combined_subtitle)
    
    return concatenated_subtitles

 # Combine subtitles into chunks of 10
combined_srt = concatenate_subtitles_by_chunks(raw_srt_file, chunk_size=10)
pprint(combined_srt[0])

{'duration_minutes': 0.625,
 'duration_seconds': 37.48,
 'end': '00:02:43,480',
 'index': 0,
 'movie_name': 'Shawshank Redemption',
 'original_count': 10,
 'start': '00:02:06,000',
 'text': 'Mr. Dufresne, describe... ...the confrontation you had with your '
         'wife the night she was murdered. It was very bitter. She said she '
         'was glad I knew, that she hated all the sneaking around. And she '
         'said that she wanted a divorce in Reno. -What was your response? -I '
         'told her I would not grant one. "I\'ll see you in hell before I see '
         'you in Reno." Those were your words, according to your neighbors. If '
         "they say so. I really don't remember. I was upset."}
{'end': '00:02:07,880',
 'index': 0,
 'movie_name': 'Shawshank Redemption',
 'start': '00:02:06,000',
 'text': 'Mr. Dufresne, describe...'}


In [27]:

from langchain_text_splitters import RecursiveCharacterTextSplitter


def prepare_documents(raw_srt_file: List[dict]) -> List[Document]:
    """
    Convert raw SRT file data to LangChain Document objects.
    
    Args:
        raw_srt_file (list): List of dictionaries containing subtitle data.
        
    Returns:
        list: List of LangChain Document objects.
    """

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=100, chunk_overlap=10, add_start_index=True
    )
    return text_splitter.split_documents([
        Document(page_content=sub["text"],metadata=sub)
        for sub in raw_srt_file
    ])

# Convert raw_srt_file to LangChain Document objects
all_splits = prepare_documents(combined_srt)

len(all_splits)  # Total number of splits

print(all_splits[0:3])  # Print the content of the first split

[Document(metadata={'index': 0, 'start': '00:02:06,000', 'end': '00:02:43,480', 'movie_name': 'Shawshank Redemption', 'text': 'Mr. Dufresne, describe... ...the confrontation you had with your wife the night she was murdered. It was very bitter. She said she was glad I knew, that she hated all the sneaking around. And she said that she wanted a divorce in Reno. -What was your response? -I told her I would not grant one. "I\'ll see you in hell before I see you in Reno." Those were your words, according to your neighbors. If they say so. I really don\'t remember. I was upset.', 'duration_seconds': 37.48, 'duration_minutes': 0.625, 'original_count': 10, 'start_index': 0}, page_content='Mr. Dufresne, describe... ...the confrontation you had with your wife the night she was murdered. It'), Document(metadata={'index': 0, 'start': '00:02:06,000', 'end': '00:02:43,480', 'movie_name': 'Shawshank Redemption', 'text': 'Mr. Dufresne, describe... ...the confrontation you had with your wife the night

In [18]:
from langchain_huggingface import HuggingFaceEmbeddings



embeddings = HuggingFaceEmbeddings(model_name="Qwen/Qwen3-Embedding-0.6B",
                                   cache_folder= "/Users/enkhbat_1/projects/ai-video-ge/cache-models/")


  from .autonotebook import tqdm as notebook_tqdm


In [28]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from qdrant_client.models import NamedSparseVector, SparseVector

def vectorize_documents(documents: List[Document], embedding_model:HuggingFaceEmbeddings,labels:List[str]) -> List[PointStruct]:
    """
    Convert LangChain Document objects to Qdrant PointStructs with embeddings.

    Args:
        documents (list): List of LangChain Document objects.
        embedding_model: Embedding model to generate vector embeddings.
        labels (list): List of labels for the documents.

    Returns:
        list: List of PointStructs with embeddings.
    """
    label_embeddings = {label: embedding_model.embed_query(label) for label in labels}
    points = []
    for doc in documents:
        dense_vector = embedding_model.embed_query(doc.page_content)
        sparse_vector = {
            label: float(cosine_similarity(np.asarray(dense_vector).reshape(1, -1), np.asarray(label_embeddings[label]).reshape(1, -1))[0, 0]) for label in labels
        }
        
        points.append(
            {
                "id": doc.metadata["index"],
                "vector": dense_vector,
                "sparse_vector": sparse_vector,
                "payload": {
                    "start": doc.metadata["start"],
                    "end": doc.metadata["end"],
                    "movie_name": doc.metadata["movie_name"],
                    "text": doc.page_content
                }
            }
        )
    return points

vectorized_points = vectorize_documents(all_splits, embeddings, labels=["action", "motivational", "comedy","scary","si-fi","romantic","drama","thriller"])

# print(embeddings.embed_query("This is a test sentence.").reshape(1,-1))  # Example embedding
# print(vectorized_points[0])  # Print the first vectorized point

In [30]:
filtered_points = list(filter(lambda x: x['sparse_vector']['comedy']>0.5, vectorized_points)) 
print(filtered_points[0]['payload']) # Filter points with sparse_vector > 0.7

{'start': '00:08:05,440', 'end': '00:08:30,600', 'movie_name': 'Shawshank Redemption', 'text': 'Damn near anything within reason.'}


In [38]:
len(embeddings.embed_query(all_splits[2].page_content))  # Example embedding for the first split

1024

In [None]:

client = QdrantClient(
    url=QDRANT_URL,  # Use QDRANT_URL from environment variables
    api_key=QDRANT_API_KEY
)


# client.delete_collection(QDRANT_COLLECTION)  # Delete the collection if it exists

True

In [None]:
from langchain_qdrant import QdrantVectorStore
from langchain_qdrant import RetrievalMode

# Insert the documents into Qdrant
vector_store = QdrantVectorStore.from_documents(
    all_splits,embeddings,
    url = QDRANT_URL,
    api_key = QDRANT_API_KEY,
    collection_name="srt_subtitles",
    retrieval_mode=RetrievalMode.DENSE
)

ValueError: 'sparse_embedding' cannot be None when retrieval mode is 'sparse'

In [None]:

from qdrant_client import models

# Perform a similarity search
retrieve = client.query_points(
    collection_name="srt_subtitles",
    query=embeddings.embed_query("Earn money"),
    limit=3,
     search_params=models.SearchParams(hnsw_ef=128, exact=False), # Explore 128 values before returning results
)
pprint(retrieve.points)

[ScoredPoint(id='e779bacb-cb85-4694-81c5-e3a156769110', version=20, score=0.6050934, payload={'page_content': 'Waste of money, if you ask me.', 'metadata': {'index': 340, 'start': '00:26:43,600', 'end': '00:26:45,560', 'movie_name': 'Shawshank Redemption', 'start_index': 0}}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id='cc523f6f-9089-4418-8880-3c9b8d17152b', version=5, score=0.6050934, payload={'page_content': 'Waste of money, if you ask me.', 'metadata': {'index': 340, 'start': '00:26:43,600', 'end': '00:26:45,560', 'movie_name': 'Shawshank Redemption', 'start_index': 0}}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id='c9ee01a2-0442-46e8-ae51-fbb6706394f5', version=6, score=0.6033416, payload={'page_content': 'A million bucks?', 'metadata': {'index': 420, 'start': '00:33:07,120', 'end': '00:33:08,920', 'movie_name': 'Shawshank Redemption', 'start_index': 0}}, vector=None, shard_key=None, order_value=None)]


In [68]:

# Perform a similarity search

query = "Earn money"
found_docs = vector_store.similarity_search(query, k=3)
pprint(found_docs)

[Document(metadata={'index': 340, 'start': '00:26:43,600', 'end': '00:26:45,560', 'movie_name': 'Shawshank Redemption', 'start_index': 0, '_id': 'e779bacb-cb85-4694-81c5-e3a156769110', '_collection_name': 'srt_subtitles'}, page_content='Waste of money, if you ask me.'),
 Document(metadata={'index': 340, 'start': '00:26:43,600', 'end': '00:26:45,560', 'movie_name': 'Shawshank Redemption', 'start_index': 0, '_id': 'cc523f6f-9089-4418-8880-3c9b8d17152b', '_collection_name': 'srt_subtitles'}, page_content='Waste of money, if you ask me.'),
 Document(metadata={'index': 420, 'start': '00:33:07,120', 'end': '00:33:08,920', 'movie_name': 'Shawshank Redemption', 'start_index': 0, '_id': 'c9ee01a2-0442-46e8-ae51-fbb6706394f5', '_collection_name': 'srt_subtitles'}, page_content='A million bucks?')]


In [None]:
### Chat with Gemini model
import os

from langchain.chat_models import init_chat_model

# Use Gemini instead of Gemma for system prompt support
model = init_chat_model(model="gemini-1.5-flash",
                         model_provider="google_genai", 
                         api_key=os.environ["GEMINI_API_KEY"])


from langchain_core.messages import HumanMessage

# For Gemma models, combine system instruction with human message
messages = [
    HumanMessage("Translate the following from English into Italian: hi!"),
]

model.invoke(messages).content
