In [13]:
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from langchain_core.documents import Document
from qdrant_client.http.models import PointStruct, VectorParams, Distance
import re
from pprint import pprint
from typing import List, Dict


# Load environment variables
load_dotenv()

QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "srt_subtitles")



In [None]:
def get_files_matching_regex(directory: str, pattern: str) -> List[str]:
    """
    Get all file names in a directory that match a given regex pattern.
    
    Args:
        directory (str): Path to the directory to search
        pattern (str): Regex pattern to match filenames
        
    Returns:
        List[str]: List of matching file paths
    """
    compiled_pattern = re.compile(pattern)
    matching_files = []
    
    try:
        for filename in os.listdir(directory):
            if compiled_pattern.match(filename):
                matching_files.append(os.path.join(directory, filename))
    except FileNotFoundError:
        print(f"Directory not found: {directory}")
    except PermissionError:
        print(f"Permission denied accessing: {directory}")
    
    return matching_files


def clean_html_tags(text: str) -> str:
    """Remove HTML tags from text"""
    html_pattern = re.compile(r'<[^>]+>')
    return html_pattern.sub('', text)


# Function to parse .srt file
def parse_srt(file_path: str, movie_name: str, idx:int=0) -> List[Dict[str, str]]:
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    pattern = re.compile(r'(\d+)\s+(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\s+(.+?)(?=\n\d+\n|\Z)', re.DOTALL)
    matches = pattern.findall(content)
    subtitles = []
    for _, start, end, text in matches:
        subtitles.append({
            "index": int(idx),
            "start": start,
            "end": end,
            "movie_name": movie_name,
            "text": clean_html_tags(text).replace('\n', ' ').strip(),
            "file_name": os.path.basename(file_path)
        })
        idx += 1
    return subtitles


SRT_PATH = "/Users/enkhbat_1/projects/ai-video-ge/movie-reels/srt_files/"

serial_name = "Buddha"
# Get all SRT files matching the regex for the serial
file_names = get_files_matching_regex(SRT_PATH, r"Buddha Episode.*\.srt")


raw_srt_file = []
for file_name in file_names:
    movie_name = serial_name
    subtitles = parse_srt(file_name, movie_name)
    raw_srt_file.extend(subtitles)

print(len(raw_srt_file))





27200


In [57]:
def concat_subtitles_by_chunks(subtitles, chunk_size=3):
    """
    Concatenate subtitles into chunks of specified size
    
    Args:
        subtitles: List of subtitle dictionaries
        chunk_size: Number of subtitles to combine in each chunk
    
    Returns:
        List of concatenated subtitle chunks
    """
    def time_to_seconds(time_str):
        """Convert 'HH:MM:SS,mmm' format to seconds"""
        time_part, ms = time_str.split(',')
        h, m, s = map(int, time_part.split(':'))
        return h * 3600 + m * 60 + s + int(ms) / 1000
    
    def seconds_to_time(seconds):
        """Convert seconds back to 'HH:MM:SS,mmm' format"""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        ms = int((seconds % 1) * 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
    
    concatenated_subtitles = []
    
    # Process subtitles in chunks
    for i in range(0, len(subtitles), chunk_size):
        chunk = subtitles[i:i + chunk_size]
        
        if not chunk:
            continue
            
        # Combine text from all subtitles in chunk
        combined_text = ' '.join(subtitle['text'] for subtitle in chunk)
        
        # Get start and end times
        start_time = chunk[0]['start']
        end_time = chunk[-1]['end']
        
        # Calculate duration
        start_seconds = time_to_seconds(start_time)
        end_seconds = time_to_seconds(end_time)
        total_duration = end_seconds - start_seconds
        
        # Create combined subtitle entry
        combined_subtitle = {
            'index': len(concatenated_subtitles),
            'start': start_time,
            'end': end_time,
            'movie_name': chunk[0]['movie_name'],
            'text': combined_text,
            'duration_seconds': round(total_duration, 3),
            'duration_minutes': round(total_duration / 60, 3),
            'original_count': len(chunk)
        }
        
        concatenated_subtitles.append(combined_subtitle)
    
    return concatenated_subtitles

 # Combine subtitles into chunks of 10
combined_srt = concat_subtitles_by_chunks(raw_srt_file, chunk_size=20)
pprint(combined_srt[0]['text'])

('Tales of ancient history... SCRIPTURES ...are aptly called as the Puranas. '
 'The Puranas are replete with the history of ancient India. The universe was '
 'created ages ago and for the good of mankind, there born a great man in '
 'every era. In the Treta Yuga, Lord Rama appeared with a message of paramount '
 'obedience and set an example for the whole world to follow. In the Dwapar '
 'Yuga, Lord Krishna explained the principle of duty through the Bhagavad Gita '
 'and established the reign of righteousness. As time passed, unrighteousness '
 'overpowered righteousness. Dishonesty, discrimination, inequality, vices, '
 'avarice and blind faith were on the rise. Fear led the mankind into the '
 'abyss of ignorance. To destroy this darkness, another great man was to be '
 "born. This time, that great man was called... Buddha. It's been ages since "
 'the epic battle of Mahabharata. That epic struggle broke up India into many '
 'parts. Two thousand five hundred years ago, in north

In [82]:

from langchain_text_splitters import RecursiveCharacterTextSplitter


def prepare_documents_with_text_splitter(raw_srt_file: List[dict]) -> List[Document]:
    """
    Convert raw SRT file data to LangChain Document objects.
    
    Args:
        raw_srt_file (list): List of dictionaries containing subtitle data.
        
    Returns:
        list: List of LangChain Document objects.
    """

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100, add_start_index=True
    )
    return text_splitter.split_documents([
        Document(page_content=sub["text"],metadata=sub)
        for sub in raw_srt_file
    ])

def prepare_documents(raw_srt_file: List[dict]) -> List[Document]:
    """
    Convert raw SRT file data to LangChain Document objects.
    
    Args:
        raw_srt_file (list): List of dictionaries containing subtitle data.
        
    Returns:
        list: List of LangChain Document objects.
    """

    return [Document(page_content=sub["text"], metadata=sub)
                   for sub in raw_srt_file]

# Convert raw_srt_file to LangChain Document objects
all_splits = prepare_documents(combined_srt)

len(all_splits)  # Total number of splits

pprint(all_splits[1])  # Print the content of the first split


Document(metadata={'index': 1, 'start': '00:01:38,720', 'end': '00:03:59,440', 'movie_name': 'Buddha', 'text': "near Magadha and Vaishali lay a republic called Kapilavastu. It was ruled by a mighty warrior from the Shakya clan King Suddhodana of the Sun dynasty. -Gopal. -Yes? I think we have some good news. Laushika, you're right. That's why the bells are tolling. The king and his enemies Faced each other A fierce battle ensued The brave won in the end. Who was the brave one? He is none other Than King Suddhodana Your Grace. Your Grace. Laushika, I was going to send for you, The bells are... There's good news, Your Grace. King Suddhodana defeated the tribal chiefs and is returning to Kapilavastu. God has answered my prayers. Did you convey this message to Queen Mahamaya? No, she's not in her chamber at the moment. All right. Make arrangements to distribute food to the poor. I'll personally give this good news to my sister.", 'duration_seconds': 140.72, 'duration_minutes': 2.345, 'origi

In [79]:
from langchain_huggingface import HuggingFaceEmbeddings



embeddings = HuggingFaceEmbeddings(model_name="Qwen/Qwen3-Embedding-0.6B",
                                   cache_folder= "/Users/enkhbat_1/projects/ai-video-ge/cache-models/")


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from qdrant_client.models import NamedSparseVector, SparseVector

def vectorize_documents(documents: List[Document], embedding_model:HuggingFaceEmbeddings,labels:List[str]) -> List[PointStruct]:
    """
    Convert LangChain Document objects to Qdrant PointStructs with embeddings.

    Args:
        documents (list): List of LangChain Document objects.
        embedding_model: Embedding model to generate vector embeddings.
        labels (list): List of labels for the documents.

    Returns:
        list: List of PointStructs with embeddings.
    """
    label_embeddings = {label: embedding_model.embed_query(label) for label in labels}
    points = []
    for doc in documents:
        dense_vector = embedding_model.embed_query(doc.page_content)
        sparse_vector = {
            label: float(cosine_similarity(np.asarray(dense_vector).reshape(1, -1), np.asarray(label_embeddings[label]).reshape(1, -1))[0, 0]) for label in labels
        }
        
        points.append(
            {
                "id": doc.metadata["index"],
                "vector": dense_vector,
                "sparse_vector": sparse_vector,
                "payload": {
                    "start": doc.metadata["start"],
                    "end": doc.metadata["end"],
                    "movie_name": doc.metadata["movie_name"],
                    "text": doc.page_content
                }
            }
        )
    return points

vectorized_points = vectorize_documents(all_splits, embeddings, labels=["action", "motivational", "comedy","scary","si-fi","romantic","drama","thriller"])

# print(embeddings.embed_query("This is a test sentence.").reshape(1,-1))  # Example embedding
# print(vectorized_points[0])  # Print the first vectorized point

In [None]:
filtered_points = list(filter(lambda x: x['sparse_vector']['comedy']>0.6, vectorized_points)) 
pprint(filtered_points[1]) # Filter points with sparse_vector > 0.7


pprint([vectorized_points[596]])

{'id': 596,
 'payload': {'end': '00:41:21,480',
             'movie_name': 'Buddha',
             'start': '00:39:46,240',
             'text': 'ceremony'},
 'sparse_vector': {'action': 0.6438201447921093,
                   'comedy': 0.6220840536694847,
                   'drama': 0.6545292430096739,
                   'motivational': 0.5785369844225026,
                   'romantic': 0.612277956591749,
                   'scary': 0.5155247981233781,
                   'si-fi': 0.5167277961278736,
                   'thriller': 0.5226626599765123},
 'vector': [-0.026107044890522957,
            0.0026379297487437725,
            -0.010905607603490353,
            -0.0503990463912487,
            0.061609625816345215,
            -0.053950175642967224,
            -0.0007583479164168239,
            0.04040965810418129,
            -0.01283191330730915,
            0.014037621207535267,
            -0.05315282568335533,
            -0.07843805104494095,
            0.12287121266126633,

In [None]:
pprint(vectorized_points[0]['payload']['text']) 
pprint(all_splits[0].page_content)  # Print the content of the first split
pprint(all_splits[0].metadata)  # Print the metadata of the first split

('Tales of ancient history... SCRIPTURES ...are aptly called as the Puranas. '
 'The Puranas are replete')
('Tales of ancient history... SCRIPTURES ...are aptly called as the Puranas. '
 'The Puranas are replete')
('Tales of ancient history... SCRIPTURES ...are aptly called as the Puranas. '
 'The Puranas are replete with the history of ancient India. The universe was '
 'created ages ago and for the good of mankind, there born a great man in '
 'every era. In the Treta Yuga, Lord Rama appeared with a message of paramount '
 'obedience and set an example for the whole world to follow. In the Dwapar '
 'Yuga, Lord Krishna explained the principle of duty through the Bhagavad Gita '
 'and established the reign of righteousness. As time passed, unrighteousness '
 'overpowered righteousness. Dishonesty, discrimination, inequality, vices, '
 'avarice and blind faith were on the rise. Fear led the mankind into the '
 'abyss of ignorance. To destroy this darkness, another great man was to be '

In [38]:
len(embeddings.embed_query(all_splits[2].page_content))  # Example embedding for the first split

1024

In [None]:

client = QdrantClient(
    url=QDRANT_URL,  # Use QDRANT_URL from environment variables
    api_key=QDRANT_API_KEY
)


# client.delete_collection(QDRANT_COLLECTION)  # Delete the collection if it exists

True

In [None]:
from langchain_qdrant import QdrantVectorStore
from langchain_qdrant import RetrievalMode

# Insert the documents into Qdrant
vector_store = QdrantVectorStore.from_documents(
    all_splits,embeddings,
    url = QDRANT_URL,
    api_key = QDRANT_API_KEY,
    collection_name="srt_subtitles",
    retrieval_mode=RetrievalMode.DENSE
)

ValueError: 'sparse_embedding' cannot be None when retrieval mode is 'sparse'

In [None]:

from qdrant_client import models

# Perform a similarity search
retrieve = client.query_points(
    collection_name="srt_subtitles",
    query=embeddings.embed_query("Earn money"),
    limit=3,
     search_params=models.SearchParams(hnsw_ef=128, exact=False), # Explore 128 values before returning results
)
pprint(retrieve.points)

[ScoredPoint(id='e779bacb-cb85-4694-81c5-e3a156769110', version=20, score=0.6050934, payload={'page_content': 'Waste of money, if you ask me.', 'metadata': {'index': 340, 'start': '00:26:43,600', 'end': '00:26:45,560', 'movie_name': 'Shawshank Redemption', 'start_index': 0}}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id='cc523f6f-9089-4418-8880-3c9b8d17152b', version=5, score=0.6050934, payload={'page_content': 'Waste of money, if you ask me.', 'metadata': {'index': 340, 'start': '00:26:43,600', 'end': '00:26:45,560', 'movie_name': 'Shawshank Redemption', 'start_index': 0}}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id='c9ee01a2-0442-46e8-ae51-fbb6706394f5', version=6, score=0.6033416, payload={'page_content': 'A million bucks?', 'metadata': {'index': 420, 'start': '00:33:07,120', 'end': '00:33:08,920', 'movie_name': 'Shawshank Redemption', 'start_index': 0}}, vector=None, shard_key=None, order_value=None)]


In [68]:

# Perform a similarity search

query = "Earn money"
found_docs = vector_store.similarity_search(query, k=3)
pprint(found_docs)

[Document(metadata={'index': 340, 'start': '00:26:43,600', 'end': '00:26:45,560', 'movie_name': 'Shawshank Redemption', 'start_index': 0, '_id': 'e779bacb-cb85-4694-81c5-e3a156769110', '_collection_name': 'srt_subtitles'}, page_content='Waste of money, if you ask me.'),
 Document(metadata={'index': 340, 'start': '00:26:43,600', 'end': '00:26:45,560', 'movie_name': 'Shawshank Redemption', 'start_index': 0, '_id': 'cc523f6f-9089-4418-8880-3c9b8d17152b', '_collection_name': 'srt_subtitles'}, page_content='Waste of money, if you ask me.'),
 Document(metadata={'index': 420, 'start': '00:33:07,120', 'end': '00:33:08,920', 'movie_name': 'Shawshank Redemption', 'start_index': 0, '_id': 'c9ee01a2-0442-46e8-ae51-fbb6706394f5', '_collection_name': 'srt_subtitles'}, page_content='A million bucks?')]


In [None]:
### Chat with Gemini model
import os

from langchain.chat_models import init_chat_model

# Use Gemini instead of Gemma for system prompt support
model = init_chat_model(model="gemini-1.5-flash",
                         model_provider="google_genai", 
                         api_key=os.environ["GEMINI_API_KEY"])


from langchain_core.messages import HumanMessage

# For Gemma models, combine system instruction with human message
messages = [
    HumanMessage("Translate the following from English into Italian: hi!"),
]

model.invoke(messages).content
