In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive so the cells below can read and write
# files beneath /content/drive/MyDrive.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted in this runtime — confirm against the Colab docs.
drive.mount('/content/drive', force_remount=True)

In [None]:
import sqlite3
import pandas as pd
import zipfile
import io
import re
import os

# Open the subtitles database and pull the first 100 rows of the `zipfiles`
# table (id number, display name and the zipped subtitle blob) into memory.
conn = sqlite3.connect(r'/content/drive/MyDrive/Innomatics/Copy of eng_subtitles_database.db')
try:
    df = pd.read_sql_query("SELECT num,name,content FROM zipfiles LIMIT 100", conn)
finally:
    # The rows are fully materialised in `df`, so release the database handle
    # instead of leaving it open for the lifetime of the kernel.
    conn.close()
def decode_method(binary_data):
    """Extract and decode the subtitle text stored in a zipped binary blob.

    Parameters
    ----------
    binary_data : bytes-like or None
        Raw bytes of a ZIP archive (one value of the ``content`` column).

    Returns
    -------
    str or None
        The first regular file in the archive decoded as UTF-8, with
        undecodable bytes replaced by U+FFFD. ``None`` when the input is
        ``None``, is not a readable ZIP archive, or contains no files; in the
        latter two cases a message is printed.
    """
    if binary_data is None:
        return None
    try:
        with io.BytesIO(binary_data) as buffer, zipfile.ZipFile(buffer, 'r') as archive:
            # Directory entries carry no data; reading one would yield an empty
            # string even though a real subtitle file follows it in the archive.
            members = [info for info in archive.infolist() if not info.is_dir()]
            if not members:
                print("Error decoding ZIP file: archive contains no files")
                return None
            subtitle_content = archive.read(members[0])
            return subtitle_content.decode('utf-8', errors='replace')
    except Exception as e:
        # Deliberately best-effort: this runs once per row via DataFrame.apply,
        # and a single corrupt blob should not abort the whole batch.
        print(f"Error decoding ZIP file: {e}")
        return None
# Unzip and decode every stored blob; rows that cannot be decoded hold None.
df['file_content'] = df['content'].map(decode_method)

def clean_text(text):
    """Reduce raw subtitle text to lowercase words and basic punctuation.

    Missing values (``None`` / NaN) yield an empty string. Otherwise the text
    has, in order: ``<...>`` tags removed, SRT ``hh:mm:ss,mmm --> hh:mm:ss,mmm``
    timestamp lines removed, digit runs removed, and every character other than
    ASCII letters, whitespace and ``. , ! ?`` removed. The result is lowercased
    and runs of whitespace are collapsed to single spaces.
    """
    if pd.isna(text):
        return ""

    # Applied in this exact order; each match is deleted outright (not replaced
    # by a space).
    removal_patterns = (
        r'<.*?>',
        r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}',
        r'\d+',
        r'[^a-zA-Z\s.,!?]',
    )
    stripped = text
    for pattern in removal_patterns:
        stripped = re.sub(pattern, '', stripped)

    lowered = stripped.lower().strip()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()

# Clean every decoded subtitle; rows whose decoding returned None become "".
df["cleaned_text"] = df["file_content"].map(clean_text)

# Write only the identifying columns plus the cleaned text (the zipped blobs
# and raw decoded text are left out of the CSV).
output_path = "/content/drive/MyDrive/Innomatics/cleaned_subtitles.csv"
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

export_columns = ["num", "name", "cleaned_text"]
df[export_columns].to_csv(output_path, index=False)
print(f"Processed subtitle data saved to {output_path}")


In [None]:

!pip install langchain_huggingface
!pip install langchain_chroma
!pip install --upgrade numpy
!pip install --upgrade --force-reinstall sentence-transformers transformers


In [None]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma


# Embedding model name and on-disk location of the Chroma collection.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
CHROMA_DB_PATH = "/content/drive/MyDrive/Innomatics/chroma_db"

# Read back the cleaned subtitles (columns used below: num, name, cleaned_text).
df = pd.read_csv("/content/drive/MyDrive/Innomatics/cleaned_subtitles.csv")

# Splitter configured with chunk_size=1000 and an overlap of 100 between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)

# Collect the chunk documents of all subtitles in one flat list; each chunk
# carries the num/name of the subtitle it was cut from as metadata.
chunks = []
for _, row in df.iterrows():
    text = row["cleaned_text"]

    # Skip rows without usable text (missing value or whitespace only).
    if pd.isna(text) or not text.strip():
        continue

    num, name = str(row["num"]), str(row["name"])
    split_chunks = text_splitter.create_documents(
        [text],
        metadatas=[{"num": num, "name": name}],
    )
    chunks += split_chunks


def store_embeddings(chunks):
    """Embed the given documents and persist them in the Chroma store.

    Uses the module-level ``EMBEDDING_MODEL`` and ``CHROMA_DB_PATH`` so that
    writing here and reading in ``load_vector_store`` cannot drift apart.

    Parameters
    ----------
    chunks : list
        Documents as produced by ``text_splitter.create_documents``.

    Returns
    -------
    Chroma
        The vector store created from ``chunks``.
    """
    # Load the embedding model
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    # Store the embeddings in ChromaDB
    # NOTE(review): calling this again against the same persist directory
    # presumably appends the documents a second time — confirm before re-running.
    db = Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_DB_PATH)

    print("✅ Embeddings stored successfully in ChromaDB!")
    return db


def load_vector_store():
    """Open the Chroma vector store persisted at ``CHROMA_DB_PATH``.

    The store is wired to a ``HuggingFaceEmbeddings`` instance built from
    ``EMBEDDING_MODEL`` and returned to the caller.
    """
    return Chroma(
        persist_directory=CHROMA_DB_PATH,
        embedding_function=HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL),
    )


# Embed the prepared `chunks` and persist them in ChromaDB. The returned store
# is not bound to a name here; as the cell's last expression it is only echoed.
store_embeddings(chunks)


In [None]:

!pip install streamlit
!pip install assemblyai


In [None]:
%%writefile app.py
import os
import tempfile

import assemblyai as aai
import streamlit as st
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Initialize AssemblyAI.
# The key is taken from the ASSEMBLYAI_API_KEY environment variable so a real
# credential never has to be written into the notebook; the original literal is
# kept only as the fallback value.
aai.settings.api_key = os.environ.get("ASSEMBLYAI_API_KEY", "key")
transcriber = aai.Transcriber()


# Streamlit re-executes this script on every interaction. Caching the two
# loaders means the model and the vector store are created once and reused
# across reruns instead of being rebuilt each time.
@st.cache_resource
def _load_embeddings():
    """Load the sentence-transformers embedding model."""
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


@st.cache_resource
def _load_db():
    """Open the persisted Chroma store using the cached embedding model."""
    return Chroma(
        persist_directory="/content/drive/MyDrive/Innomatics/chroma_db",
        embedding_function=_load_embeddings(),
    )


# Load embeddings model
embeddings = _load_embeddings()

# Load ChromaDB
db = _load_db()

def transcribe_audio(audio_path):
    """Convert audio to text using AssemblyAI.

    Parameters
    ----------
    audio_path : str
        Path of the audio file handed to ``transcriber.transcribe``.

    Returns
    -------
    str
        The transcript text, or ``""`` when no transcript or no text came back.
    """
    transcript = transcriber.transcribe(audio_path)
    if not transcript:
        return ""
    # `text` may be None (presumably when the transcription job failed —
    # confirm against the AssemblyAI SDK); always hand callers a string.
    return transcript.text or ""

def retrieve_highest_score_chunk(query: str):
    """Retrieve the best-matching document chunk from ChromaDB with metadata.

    Chroma's ``similarity_search_with_score`` reports a *distance* for each
    hit, where a lower value means a closer match. The best match is therefore
    the hit with the smallest score, not the largest.

    Parameters
    ----------
    query : str
        Text to search the vector store with.

    Returns
    -------
    dict or None
        ``{"num", "name", "content", "score"}`` for the closest chunk, where
        ``score`` is the distance reported by Chroma; ``None`` when the search
        returns nothing.
    """
    results = db.similarity_search_with_score(query, k=5)  # Retrieve top 5 results first
    if not results:
        return None  # Return None if no results found

    # Smallest distance == most similar chunk.
    best_doc, best_distance = min(results, key=lambda hit: hit[1])
    return {
        "num": best_doc.metadata.get("num", "Unknown"),
        "name": best_doc.metadata.get("name", "Unknown"),
        "content": best_doc.page_content,
        "score": best_distance,  # Distance reported by Chroma (lower is better)
    }

# Page-level CSS injected as raw HTML, followed by the centred page heading.
PAGE_STYLE_HTML = """
    <style>
        .main {background-color: #f5f7fa;}
        .stTextArea {border-radius: 10px; background-color: #eef2f3;}
        .stFileUploader {background-color: #d1e8ff; padding: 10px; border-radius: 10px;}
        .stSubheader {color: #ff5733; font-size: 18px;}
        .stWrite {color: #2ca02c; font-size: 16px;}
    </style>
    """
st.markdown(PAGE_STYLE_HTML, unsafe_allow_html=True)

PAGE_TITLE_HTML = "<h3 style='text-align: center;'>Enhancing Search Engine Relevance for Video Subtitles</h3>"
st.markdown(PAGE_TITLE_HTML, unsafe_allow_html=True)

# Two-column layout: results on the left (wide), uploader on the right (narrow).
col1, col2 = st.columns([3, 1])

with col2:
    st.markdown('<div class="stFileUploader">', unsafe_allow_html=True)
    uploaded_file = st.file_uploader("Upload an MP3 file", type=["mp3"])
    st.markdown('</div>', unsafe_allow_html=True)

with col1:
    if uploaded_file is not None:
        # The transcriber is given a file path, so spill the upload to disk.
        # delete=False keeps the file after the with-block closes it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            temp_audio.write(uploaded_file.read())
            temp_audio_path = temp_audio.name

        st.write("Transcribing audio...")
        try:
            transcript_text = transcribe_audio(temp_audio_path)
        finally:
            # Remove the temporary copy whether or not transcription succeeded;
            # otherwise every upload leaves an .mp3 behind in the temp directory.
            os.remove(temp_audio_path)

        if transcript_text:
            st.subheader("Transcript")
            st.text_area("Transcription Output", transcript_text, height=200)

            st.write("Searching for the most relevant subtitle...")
            highest_chunk = retrieve_highest_score_chunk(transcript_text)

            if highest_chunk:
                st.subheader("Title")
                subtitle_name = highest_chunk['name']
                subtitle_num = highest_chunk['num']
                # The stored `num` is used as the subtitle id in the link.
                subtitle_url = f"https://www.opensubtitles.org/en/subtitles/{subtitle_num}"

                st.write(f"🎬 {subtitle_name} (Score: {highest_chunk['score']:.4f})")
                st.markdown(f"[🔗 View Subtitle on OpenSubtitles]({subtitle_url})", unsafe_allow_html=True)
            else:
                st.write("No relevant subtitles found.")
        else:
            st.write("Failed to transcribe audio.")


In [None]:
# Fetch ipv4.icanhazip.com quietly and print the response to the cell output.
# NOTE(review): presumably this is the runtime's public IPv4 address, needed as
# the access password for the localtunnel page opened by the next cell — confirm.
!wget -q -O - ipv4.icanhazip.com

In [None]:
# Start the Streamlit app (app.py) in the background and open a localtunnel to
# port 8501 in the foreground.
# NOTE(review): assumes Streamlit serves on 8501 — confirm.
!streamlit run app.py & npx localtunnel --port 8501