<a href="https://colab.research.google.com/github/mothi4678/Data-science-with-GenAI/blob/main/cloningshazamapp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sqlite3
import pandas as pd
import numpy as np
# import chromadb
# import faiss  # Alternative to chromadb
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# import whisper
from faster_whisper import WhisperModel  # Alternative to whisper
import torch
# Load Database
def load_data(db_path):
    """Load the subtitle table from a SQLite database into a DataFrame.

    Args:
        db_path: Path of the SQLite database file, e.g.
            ``/content/drive/MyDrive/dataset/eng_subtitles_database.db``.

    Returns:
        A DataFrame holding the ``num``, ``name`` and ``content`` columns of
        the ``zipfiles`` table.
    """
    # Open the database the caller asked for instead of a hard-coded file name.
    conn = sqlite3.connect(db_path)
    try:
        query = "SELECT num, name, content FROM zipfiles;"
        return pd.read_sql_query(query, conn)
    finally:
        # Release the connection even when the query fails.
        conn.close()
# Preprocess Subtitle Text
def preprocess_text(text):
    """Normalise one subtitle document for indexing.

    Bytes input is decoded as Latin-1; the text is then lower-cased and every
    newline or carriage return is replaced by a single space.
    """
    decoded = text.decode('latin-1') if isinstance(text, bytes) else text
    # One pass over the string maps both line-break characters to a space.
    line_breaks_to_spaces = str.maketrans("\n\r", "  ")
    return decoded.lower().translate(line_breaks_to_spaces)
# Document Chunking
def chunk_text(text, chunk_size=500, overlap=100):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: The document to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size`` so that the window moves forward.

    Returns:
        A list of chunk strings, each the chunk's words joined by single
        spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, any later window would only
        # repeat words already covered by this chunk, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks
# Vectorization using TF-IDF
def vectorize_tfidf(texts):
    """Fit a default ``TfidfVectorizer`` on ``texts``.

    Returns:
        A ``(matrix, vectorizer)`` pair: the TF-IDF matrix produced for
        ``texts`` and the fitted vectorizer, which is needed later to
        transform queries into the same feature space.
    """
    fitted_vectorizer = TfidfVectorizer()
    return fitted_vectorizer.fit_transform(texts), fitted_vectorizer
# Vectorization using BERT
def vectorize_bert(texts):
    """Encode ``texts`` with the ``all-MiniLM-L6-v2`` sentence-transformer.

    The model is loaded on every call. The encoder is asked for tensor
    output (``convert_to_tensor=True``), and that result is returned as is.
    """
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    return encoder.encode(texts, convert_to_tensor=True)
# Store Embeddings in ChromaDB
def store_embeddings(texts, db_name="chroma_subtitles"):
    """Persist ``texts`` as documents in a ChromaDB collection.

    Args:
        texts: Sequence of document strings; each is stored under the id
            ``str(position)``.
        db_name: Directory used by the persistent Chroma client.

    Returns:
        The ``subtitles`` collection the documents were added to.

    Note:
        Only documents are passed to ``add``; no precomputed embeddings are
        supplied, so embedding is presumably left to the collection's
        configured embedding function.
    """
    # The module-level import is commented out, so import here to avoid a
    # NameError when this function is called.
    import chromadb

    chroma_client = chromadb.PersistentClient(path=db_name)
    collection = chroma_client.get_or_create_collection(name="subtitles")

    # Add documents in moderately sized batches: far fewer round trips than
    # one call per document, while keeping each request small.
    batch_size = 1000
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        collection.add(
            ids=[str(start + offset) for offset in range(len(batch))],
            documents=list(batch),
        )
    return collection
# Audio to Text Conversion
def transcribe_audio(audio_path):
    """Transcribe an audio file to text with the ``base`` Whisper model.

    Uses ``faster_whisper.WhisperModel``, which this file imports, rather
    than the ``whisper`` package whose import is commented out.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcript as a single string with surrounding whitespace removed.
    """
    model = WhisperModel("base")
    # faster-whisper returns an iterable of segments plus an info object
    # instead of a dict with a "text" key; consuming the segments yields
    # the transcript pieces.
    segments, _info = model.transcribe(audio_path)
    return "".join(segment.text for segment in segments).strip()
# Search Query Execution
def search_query(query, collection, vectorizer, tfidf_matrix, method="tfidf", top_k=5):
    """Rank the indexed chunks by cosine similarity to ``query``.

    Args:
        query: The search text.
        collection: Unused by this function; kept so existing callers that
            pass it keep working.
        vectorizer: Fitted TF-IDF vectorizer. Only used when
            ``method == "tfidf"``; may be ``None`` otherwise.
        tfidf_matrix: The indexed corpus. For ``method == "tfidf"`` this is
            the TF-IDF matrix; for any other method it must hold the corpus
            sentence embeddings (a torch tensor or an array-like).
        method: ``"tfidf"`` for TF-IDF search; any other value selects the
            ``all-MiniLM-L6-v2`` sentence-embedding search.
        top_k: Number of best matches to return (default 5).

    Returns:
        ``(top_indices, scores)``: indices of the best-matching chunks in
        descending order of similarity, and their similarity scores.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    if method == "tfidf":
        query_vec = vectorizer.transform([query])
        similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()
    else:
        model = SentenceTransformer("all-MiniLM-L6-v2")
        query_embedding = model.encode([query], convert_to_tensor=True)
        corpus_embeddings = tfidf_matrix
        # Tensors must be moved to host memory and converted before they can
        # be handed to scikit-learn; plain arrays are passed through as is.
        if hasattr(corpus_embeddings, "cpu"):
            corpus_embeddings = corpus_embeddings.cpu().numpy()
        similarity = cosine_similarity(
            query_embedding.cpu().numpy(), corpus_embeddings
        ).flatten()

    # argsort is ascending, so reverse it before taking the best matches.
    top_indices = np.argsort(similarity)[::-1][:top_k]
    return top_indices, similarity[top_indices]
# Load Data
# Load the subtitle table, normalise each document and split it into chunks.
df = load_data("eng_subtitles_database.db")
df["content"] = df["content"].apply(preprocess_text)
df["chunks"] = df["content"].apply(chunk_text)
# Flatten Chunks: one flat list so chunk positions line up with matrix rows.
all_chunks = [chunk for sublist in df["chunks"] for chunk in sublist]
# Vectorization
tfidf_matrix, vectorizer = vectorize_tfidf(all_chunks)
bert_embeddings = vectorize_bert(all_chunks)
# Store in ChromaDB
collection = store_embeddings(all_chunks)
print("Setup complete. Ready for search queries!")
# Example of search query
query = "What is the main topic?"
top_indices, similarities = search_query(query, collection, vectorizer, tfidf_matrix, method="tfidf")
print("Top 5 search results (TF-IDF):")
for i, index in enumerate(top_indices):
    print(f"Result {i+1}: Similarity={similarities[i]}, Chunk: {all_chunks[index][:100]}...") # print first 100 characters
# For the embedding search, search_query reads the corpus from its
# `tfidf_matrix` argument, so the BERT embeddings must be passed there; the
# TF-IDF vectorizer is not used on this path.
top_indices_bert, similarities_bert = search_query(
    query, collection, vectorizer=None, tfidf_matrix=bert_embeddings, method="bert"
)
print("\nTop 5 search results (BERT):")
for i, index in enumerate(top_indices_bert):
    print(f"Result {i+1}: Similarity={similarities_bert[i]}, Chunk: {all_chunks[index][:100]}")

ModuleNotFoundError: No module named 'faster_whisper'