## Loading and Exploring Dataset from Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sqlite3

In [None]:
#Path to database in drive
db_path = '/content/drive/MyDrive/eng_subtitles_database.db'

In [None]:
#Connecting to database
conn = sqlite3.connect(db_path)

Exploring Dataset (Tables, Data)

In [None]:
#Reading table namaes in a database
import pandas as pd

table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
print(table_names)

       name
0  zipfiles


In [None]:
#Loading data from 'zipfiles' table
df = pd.read_sql_query("SELECT * FROM zipfiles LIMIT 5;", conn)
print(df)

       num                                               name  \
0  9180533                         the.message.(1976).eng.1cd   
1  9180583  here.comes.the.grump.s01.e09.joltin.jack.in.bo...   
2  9180592    yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd   
3  9180594    yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd   
4  9180600                              broker.(2022).eng.1cd   

                                             content  
0  b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...  
1  b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...  
2  b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...  
3  b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...  
4  b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...  


Decoding the Binary Content

In [None]:
import zipfile
import io

In [None]:
count = 0

In [None]:
def extract_content(data):

  global count
  count += 1

  try:
    if data is None:
      return None
    #Ensuring data is in bytes format
    if isinstance(data, str):
      data = data.encode('latin-1')

    #Convering data into file-loke object in memory
    with io.BytesIO(data) as byte_stream:
      #open binary stream as a zip-archive
      with zipfile.ZipFile(byte_stream, 'r') as zf:
        subtitle_content = zf.read(zf.namelist()[0])
    return subtitle_content.decode('latin-1')
  except Exception as e:
        print(f"Error in extracting: {e}")
        return None

In [None]:
#Small sample for testing
query = "SELECT num, name, content FROM zipfiles LIMIT 5"
df = pd.read_sql_query(query,conn)

#Extract and decode the content
df['content'] = df['content'].apply(extract_content)

print(df[['num', 'name', 'content']])

       num                                               name  \
0  9180533                         the.message.(1976).eng.1cd   
1  9180583  here.comes.the.grump.s01.e09.joltin.jack.in.bo...   
2  9180592    yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd   
3  9180594    yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd   
4  9180600                              broker.(2022).eng.1cd   

                                             content  
0  1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...  
1  1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Ther...  
2  1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yumi'...  
3  1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...  
4  ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch...  


In [None]:
query = "SELECT num, name, content FROM zipfiles"
df_full = pd.read_sql(query, conn)

df_full['content'] = df_full['content'].apply(extract_content)

print(df_full.head())

       num                                               name  \
0  9180533                         the.message.(1976).eng.1cd   
1  9180583  here.comes.the.grump.s01.e09.joltin.jack.in.bo...   
2  9180592    yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd   
3  9180594    yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd   
4  9180600                              broker.(2022).eng.1cd   

                                             content  
0  1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...  
1  1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Ther...  
2  1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yumi'...  
3  1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...  
4  ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch...  


In [None]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82498 entries, 0 to 82497
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   num      82498 non-null  int64 
 1   name     82498 non-null  object
 2   content  82498 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.9+ MB


##Dataset Preprocessing
   Data Cleaning


*   Removing Timestamp
*   Removing special characters and digits (keep only letters and spaces)
*   Convert text to lowercase
*   Tokenize text into words
*   Remove stopwords






In [None]:
df_sample = df_full.sample(frac=0.3, random_state=42).reset_index(drop=True)
print(f"Sample size: {df_sample.shape}")

Sample size: (24749, 3)


In [None]:
df_sample.head()

Unnamed: 0,num,name,content
0,9251120,maybe.this.time.(2014).eng.1cd,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."
1,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,"1\r\n00:00:09,275 --> 00:00:11,876\r\n¶ Oh, I ..."
2,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,"1\r\n00:00:07,140 --> 00:00:14,220\r\n<i>Timin..."
3,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,"1\r\n00:00:06,133 --> 00:00:08,900\r\n[etherea..."
4,9408707,battlebots.(2015).eng.1cd,"ï»¿1\r\n00:00:01,480 --> 00:00:03,570\r\n[Chri..."


In [None]:
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24749 entries, 0 to 24748
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   num      24749 non-null  int64 
 1   name     24749 non-null  object
 2   content  24749 non-null  object
dtypes: int64(1), object(2)
memory usage: 580.2+ KB


In [None]:
import re
import nltk
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
def preprocess_text(text):

  # 1. removing timestamp
  text = re.sub(r'\d{2}:\d{2}:d{2},\d{3} --> \d{2}:\d{2}:d{2},\d{3}', '', text)

  # 2. removing special characters and digits
  text = re.sub(r'[^A-Za-z\s]', '', text)

  # 3. conversion to lower case
  text = text.lower()

  # 4. Tokenize the text into words
  words = nltk.word_tokenize(text)

  # 5. Removing stopwords
  stop_words = set(stopwords.words('english'))
  words = [word for word in words if word not in stop_words]

  # 6. Joining words back into a string
  cleaned_text = ' '.join(words)

  return cleaned_text

In [None]:
#applying above function to content column
df_sample['cleaned_content'] = df_sample['content'].apply(preprocess_text)

print(df_sample[['num', 'name', 'cleaned_content']].head())

       num                                               name  \
0  9251120                     maybe.this.time.(2014).eng.1cd   
1  9211589  down.the.shore.s01.e10.and.justice.for.all.(19...   
2  9380845  uncontrollably.fond.s01.e07.heartache.(2016).e...   
3  9301436  screen.two.s13.e04.the.precious.blood.(1996).e...   
4  9408707                          battlebots.(2015).eng.1cd   

                                     cleaned_content  
0  watch video online opensubtitles free browser ...  
1  oh know getting late dont wan na go home im hu...  
2  itiming subtitles uncontrollable lovebirds tea...  
3  ethereal music apiopensubtitlesorg deprecated ...  
4  chris oh minibots yelling oh leave little bots...  


In [None]:
# Printing first 10 rows of the dataframe with the selected columns
print(df_sample[['num', 'name', 'cleaned_content']].head(10).to_string(index=False))


    num                                                                       name                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

## Generating Text Vectors

(1) Vectorizing the Subtitle Documents using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Initializing TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    max_df=0.85,
    min_df=5,
    stop_words='english',
    ngram_range=(1,2)
    )
# Applying TF-IDF to cleaned_content column
tfidf_matrix = tfidf_vectorizer.fit_transform(df_sample['cleaned_content'])
# Converting TF-IDF matrix to a data frame for easier viewing
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print(tfidf_df.head())

        aah  aaron  abandoned      abby  ability      able  absolute  \
0  0.008235    0.0        0.0  0.011751      0.0  0.030878       0.0   
1  0.028025    0.0        0.0  0.000000      0.0  0.000000       0.0   
2  0.000000    0.0        0.0  0.000000      0.0  0.027633       0.0   
3  0.000000    0.0        0.0  0.000000      0.0  0.020455       0.0   
4  0.000000    0.0        0.0  0.000000      0.0  0.014234       0.0   

   absolutely  abuse   academy  ...  youre wrong  youre youre  youth  \
0    0.000000    0.0  0.000000  ...          0.0     0.000000    0.0   
1    0.000000    0.0  0.000000  ...          0.0     0.000000    0.0   
2    0.000000    0.0  0.023795  ...          0.0     0.000000    0.0   
3    0.000000    0.0  0.000000  ...          0.0     0.010664    0.0   
4    0.010348    0.0  0.000000  ...          0.0     0.000000    0.0   

      youve  youve got  youve seen       yup  zero      zone  zoom  
0  0.026569   0.004737    0.000000  0.000000   0.0  0.000000   0.

(2) Using BERT-based model for sentences

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.9/275.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.4.1


In [None]:
!pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (63.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.0/63.0 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.11.0.86


In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
import numpy as np

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Initialize the model (you can use other pre-trained models like 'paraphrase-MiniLM-L6-v2' for faster performance)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the cleaned documents
embeddings = model.encode(df_sample['cleaned_content'].tolist(), convert_to_tensor=True)

# Convert embeddings to numpy array for easier manipulation
embeddings_np = embeddings.cpu().detach().numpy()

# Check the shape of embeddings
print(embeddings_np.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(24749, 384)


In [None]:
model.save('/content/drive/MyDrive/saved_model')
np.save('/content/drive/MyDrive/embeddings.npy', embeddings_np)

In [None]:
embeddings_np = np.load('/content/drive/MyDrive/embeddings.npy')
print(embeddings_np.shape)

(24749, 384)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Creating and Storing Embeddings in ChromaDB

In [None]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.21.0-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.31.1-py3-none-any.whl.metadata (1.6 kB)
Collectin

In [None]:
pip show chromadb


Name: chromadb
Version: 0.6.3
Summary: Chroma.
Home-page: https://github.com/chroma-core/chroma
Author: 
Author-email: Jeff Huber <jeff@trychroma.com>, Anton Troynikov <anton@trychroma.com>
License: 
Location: /usr/local/lib/python3.11/dist-packages
Requires: bcrypt, build, chroma-hnswlib, fastapi, grpcio, httpx, importlib-resources, kubernetes, mmh3, numpy, onnxruntime, opentelemetry-api, opentelemetry-exporter-otlp-proto-grpc, opentelemetry-instrumentation-fastapi, opentelemetry-sdk, orjson, overrides, posthog, pydantic, pypika, PyYAML, rich, tenacity, tokenizers, tqdm, typer, typing_extensions, uvicorn
Required-by: 


In [None]:
import importlib.util
print(importlib.util.find_spec("chromadb"))


ModuleSpec(name='chromadb', loader=<_frozen_importlib_external.SourceFileLoader object at 0x781958ece850>, origin='/usr/local/lib/python3.11/dist-packages/chromadb/__init__.py', submodule_search_locations=['/usr/local/lib/python3.11/dist-packages/chromadb'])


In [None]:
import chromadb
print(chromadb.__version__)


0.6.3


In [None]:
# Initializing ChromaDB client
client = chromadb.Client()
# Creating a collection
collection = client.create_collection(name='subtitle_collection1')
# Store embeddings in ChromaDB
for idx, embedding in enumerate(embeddings_np):
  collection.add(
      ids=[str(idx)],
      embeddings=[embedding.tolist()],
      metadatas=[{
          "doc_id": idx,
          "text": df_sample['cleaned_content'].iloc[idx]
      }]
  )

In [None]:
# Checking collection size
print(f"Number of documents in collection: {collection.count()}")

Number of documents in collection: 24749


In [None]:
# Function to chunk documents
def chunk_text(text, chunk_size=500, overlap_size=50):
  # Tokenizing the text
  tokens = text.split()

  chunks = []
  for i in range(0, len(tokens), chunk_size - overlap_size):
    chunk = tokens[i:i + chunk_size]
    chunks.append(" ".join(chunk))
  return chunks

# Initializing the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function for embedding chunks
def generate_embeddings_for_chunks(text, chunk_size=500, overlap_size=50):
  chunks = chunk_text(text, chunk_size, overlap_size)
  embeddings = model.encode(chunks, convert_to_tensor=True)
  # Converting embeddings into numpy array
  embeddings_np = embeddings.cpu().detach().numpy()
  return embeddings_np, chunks

document_text = df_sample['cleaned_content'].iloc[0]

embeddings_np, chunked_texts = generate_embeddings_for_chunks(document_text)

print(f"Embeddings shape: {embeddings_np.shape}")

for idx, chunk in enumerate(chunked_texts):
  print(f"Chunk {idx +1}: {chunk[:150]}...")
  print(f"Embedding:{embeddings_np[idx][:10]}...")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings shape: (12, 384)
Chunk 1: watch video online opensubtitles free browser extension osdblinkext iit couldve another summeri ibut set foot sandi ithat summer suddenly felt differe...
Embedding:[-0.07535624 -0.08715754  0.00428826 -0.06775937  0.00953814  0.04579948
  0.08148973 -0.03512756  0.02765187 -0.03218287]...
Chunk 2: mmhm one two three wow beautiful tonio like yes know favorite spot town fact beautiful thing ive ever seen mmm saw nice wow youre teasing true ive got...
Embedding:[-0.05228969 -0.0945009   0.08088917  0.01432562 -0.06435941  0.05368626
  0.08665136 -0.06066933  0.02750147 -0.01823075]...
Chunk 3: forget forget forget hello guys seven years ago lets move shoes wouldnt able move found happened didnt show got ship thats family told mean tell never...
Embedding:[-0.05972428 -0.1387379   0.03508682 -0.01586794 -0.00595995  0.05469533
  0.05849322 -0.02946528  0.03044028 -0.06629746]...
Chunk 4: party scene booming right ants italian took care left million mill

In [None]:
client = chromadb.Client()
collection_name = "subtitle_collection_with_chunks"
try:
  collection = client.create_collection(name=collection_name)
except chromadb.errors.UniqueConstraintError:
  collection = client.get_collection(name=collection_name)

def store_embeddings_in_chromadb(embeddings_np, chunked_texts, doc_id):
  for idx, (embedding, chunk_text) in enumerate(zip(embeddings_np, chunked_texts)):
    collection.add(
        ids=[f"{doc_id}_{idx}"],
        embeddings=[embedding.tolist()],
        metadatas=[{
            "doc_id": doc_id,
            "chunk_text": chunk_text,
            "chunk_idx": idx
        }]
    )

document_id = "document_1"
store_embeddings_in_chromadb(embeddings_np, chunked_texts, document_id)

print(f"Stored {len(embeddings_np)} chunks for document ID: {document_id}")


Stored 12 chunks for document ID: document_1


## Implementing Speech Recognition, Cosine Similarity for required results

In [None]:
!pip install SpeechRecognition

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.14.1-py3-none-any.whl.metadata (31 kB)
Downloading SpeechRecognition-3.14.1-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.1


Speech Recognition using Google Web Speech API for transcription

In [None]:
import speech_recognition as sr

def audio_text(audio_file_path):
  recognizer = sr.Recognizer()
  audio_file = sr.AudioFile(audio_file_path)

  with audio_file as source:
    audio = recognizer.record(source)

  try:
    # Google Web Speech API for transcription
    text = recognizer.recognize_google(audio)
    return text
  except sr.UnknownValueError:
    return "Sorry, I couldn't understand the audio."
  except sr.RequestError as e:
    return f"Could not request results from Google Speech Recognition service; {e}"

In [None]:
# to call above function for input
query_audio_path = '/content/WhatsApp Ptt 2025-03-22 at 4.47.56 PM (online-audio-converter.com).wav'
query_text = audio_text(query_audio_path)
print(query_text)

princess leave the minibus salon pick somebody


Calculating Cosine Similarity and obtaining required results

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_most_similar_documents(query_embedding, collection, top_n=5):
    # Fetching document embeddings and IDs from ChromaDB
    results = collection.get(include=['embeddings'])

    # Checking if embeddings are available in results
    if 'embeddings' not in results or len(results['embeddings']) == 0:
        raise ValueError("No embeddings found in the collection.")

    # Extracting embeddings and document IDs
    document_embeddings = np.array(results['embeddings'])  # Convert to numpy array if needed
    document_ids = results['ids']  # Ensure this is correctly retrieved from collection

    # Ensuring query_embedding is in the correct shape (1, embedding_dim)
    query_embedding = np.array(query_embedding).reshape(1, -1)

    # Checking if the query embedding shape matches the document embeddings
    if query_embedding.shape[1] != document_embeddings.shape[1]:
        raise ValueError("The query embedding dimensionality does not match the document embeddings.")

    # Computing cosine similarity between query and document embeddings
    similarities = cosine_similarity(query_embedding, document_embeddings)

    # Getting top N most similar documents (sorted in descending order)
    top_n_indices = similarities.argsort()[0][-top_n:][::-1]
    top_documents = [(document_ids[idx], similarities[0][idx]) for idx in top_n_indices]

    return top_documents

query_embedding = generate_query_embedding(query_text)

# Retrieving the top N similar documents
top_documents = get_most_similar_documents(query_embedding, collection, top_n=5)

In [None]:
for doc_id, score in top_documents:
    # Converting doc_id to the appropriate type in your dataframe (e.g., int)
    doc_id_int = int(doc_id.split('_')[-1])  # Extracting the numeric part of doc_id

    # Gettig the metadata from ChromaDB using doc_id
    metadata = collection.get(ids=[doc_id], include=['metadatas'])['metadatas'][0]

    chunk_text = metadata['chunk_text']

    # Filtering df_sample based on the chunk text
    doc = df_sample[df_sample['cleaned_content'].str.contains(chunk_text)]

    # Checking if 'doc' is not empty before accessing elements
    if not doc.empty:
        print(f"Document ID: {doc_id}")
        print(f"Title: {doc['name'].values[0]}")
        print(f"Content: {doc['cleaned_content'].values[0]}")
        print(f"Similarity Score: {score:.4f}")
        print("=" * 50)
    else:
        print(f"Document with ID {doc_id} not found in df_sample.")

Document ID: document_1_6
Title: maybe.this.time.(2014).eng.1cd
Content: watch video online opensubtitles free browser extension osdblinkext iit couldve another summeri ibut set foot sandi ithat summer suddenly felt differenti ilike going summeri ithat would change lifei ithe summer freedomi ithe summer endless possibilitiesi ithe summer ooh aah ooh oh oh oh ooh ithat summer mei youre quite dancer stop come keep dancing whatever im kidding dont get mad huh hey im going get towel stop thought gon na kiss excuse wan na kiss yet mean yet youre girl mean girl girlfriend miss wish dont call miss dont pretend gentleman youre clearly call rude snob bitch douche handsome conceited like huh jerk exactly type leave alone steph aha steph ill call tep remove f way im tonio still tomorrow dont leave yet im going court ii chose walk away youi ibut fate different plani councilor teaching basic english literacy well teaching children read write english yes long program run okay entire summer ii though