In [7]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.15-py3-none-any.whl.metadata (6.8 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.2-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.32.0-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.27.0-py3-none-any.whl.metadata (2.3 kB)
Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)
  

In [1]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [2]:
!kaggle datasets download -d mobassir/en-bn-sahih-bukhari-muslim

Dataset URL: https://www.kaggle.com/datasets/mobassir/en-bn-sahih-bukhari-muslim
License(s): CC0-1.0
en-bn-sahih-bukhari-muslim.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
!unzip /content/en-bn-sahih-bukhari-muslim.zip -d ./dataset


Archive:  /content/en-bn-sahih-bukhari-muslim.zip
replace ./dataset/bn_bukhari_muslim.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [1]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('en_bn_bukhari_muslim.csv')

# Display the dataframe
print(df.head())


   id  hadith_id           source  chapter_no  hadith_no      chapter  \
0   0          1   Sahih Bukhari            1          1  Revelation    
1   1          2   Sahih Bukhari            1          2  Revelation    
2   2          3   Sahih Bukhari            1          3  Revelation    
3   3          4   Sahih Bukhari            1          4  Revelation    
4   4          5   Sahih Bukhari            1          5  Revelation    

                              chain_indx  \
0   30418, 20005, 11062, 11213, 11042, 3   
1         30355, 20001, 11065, 10511, 53   
2  30399, 20023, 11207, 11013, 10511, 53   
3                       11013, 10567, 34   
4         20040, 20469, 11399, 11050, 17   

                                             text_ar  \
0  حدثنا الحميدي عبد الله بن الزبير، قال حدثنا سف...   
1  حدثنا عبد الله بن يوسف، قال أخبرنا مالك، عن هش...   
2  حدثنا يحيى بن بكير، قال حدثنا الليث، عن عقيل، ...   
3  قال ابن شهاب وأخبرني أبو سلمة بن عبد الرحمن، أ...   
4  حدثنا موسى بن

In [2]:
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

model_name_or_path = 'intfloat/multilingual-e5-large'
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)
# Example of a custom embedding function
# Custom embedding function for batch processing
def custom_embedding_function(texts):
    # Tokenize the input sentences
    batch_dict = tokenizer(texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

    # Move tensors to GPU if available
    if torch.cuda.is_available():
        batch_dict = {k: v.to('cuda') for k, v in batch_dict.items()}
        model.to('cuda')

    # Get model outputs
    with torch.no_grad():
        outputs = model(**batch_dict)

    # Extract the embeddings from the last hidden state (CLS token)
    embeddings = outputs.last_hidden_state[:, 0]

    # Normalize the embeddings
    normalized_embeddings = F.normalize(embeddings, p=2, dim=1)

    # Return embeddings as lists
    return normalized_embeddings.cpu().tolist()


In [3]:
import chromadb
from tqdm import tqdm
# Initialize ChromaDB client
client = chromadb.PersistentClient(path="hadith-chroma-e5")

# Create a collection for your data
collection = client.get_or_create_collection(name="hadith")


In [4]:
# client.delete_collection(name="hadith")

In [4]:

# Batch size
batch_size = 128  # Adjust batch size based on available memory

# Prepare a batch list
batch_texts = []
batch_embeddings = []
batch_metadatas = []
batch_ids = []

for index, row in tqdm(df.iterrows(), total=len(df)):
    text = row['text_en']  # Text to be embedded
    metadata = row.to_dict()

    # Add text and metadata to batch lists
    batch_texts.append(text)
    batch_metadatas.append(metadata)
    batch_ids.append(str(index))

    # If batch is full, process it
    if len(batch_texts) == batch_size:
        embeddings = custom_embedding_function(batch_texts)
        collection.add(
            documents=batch_texts,
            embeddings=embeddings,
            ids=batch_ids,
            metadatas=batch_metadatas
        )
        # Clear batch lists after adding
        batch_texts = []
        batch_embeddings = []
        batch_metadatas = []
        batch_ids = []

# Process any remaining items
if batch_texts:
    embeddings = custom_embedding_function(batch_texts)
    collection.add(
        documents=batch_texts,
        embeddings=embeddings,
        ids=batch_ids,
        metadatas=batch_metadatas
    )


100%|█████████████████████████████████████| 14966/14966 [10:37<00:00, 23.48it/s]


In [9]:
# collection.peek() # returns a list of the first 10 items in the collection


In [10]:
collection.count()

14966

In [11]:
import chromadb
from tqdm import tqdm
# Initialize ChromaDB client
client = chromadb.PersistentClient(path="hadith-chroma-e5")
collection = client.get_collection(name="hadith")

In [12]:
result=collection.query(
    query_embeddings=custom_embedding_function("when do i pay zakat?"),
    n_results=5
)


In [14]:
for i in range(len(result['metadatas'][0])):
    print("==="*50)
    print(result['metadatas'][0][i]['text_bn'])
    print(result['metadatas'][0][i]['text_en'])
    print(f"\nHadith no: {result['metadatas'][0][i]['hadith_no']}\n")
    print(result['metadatas'][0][i]['chapter'])
    print(result['metadatas'][0][i]['source'])





ইবনে উমর বলেন, রাসূলুল্লাহ (সা.) রমজান মাসে মানুষের জন্য জাকাত-উল-ফিতর (রমজান রোজা ভাঙার সময়) প্রদানের বিধান দিয়েছেন, প্রত্যেক স্বাধীন ব্যক্তির জন্য, অথবা প্রত্যেক দাসের জন্য, মুসলমানদের মধ্যে পুরুষ ও মহিলা - এক সেট শুকনো খেজুর, অথবা এক সেট গম 
 Ibn Umar said that Allahs Messenger (way peace be upon him) prescribed the payment of Zakat-ul-Fitr (on breaking the fast) of Ramadan for people, for every freeman, or slave, male and female among the Muslims-one sa of dried dates, or one sa of barley.

Hadith no: 2325

The Book of Zakat 
 Sahih Muslim 
ইবনে উমর (রাঃ) এর কাহিনীঃ নবী (সাঃ) মানুষকে নির্দেশ দিয়েছিলেন জাকাত-উল-ফিতর (জাকাত-উল-ফিতর) দিতে এবং নামাজের আগে নামাজে যেতে। 
 Narrated Ibn `Umar:                     The Prophet ordered the people to pay Zakat-ul-Fitr before going to the `Id prayer.

Hadith no: 1534

Obligatory Charity Tax Zakat 
 Sahih Bukhari 
ইবনে উমর বলেন, রাসূলুল্লাহ (সা.) আদেশ দিয়েছেন যে, নামাজের জন্য বের হওয়ার আগে সদ্কাৎ-উল-ফিৎর (সাদাকাত-উল-ফিতর) দিতে হবে। 
 Ibn Um