<a href="https://colab.research.google.com/github/mohamed-stifi/PFA-Arabic-LLMs/blob/main/vector_databases.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets faiss-cpu==1.7.4 chromadb==0.4.22 sentence-transformers==2.3.1

In [None]:
! pip install transformers

In [None]:
import pandas as pd

In [None]:
# Location of the exported tables on the mounted Google Drive.
DATASET_DIR = '/content/drive/MyDrive/PFA/dataset'

articles = DATASET_DIR + '/article_table.csv'
topics = DATASET_DIR + '/topic_table.csv'

In [None]:
# Read both CSV tables into DataFrames (articles first, then topics).
df_articles, df_topics = (pd.read_csv(csv_path) for csv_path in (articles, topics))

In [None]:
# Preview the first five article rows.
df_articles.head(n=5)

Unnamed: 0,article_id,article_title,topic_id
0,1,طريقة عمل البسبوسة,1
1,2,طريقة عمل شوربة العدس,1
2,3,طريقة عمل عجينة البيتزا,1
3,4,طريقة عمل البشاميل,1
4,5,كيفية عمل السحلب,1


In [None]:
# Preview the first five topic rows.
df_topics.head(n=5)

Unnamed: 0,topic_id,topic_title
0,1,فن الطهي
1,2,أطباق رئيسية
2,3,أطباق جانبية
3,4,أطباق شرقية
4,5,أطباق شامية


In [None]:
# Plain Python lists of the titles; these are the texts that get embedded.
articles_title = df_articles.loc[:, 'article_title'].tolist()
topics_title = df_topics.loc[:, 'topic_title'].tolist()

# Show how many articles and topics were loaded.
(len(articles_title), len(topics_title))

(77713, 357)

In [None]:
# Ids as strings, in the same row order as the title lists.
articles_id = list(map(str, df_articles['article_id'].tolist()))
topics_id = list(map(str, df_topics['topic_id'].tolist()))

In [None]:
# One metadata dict per row, holding only the row's id.
articles_metadata = [
    {"article_id": article_id} for article_id in df_articles['article_id']
]

topics_metadata = [
    {"topic_id": topic_id} for topic_id in df_topics['topic_id']
]

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Embedding model and the dimensionality of the vectors it is expected to produce.
model_id = "sentence-transformers/distiluse-base-multilingual-cased-v2"
dim = 512

# Alternative model (update `dim` together with `model_id`):
# model_id = "asafaya/bert-large-arabic"
# dim = 1024

# Use the GPU when one is available, otherwise fall back to CPU so the
# notebook still runs on a CPU-only runtime.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = SentenceTransformer(model_id, device=device)

# `dim` is used later to size the FAISS indexes; fail early if it does not
# match what the loaded model actually outputs.
assert model.get_sentence_embedding_dimension() == dim, (
    f"dim={dim} does not match the model's embedding size "
    f"({model.get_sentence_embedding_dimension()})"
)

In [None]:
# Embed every article title (32 titles per batch, the encoder's default).
encoded_articles_title = model.encode(
    articles_title,
    batch_size=32,
    show_progress_bar=True,
)

Batches:   0%|          | 0/2429 [00:00<?, ?it/s]

In [None]:
# Embed every topic title (32 titles per batch, the encoder's default).
encoded_topics_title = model.encode(
    topics_title,
    batch_size=32,
    show_progress_bar=True,
)

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

# Vector Databases

## FAISS

In [None]:
import faiss
import numpy as np
from copy import deepcopy

In [None]:
# Work on copies so the raw embeddings stay available for ChromaDB later;
# the normalisation below modifies its argument.
norm_encoded_articles_title, norm_encoded_topics_title = (
    deepcopy(encoded_articles_title),
    deepcopy(encoded_topics_title),
)

# L2-normalise each copy so that inner product equals cosine similarity.
for _vectors in (norm_encoded_articles_title, norm_encoded_topics_title):
    faiss.normalize_L2(_vectors)

In [None]:
# FAISS stores 64-bit integer ids. Convert the string ids explicitly instead of
# relying on the wrapper to coerce a list of strings.
articles_faiss_ids = np.array([int(i) for i in articles_id], dtype=np.int64)
topics_faiss_ids = np.array([int(i) for i in topics_id], dtype=np.int64)

# Exact inner-product indexes wrapped in an id map, so searches return
# article_id / topic_id values rather than insertion positions.
articles_faiss_index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))
articles_faiss_index.add_with_ids(norm_encoded_articles_title, articles_faiss_ids)

topics_faiss_index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))
topics_faiss_index.add_with_ids(norm_encoded_topics_title, topics_faiss_ids)

In [None]:
# Embed the query as a one-row batch and normalise it like the indexed vectors.
question = "السبب في صغر الأسنان بالمقارنة مع حجم الفكين"
question_embed = model.encode([question])
faiss.normalize_L2(question_embed)

# Fetch the three nearest neighbours from each index.
top_k = 3
articles_results, topics_results = (
    index.search(question_embed, top_k)
    for index in (articles_faiss_index, topics_faiss_index)
)

In [None]:
# Raw FAISS output: a (scores, ids) pair with one row per query.
article_scores, article_hit_ids = articles_results
print((article_scores, article_hit_ids))

(array([[0.6063632 , 0.59641016, 0.5900053 ]], dtype=float32), array([[62770, 62688, 62779]]))


In [None]:

# FAISS returns the ids given to add_with_ids (article_id / topic_id), not
# positions in the title lists, so resolve titles through id -> title maps.
# Topic hits must be looked up among the topic titles.
article_title_by_id = dict(zip(articles_id, articles_title))
topic_title_by_id = dict(zip(topics_id, topics_title))

rank = 0  # best match
print('------------------------------------ articles :')
print(article_title_by_id[str(articles_results[1][0][rank])])
print('------------------------------------ topics :')
print(topic_title_by_id[str(topics_results[1][0][rank])])

------------------------------------ articles :
الوقاية من تسوس الأسنان
------------------------------------ topics :
فوائد قلوب الدجاج


In [None]:
# FAISS returns the ids given to add_with_ids (article_id / topic_id), not
# positions in the title lists, so resolve titles through id -> title maps.
# Topic hits must be looked up among the topic titles.
article_title_by_id = dict(zip(articles_id, articles_title))
topic_title_by_id = dict(zip(topics_id, topics_title))

rank = 1  # second-best match
print('------------------------------------ articles :')
print(article_title_by_id[str(articles_results[1][0][rank])])
print('------------------------------------ topics :')
print(topic_title_by_id[str(topics_results[1][0][rank])])

------------------------------------ articles :
خلع ضرس العقل العلوي
------------------------------------ topics :
كيف يطبخ اللفت


In [None]:
## Save
import pickle

# Persist each index to its own pickle file in the working directory.
for pickle_path, faiss_index in (
    ("articles_faiss_index.pickle", articles_faiss_index),
    ("topics_faiss_index.pickle", topics_faiss_index),
):
    with open(pickle_path, "wb") as handle:
        pickle.dump(faiss_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
## Load
def _read_pickle(pickle_path):
    """Return the object stored in the pickle file at `pickle_path`."""
    # NOTE: unpickling can execute arbitrary code; only load files this
    # notebook wrote itself.
    with open(pickle_path, "rb") as handle:
        return pickle.load(handle)


loaded_articles_faiss_index = _read_pickle("articles_faiss_index.pickle")
loaded_topics_faiss_index = _read_pickle("topics_faiss_index.pickle")

In [None]:
# Repeat the query against the indexes restored from disk.
question = "السبب في صغر الأسنان بالمقارنة مع حجم الفكين"
question_embed = model.encode([question])

faiss.normalize_L2(question_embed)

articles_results = loaded_articles_faiss_index.search(question_embed, 3)
topics_results = loaded_topics_faiss_index.search(question_embed, 3)

# FAISS returns the ids given to add_with_ids (article_id / topic_id), not
# positions in the title lists, so resolve titles through id -> title maps.
# Topic hits must be looked up among the topic titles.
article_title_by_id = dict(zip(articles_id, articles_title))
topic_title_by_id = dict(zip(topics_id, topics_title))

print('------------------------------------ articles :')
print(article_title_by_id[str(articles_results[1][0][0])])
print('------------------------------------ topics :')
print(topic_title_by_id[str(topics_results[1][0][0])])

------------------------------------ articles :
الوقاية من تسوس الأسنان
------------------------------------ topics :
فوائد قلوب الدجاج


## ChromaDB

In [None]:
import chromadb

# Directory where ChromaDB keeps its on-disk state.
CHROMA_PATH = "./chromadb-ar-docs"

chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)

In [None]:
# The client is persistent, so the collections survive kernel restarts.
# get_or_create_collection keeps this cell re-runnable; create_collection
# raises once the collection already exists on disk.
articles_collection = chroma_client.get_or_create_collection(
    name="ar_articles_docs",
    metadata={"hnsw:space": "cosine"}
)

topics_collection = chroma_client.get_or_create_collection(
    name="ar_topics_docs",
    metadata={"hnsw:space": "cosine"}
)

In [None]:
# Articles are inserted in batches of at most CHROMA_BATCH_SIZE rows rather
# than in a single call; looping keeps this working for any dataset size.
# NOTE(review): presumably 40000 was chosen to stay under Chroma's per-call
# batch limit. Confirm against the installed version.
CHROMA_BATCH_SIZE = 40000

for start in range(0, len(articles_id), CHROMA_BATCH_SIZE):
    stop = start + CHROMA_BATCH_SIZE
    articles_collection.add(
        documents=articles_title[start:stop],
        embeddings=encoded_articles_title[start:stop],
        ids=articles_id[start:stop]
    )

# The topics table is added in a single call.
topics_collection.add(
    documents=topics_title,
    embeddings=encoded_topics_title,
    ids=topics_id
)

In [None]:
## Search
question = "السبب في صغر الأسنان بالمقارنة مع حجم الفكين"
question_embed = model.encode(question)

# Convert the embedding to a plain list for the query calls.
query_vector = question_embed.tolist()

# Closest single match from each collection.
articles_result = articles_collection.query(query_embeddings=query_vector, n_results=1)
topics_result = topics_collection.query(query_embeddings=query_vector, n_results=1)

for banner, result in (
    ('------------------------------------ articles :', articles_result),
    ('------------------------------------ topics :', topics_result),
):
    print(banner)
    print(result)

------------------------------------ articles :
{'ids': [['62770']], 'distances': [[0.39363694190979004]], 'metadatas': [[None]], 'embeddings': None, 'documents': [['أسباب الجز على الأسنان']], 'uris': None, 'data': None}
------------------------------------ topics :
{'ids': [['58']], 'distances': [[0.5809069871902466]], 'metadatas': [[None]], 'embeddings': None, 'documents': [['العناية بالفم و الأسنان']], 'uris': None, 'data': None}
