# Semantic Search Using Chroma DB

In [3]:
# !pip install chromadb --user
import pandas as pd
import chromadb


In [4]:
# data source: https://www.kaggle.com/datasets/nulldata/medium-post-titles
df = pd.read_csv("medium_post_titles.csv")

df = df.dropna()
df = df[~df["subtitle_truncated_flag"]]

topics_of_interest = ['artificial-intelligence', 'data-science', 'machine-learning']
# topics_of_interest = ['data-science']

df = df[df['category'].isin(topics_of_interest)]

df['text'] = df['title']  + df['subtitle']

df['meta'] = df.apply( lambda x: {
    'text': x['text'],
    'category': x['category']  
}, axis=1)

In [10]:
df.head(2)

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,text,meta
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False,"""Can I Train my Model on Your Computer?""How we...","{'text': '""Can I Train my Model on Your Comput..."
289,data-science,(Robot) data scientists as a service,Automating data science with symbolic regressi...,False,(Robot) data scientists as a serviceAutomating...,{'text': '(Robot) data scientists as a service...


## Chroma DB Setup 

In [25]:
from chromadb.config import Settings

In [26]:
# Chroma DB Setup
chroma_client = chromadb.Client(Settings(
    persist_directory="medium-chroma-db",
    chroma_db_impl="duckdb+parquet"
))  # persistent memory

# collection creation
article_collection = chroma_client.create_collection(name="medium-article")

Using embedded DuckDB with persistence: data will be stored in: medium-chroma-db
No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


## Data Insertion

In [27]:
# inserting data

article_collection.upsert(
    ids=[f"{x}" for x in df.index.tolist()],
    documents=df['text'].tolist(),
    metadatas=df['meta'].tolist()    
)

## Vector Query

In [28]:
qry_str = "best data ai library?"

In [29]:
article_collection.query(query_texts=qry_str, n_results=2)

{'ids': [['103719', '24137']],
 'embeddings': None,
 'documents': [['Top 7 libraries and packages of the year for Data Science and AI: Python & RThis is a list of the best libraries and packages that changed our lives this year, compiled from my weekly digests',
   'Data Commons Version 1.0: A Framework to Build Toward AI for GoodA roadmap for data from the 2018 AI for Good Summit']],
 'metadatas': [[{'text': 'Top 7 libraries and packages of the year for Data Science and AI: Python & RThis is a list of the best libraries and packages that changed our lives this year, compiled from my weekly digests',
    'category': 'machine-learning'},
   {'text': 'Data Commons Version 1.0: A Framework to Build Toward AI for GoodA roadmap for data from the 2018 AI for Good Summit',
    'category': 'artificial-intelligence'}]],
 'distances': [[0.6297022700309753, 0.7120211720466614]]}

In [None]:
# article_collection.delete()