### Semantic Search Using Chroma DB

In [3]:
import chromadb
import pandas as pd

In [41]:
df = pd.read_csv("medium_post_titles.csv")

df = df.dropna()
df = df[~df["subtitle_truncated_flag"]]

topics_of_interest = ['artificial-intelligence', 'data-science', 'machine-learning']

df = df[df['category'].isin(topics_of_interest)]

df['text'] = df['title'] + df['subtitle']

df['meta'] = df.apply( lambda x: {
    'text': x['text'],
    'category': x['category']
}, axis=1)

In [20]:
df.head()

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,text,meta
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False,"""Can I Train my Model on Your Computer?""How we...","{'text': '""Can I Train my Model on Your Comput..."
289,data-science,(Robot) data scientists as a service,Automating data science with symbolic regressi...,False,(Robot) data scientists as a serviceAutomating...,{'text': '(Robot) data scientists as a service...
448,data-science,10 Free tools to get started with Data Visuali...,Jump right into the Data Visualisation process...,False,10 Free tools to get started with Data Visuali...,{'text': '10 Free tools to get started with Da...
454,data-science,10 Great Programming Projects to Improve Your ...,"Improve your skills in web development, progra...",False,10 Great Programming Projects to Improve Your ...,{'text': '10 Great Programming Projects to Imp...
487,machine-learning,10 Lessons Learned From Participating in Googl...,"Quick, Draw! Doodle Recognition Challenge was ...",False,10 Lessons Learned From Participating in Googl...,{'text': '10 Lessons Learned From Participatin...


In [56]:
# Chroma DB Setup
current_df = df

chrome_client = chromadb.PersistentClient("local_dbs/chroma_db")

# Collection creation
article_collection = chroma_client.get_or_create_collection(name="medium_article")

indexes = [f"{x}" for x in current_df.index.tolist()]

len(indexes)

4082

In [57]:
# Inserting data
article_collection.upsert(
    ids=indexes,
    documents=current_df['text'].tolist(),
    metadatas=current_df['meta'].tolist()
)

In [59]:
qry_str = "best data science library"
article_collection.query(query_texts=qry_str, n_results=1)

{'ids': [['65427']],
 'distances': [[0.5896157026290894]],
 'metadatas': [[{'category': 'data-science',
    'text': 'My Favorite Data Science/Machine Learning ResourcesA summary of sources to get into Data Science'}]],
 'embeddings': None,
 'documents': [['My Favorite Data Science/Machine Learning ResourcesA summary of sources to get into Data Science']]}

In [60]:
qry_str = "best data ai library"
article_collection.query(query_texts=qry_str, n_results=1)

{'ids': [['103719']],
 'distances': [[0.6521782875061035]],
 'metadatas': [[{'category': 'machine-learning',
    'text': 'Top 7 libraries and packages of the year for Data Science and AI: Python & RThis is a list of the best libraries and packages that changed our lives this year, compiled from my weekly digests'}]],
 'embeddings': None,
 'documents': [['Top 7 libraries and packages of the year for Data Science and AI: Python & RThis is a list of the best libraries and packages that changed our lives this year, compiled from my weekly digests']]}