In [9]:
import chromadb
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

In [10]:
# Load the Medium post titles dataset from a CSV path relative to the working directory
df = pd.read_csv("medium_post_titles.csv")
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False
3,equality,"""Call me Nat Love"" :Black Cowboys and the Fron...",,False
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False


In [12]:
# Categories to keep for the search index
topics_of_interest = ['artificial-intelligence', 'data-science', 'machine-learning']

# Drop rows that are missing any of the fields used below
df = df.dropna(subset=['title', 'subtitle', 'category'])

# Keep rows whose subtitle is not flagged as truncated and whose category is
# one of the topics of interest. The explicit .copy() makes the result an
# independent frame, so the column assignments below never write into a
# filtered selection of the loaded data.
df = df.loc[
    ~df["subtitle_truncated_flag"] & df['category'].isin(topics_of_interest)
].copy()

# Combine title and subtitle into the document text
df['text'] = df['title'] + " - " + df['subtitle']

# Per-row metadata dicts of the form {'text': ..., 'category': ...}.
# to_dict('records') builds them in one pass from the two columns instead of
# calling a Python lambda once per row via DataFrame.apply(axis=1); wrapping
# the list in an object Series keeps the column dtype stable even when the
# filter leaves no rows.
df['meta'] = pd.Series(
    df[['text', 'category']].to_dict('records'),
    index=df.index,
    dtype=object,
)

In [13]:
# Peek at the first two filtered rows, including the new 'text' and 'meta' columns
df.head(2)

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,text,meta
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False,"""Can I Train my Model on Your Computer?"" - How...","{'text': '""Can I Train my Model on Your Comput..."
289,data-science,(Robot) data scientists as a service,Automating data science with symbolic regressi...,False,(Robot) data scientists as a service - Automat...,{'text': '(Robot) data scientists as a service...


In [16]:
from typing import List
from chromadb.utils.embedding_functions import EmbeddingFunction

# Sentence-transformers encoder shared by the embedding function below
model = SentenceTransformer('all-MiniLM-L6-v2')


# Embedding function wrapper (compliant)
class MyEmbeddingFunction(EmbeddingFunction):
    """Embedding function that delegates to the module-level ``model``."""

    def __init__(self):
        # Intentionally does nothing; kept as an explicit hook
        # (original note: "required in future").
        return None

    def __call__(self, input: List[str]) -> List[List[float]]:
        """Encode ``input`` with ``model`` and return the result of ``.tolist()``."""
        vectors = model.encode(input)
        return vectors.tolist()


embedding_function = MyEmbeddingFunction()

In [17]:
# Chroma client created with default settings
chroma_client = chromadb.Client()

# Collection that stores the article texts; embeddings come from the custom
# embedding_function, and get_or_create=True is passed so an existing
# collection with this name can be reused (see the Chroma client reference).
article_collection = chroma_client.create_collection(
    name="medium-article",
    embedding_function=embedding_function,
    get_or_create=True,
)

In [19]:
# Store every filtered article: the DataFrame index label (as a string) is the
# record id, 'text' is the document, and 'meta' is its metadata dict.
article_collection.add(
    ids=list(map(str, df.index.tolist())),
    documents=list(df['text']),
    metadatas=list(df['meta']),
)

In [20]:
# Free-text search query; it is passed as query_texts to the collection query below
query = "What are the best ai libraries"

In [24]:
from pprint import pprint

# Search the collection with the raw query text (Chroma embeds it using the
# collection's embedding function), asking for the two closest matches and
# for documents, distances and metadata to be included in the response.
results = article_collection.query(
    query_texts=query,
    include=['documents', 'distances', 'metadatas'],
    n_results=2,
)
pprint(results)

{'data': None,
 'distances': [[0.7012187242507935, 0.7626003623008728]],
 'documents': [['Top 7 libraries and packages of the year for Data Science and '
                'AI: Python & R - This is a list of the best libraries and '
                'packages that changed our lives this year, compiled from my '
                'weekly digests',
                'What are Some ‘Advanced ‘ AI and Machine Learning Online '
                'Courses? - Where can you find advanced AI and machine '
                'learning courses? A comprehensive review based on my personal '
                'experience with these courses.']],
 'embeddings': None,
 'ids': [['103719', '112075']],
 'included': ['documents', 'distances', 'metadatas'],
 'metadatas': [[{'category': 'machine-learning',
                 'text': 'Top 7 libraries and packages of the year for Data '
                         'Science and AI: Python & R - This is a list of the '
                         'best libraries and packages that ch