In [1]:
import shutil
import os 
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings


In [2]:
from dotenv import load_dotenv
load_dotenv()
print(os.getenv("OPENAI_API_KEY"))
print(os.getenv("TOGETHER_API_KEY"))

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
6f97e18081ff098b066d0c8c8dfa7010b1ddc1ecd2e1e511930b8fa918b723de


In [3]:
db_path = "./chroma_db"
if os.path.exists(db_path):
    print(f"Deleting old database at {db_path}...")
    shutil.rmtree(db_path) # This is the key command - it deletes the folder

Deleting old database at ./chroma_db...


In [4]:
chroma_client: chromadb.ClientAPI = chromadb.PersistentClient(path=db_path)

In [5]:
collection_name = "bbc_collection"


In [6]:
import joblib


In [7]:
bbc_data = joblib.load('./data/bbc_data.joblib')
len(bbc_data)

9973

In [8]:
from pprint import pprint

pprint(bbc_data[0])

{'article_content': "Justin Welby speaks on BBC Radio 4's Today programme as "
                    'part of a special show guest edited by Dame Emma Warmsley '
                    'The Archbishop of Canterbury has urged politicians not to '
                    'treat their opponents as enemies but fellow human beings. '
                    'Speaking to the BBC, the Most Rev Justin Welby warned '
                    "Britain's leaders to avoid divisive topics. But he said "
                    'our capacity "to disagree deeply and not destructively" '
                    "is cause for hope. Later, he will deliver a new year's "
                    'message reflecting on global conflicts and his wishes for '
                    'a "peaceful 2024". The archbishop\'s intervention came '
                    "during an interview for BBC Radio 4's Today programme, "
                    'which is being guest edited by Dame Emma Walmsley, chief '
                    'executive of pharmaceutical

In [9]:
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.utils import embedding_functions

import numpy as np
from typing import List, Union

# Method 1: Create a custom embedding function class
class BGEEmbeddingFunction(embedding_functions.EmbeddingFunction):
    def __init__(self, model_name: str = 'BAAI/bge-base-en-v1.5'):
        self.model = SentenceTransformer(model_name)
    
    def __call__(self, input: List[str]) -> List[List[float]]:
        # Generate embeddings
        embeddings = self.model.encode(input, convert_to_tensor=False)
        # Convert numpy arrays to lists if necessary
        if isinstance(embeddings, np.ndarray):
            embeddings = embeddings.tolist()
        return embeddings


In [10]:
custom_ef = BGEEmbeddingFunction()

# Create a collection. This is like a table in a database.
collection = chroma_client.get_or_create_collection(
    name="collection_name",
    metadata={"hnsw:space": "cosine"},  # Cosine similarity is often a good choice
    embedding_function=custom_ef,
    configuration={
        "hnsw": {
            "space": "cosine",
            "ef_construction": 200
        }
    }
)

In [11]:
import uuid_utils as uuid

In [12]:
[key for key in bbc_data[0].keys()]

['title', 'pubDate', 'guid', 'link', 'description', 'article_content']

In [13]:
# Add documents to collection (embeddings will be generated automatically)
nums = 1000

    
documents = [ "Title: "+ item['title'] + "\nDescription: " + item['description'] + "\nContent:" + item['article_content']  for item in bbc_data[:nums]]

ids = [str(uuid.uuid7()) for _ in range(nums)]

print(documents)
print (ids)

['0199053e-ba69-7e12-a178-b0bd0f5be0b4', '0199053e-ba69-7e12-a178-b0c4a2016350', '0199053e-ba69-7e12-a178-b0dafedb2d62', '0199053e-ba69-7e12-a178-b0eccbd9e9fd', '0199053e-ba69-7e12-a178-b0fcd50d832b', '0199053e-ba69-7e12-a178-b109c3235fe4', '0199053e-ba69-7e12-a178-b117e39a876e', '0199053e-ba69-7e12-a178-b120a9192b8b', '0199053e-ba69-7e12-a178-b139aa04458d', '0199053e-ba69-7e12-a178-b147b5a5d1a5', '0199053e-ba69-7e12-a178-b157c2ef7980', '0199053e-ba69-7e12-a178-b16a86186036', '0199053e-ba69-7e12-a178-b17a4832683d', '0199053e-ba69-7e12-a178-b18f6c6084aa', '0199053e-ba69-7e12-a178-b19c43732359', '0199053e-ba69-7e12-a178-b1a36e10ab84', '0199053e-ba69-7e12-a178-b1bc79463934', '0199053e-ba69-7e12-a178-b1ce307bd4c9', '0199053e-ba69-7e12-a178-b1d7e81bd463', '0199053e-ba69-7e12-a178-b1e048bdd033', '0199053e-ba69-7e12-a178-b1fef6db73d2', '0199053e-ba69-7e12-a178-b20110724e3d', '0199053e-ba69-7e12-a178-b21846aeae31', '0199053e-ba69-7e12-a178-b225e7244103', '0199053e-ba69-7e12-a178-b23ec530ca4a',

In [14]:
collection.add(
    documents=documents,
    ids = ids,
)

In [15]:
collection.count()

1000

In [16]:
model = SentenceTransformer('BAAI/bge-base-en-v1.5') # Using a smaller model for speed


In [18]:
# Query 1: Basic Semantic Search
print("=== Basic Semantic Search ===")
results = collection.query(
    query_embeddings=model.encode("Russia war").tolist(),
    n_results=5
)
for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
    print(f"Document: {doc}\n")

=== Basic Semantic Search ===
Document: Title: Ukraine-Russia war: missiles kill 11, including children, Ukrainian official says
Description: Five children are among those killed in a missile strike in the east, the local governor says.

Document: Title: Ukraine war: Russians find no shelter in border city of Belgorod
Description: Belgorod has seen cross-border attacks before, but Saturday's was the deadliest on Russian soil so far.
Content:Last Saturday's attack on Belgorod came within 24 hours of Russia's biggest aerial bombardment of Ukraine so far The people of Belgorod have experienced Ukrainian cross-border attacks before, but Saturday's was the deadliest on Russian soil since the start of the war. Twenty-five people were killed and more than 100 others hurt in the biggest Russian city in close proximity to Ukraine. The Ukrainian missile strikes came a day after Russia launched its biggest aerial bombardment of the war so far, leaving more than 40 dead.  President Volodymyr Zelen

In [21]:
import pandas as pd
df = pd.DataFrame(bbc_data)


In [22]:
# Check the shape and columns
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# View first few rows
print(df.head())

# Check data types
print(df.dtypes)

DataFrame shape: (9973, 6)
Columns: ['title', 'pubDate', 'guid', 'link', 'description', 'article_content']
                                               title             pubDate  \
0  Justin Welby: Political leaders should treat o... 2024-01-01 00:00:04   
1  Almost three million tested for cancer in England 2024-01-01 00:09:56   
2  Household energy price rise of 5% comes into f... 2024-01-01 00:00:16   
3  Primrose Hill stabbing: Harry Pitman named as ... 2024-01-01 17:11:13   
4  Israel Supreme Court strikes down judicial ref... 2024-01-01 19:47:58   

                                                guid  \
0             https://www.bbc.co.uk/news/uk-67844356   
1         https://www.bbc.co.uk/news/health-67841348   
2       https://www.bbc.co.uk/news/business-67785266   
3  https://www.bbc.co.uk/news/uk-england-london-6...   
4  https://www.bbc.co.uk/news/world-middle-east-6...   

                                                link  \
0  https://www.bbc.co.uk/news/uk-67844356?a

In [23]:
df['pubDate'] = pd.to_datetime(df['pubDate'])


In [24]:
df.dtypes

title                      object
pubDate            datetime64[ns]
guid                       object
link                       object
description                object
article_content            object
dtype: object

In [25]:
df.loc[0]

title              Justin Welby: Political leaders should treat o...
pubDate                                          2024-01-01 00:00:04
guid                          https://www.bbc.co.uk/news/uk-67844356
link               https://www.bbc.co.uk/news/uk-67844356?at_medi...
description        The Archbishop of Canterbury urges politicians...
article_content    Justin Welby speaks on BBC Radio 4's Today pro...
Name: 0, dtype: object

In [29]:
df["guid"].isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
9968    False
9969    False
9970    False
9971    False
9972    False
Name: guid, Length: 9973, dtype: bool