Components to store:
* Embeddings
* Source texts
* Metadata
    * IDs and references
    * Additional data useful for filtering results

In [3]:
import chromadb

# Open (or create) the on-disk Chroma store.
# NOTE: this absolute path is machine-specific; point CHROMA_PATH at your own
# directory when reusing the notebook elsewhere.
CHROMA_PATH = '/Users/mac/Desktop/Projects/ml-practice/datacamp/tmp/chroma'
client = chromadb.PersistentClient(path=CHROMA_PATH)

# The store is persistent, so the collection outlives the kernel.
# get_or_create_collection keeps this cell re-runnable, whereas
# create_collection raises once 'netflix_titles' already exists.
collection = client.get_or_create_collection(name='netflix_titles')

# List collections
collections = client.list_collections()

# Add a single document.
# `where` filters match metadata fields, not record IDs, so the ID is mirrored
# into metadata to give the filtered query below something to match.
collection.add(
    ids=['1'],
    documents=['This is a test document.'],
    metadatas=[{'id': '1'}],
)

# Add multiple documents
collection.add(
    ids=['2', '3'],
    documents=['This is another test document.', 'This is yet another test document.'],
    metadatas=[{'id': '2'}, {'id': '3'}],
)

# Preview the first stored documents. Only the last expression of a cell is
# displayed automatically, so this one is printed explicitly.
print(collection.peek()['documents'])

# Query the collection with a metadata filter.
# query_texts lets the collection embed the text itself; query_embeddings
# expects numeric vectors and rejects a plain string.
collection.query(
    query_texts=['This is a test document.'],
    n_results=2,
    where={'id': '1'}
)

# Estimating embedding cost
import tiktoken

# Count tokens with the tokenizer that belongs to the embedding model.
EMBEDDING_MODEL = 'text-embedding-ada-002'
enc = tiktoken.encoding_for_model(EMBEDDING_MODEL)
total_tokens = sum(len(enc.encode(doc)) for doc in ['This is a test document.'])

# USD per 1K tokens for EMBEDDING_MODEL. The rate must match the model:
# text-embedding-ada-002 is listed at 0.0001, while 0.00002 is the
# text-embedding-3-small rate. TODO: confirm against the current pricing page.
cost_per_1k_tokens = 0.0001
cost = (total_tokens / 1000) * cost_per_1k_tokens

# Show the estimate; a bare assignment produces no cell output.
print(f'Total tokens: {total_tokens}')
print(f'Estimated cost: ${cost:.8f}')

PydanticImportError: `BaseSettings` has been moved to the `pydantic-settings` package. See https://docs.pydantic.dev/2.10/migration/#basesettings-has-moved-to-pydantic-settings for more details.

For further information visit https://errors.pydantic.dev/2.10/u/import-error

In [2]:
import pandas as pd
import os

# Report the directory the kernel is currently running in.
starting_dir = os.getcwd()
print(starting_dir)

# Move into the course folder so that relative file names (here and in later
# cells) resolve against it. NOTE: machine-specific absolute path.
PROJECT_DIR = '/Users/mac/Desktop/Projects/ml-practice/datacamp/AI Systems'
os.chdir(PROJECT_DIR)

# Load the titles and print the first five rows.
netflix_df = pd.read_csv('netflix_titles_1000.csv')
preview = netflix_df.head()
print(preview)

/Users/mac/Desktop/Projects/ml-practice/datacamp
  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  

In [None]:
# Querying the database
import chromadb

# Relies on `client` created in the setup cell above.
collection = client.get_collection(name='netflix_titles')

# Semantic search: the query text is embedded by the collection and the
# 10 nearest stored documents are returned.
result = collection.query(
    query_texts=['What are the top 10 movies?'],
    n_results=10,
)

# query() returns a dict of parallel lists ('ids', 'documents', 'metadatas',
# 'distances'), with one inner list per query text. There is no 'results'
# key, so print the matched documents instead.
print(result['documents'])

# Update a record: replace the text stored under an existing ID.
collection.update(ids=['1'], documents=['This is an updated test document.'])

# Upsert a record: update it if the ID exists, otherwise insert it.
collection.upsert(ids=['4'], documents=['This is a new test document.'])

# Delete a record by ID (the collection itself is kept).
collection.delete(ids=['2'])

# Delete all collections.
# client.reset() wipes the entire database and is refused unless the client
# was created with Settings(allow_reset=True). It is gated behind a flag so
# that "Run All" does not fail here or destroy the collection that the
# following cell still queries. Flip the flag only for a deliberate wipe.
RESET_DATABASE = False
if RESET_DATABASE:
    client.reset()

# Multiple Queries and Filtering

In [None]:
import csv 

# Accumulators that the CSV loop further down in this cell appends to.
ids, metadatas = [], []

# Fetch the stored text of a few known records and use those texts as queries.
reference_ids = ['1', '2', '3']
reference_records = collection.get(ids=reference_ids)
reference_texts = reference_records['documents']

# One query per reference text, each asking for its 2 closest documents.
result = collection.query(query_texts=reference_texts, n_results=2)

# Read every title from the CSV and store it in the collection together with
# metadata that later queries can filter on.
documents = []

# newline='' is what the csv module expects from the file object, and the
# encoding is explicit so the read does not depend on the platform default.
with open('netflix_titles_1000.csv', 'r', encoding='utf-8', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        ids.append(row['show_id'])
        # Text that gets embedded for semantic search.
        documents.append(
            f"Title: {row['title']} ({row['type']})\n"
            f"Description: {row['description']}\n"
            f"Categories: {row['listed_in']}"
        )
        metadata = {
            'type': row['type'],
            'title': row['title'],
            'director': row['director'],
            'cast': row['cast'],
            'country': row['country'],
            'date_added': row['date_added'],
            'rating': row['rating'],
            'duration': row['duration'],
            'listed_in': row['listed_in'],
            'description': row['description']
        }
        # csv yields every field as a string; store the year as an int so
        # numeric operators such as $gte can match it. Rows with a blank year
        # simply omit the key.
        release_year = row['release_year'].strip()
        if release_year:
            metadata['release_year'] = int(release_year)
        metadatas.append(metadata)

# upsert rather than update: update() only touches IDs that already exist,
# and these show IDs were never added, so update() would store nothing.
collection.upsert(ids=ids, documents=documents, metadatas=metadatas)

# Same question, restricted to records whose 'type' metadata equals 'Movie'.
movies_only = {'type': 'Movie'}
result = collection.query(
    query_texts=['What are the top 10 movies?'],
    where=movies_only,
    n_results=10,
)

# Multiple where filters: every condition listed under "$and" has to hold.
# NOTE(review): "$gte" compares against the number 2000, so it presumably only
# matches records whose 'release_year' metadata is stored as a number.
# Confirm how that field was written to the collection.
is_movie = {"type": {"$eq": "Movie"}}
released_since_2000 = {"release_year": {"$gte": 2000}}
rated_pg13 = {"rating": {"$eq": "PG-13"}}

result = collection.query(
    query_texts=['What are the top 10 movies?'],
    n_results=10,
    where={"$and": [is_movie, released_since_2000, rated_pg13]},
)