In [1]:
!pip install langchain chromadb openai tiktoken pypdf langchain_openai langchain-community

Collecting chromadb
  Downloading chromadb-1.1.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pypdf
  Downloading pypdf-6.1.0-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.3.33-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.30-py3-none-any.whl.metadata (3.0 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.37.0

In [2]:
# Read the OpenAI API key from Colab's `userdata` store instead of hard-coding it,
# and expose it to the rest of the notebook through the process environment.
import os

from google.colab import userdata

os.environ.update({"OPENAI_API_KEY": userdata.get("OPENAI_API_KEY")})

In [3]:
# Import Chroma from langchain-community directly: `langchain.vectorstores` is only a
# deprecated re-export of the same class and is not guaranteed to exist in newer
# `langchain` releases (the install above does not pin `langchain`).
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

In [4]:
# `Document` lives in langchain-core; `langchain.schema` is a legacy alias for it.
from langchain_core.documents import Document

# Five short cricket-player bios used as the sample corpus. Each document carries a
# `team` metadata field so that searches can later be restricted by team.
doc1 = Document(
    page_content="Virat Kohli is one of the greatest batsmen in cricket history. "
                 "He has scored more than 25,000 international runs and captained "
                 "Royal Challengers Bangalore in the IPL for several seasons.",
    metadata={'team': 'Royal Challengers Bangalore'}
)

doc2 = Document(
    page_content="MS Dhoni is known for his calm captaincy and finishing ability. "
                 "He has led Chennai Super Kings to multiple IPL titles.",
    metadata={'team': 'Chennai Super Kings'}
)

doc3 = Document(
    page_content="Rohit Sharma, nicknamed 'Hitman', is famous for his ability to "
                 "score double centuries in ODIs and for captaining Mumbai Indians.",
    metadata={'team': 'Mumbai Indians'}
)

# Shares its `team` value with doc1, so a team filter can match more than one document.
doc4 = Document(
    page_content="AB de Villiers, also known as Mr. 360, is admired for his "
                 "unorthodox shots and explosive batting style.",
    metadata={'team': 'Royal Challengers Bangalore'}
)

doc5 = Document(
    page_content="Rashid Khan is one of the top T20 bowlers in the world, "
                 "feared for his leg-spin and variations in the IPL.",
    metadata={'team': 'Gujarat Titans'}
)

In [5]:
# Collect the sample documents into a single list, in definition order.
docs = [
    doc1,
    doc2,
    doc3,
    doc4,
    doc5,
]

In [6]:
# Chroma collection "my_collection", persisted under the local "chroma_db" directory
# and using OpenAI embeddings to vectorise documents and queries.
vector_store = Chroma(
    collection_name="my_collection",
    persist_directory="chroma_db",
    embedding_function=OpenAIEmbeddings(),
)

  vector_store = Chroma(


In [7]:
# Display the list of Document objects (metadata and page_content) for inspection.
docs

[Document(metadata={'team': 'Royal Challengers Bangalore'}, page_content='Virat Kohli is one of the greatest batsmen in cricket history. He has scored more than 25,000 international runs and captained Royal Challengers Bangalore in the IPL for several seasons.'),
 Document(metadata={'team': 'Chennai Super Kings'}, page_content='MS Dhoni is known for his calm captaincy and finishing ability. He has led Chennai Super Kings to multiple IPL titles.'),
 Document(metadata={'team': 'Mumbai Indians'}, page_content="Rohit Sharma, nicknamed 'Hitman', is famous for his ability to score double centuries in ODIs and for captaining Mumbai Indians."),
 Document(metadata={'team': 'Royal Challengers Bangalore'}, page_content='AB de Villiers, also known as Mr. 360, is admired for his unorthodox shots and explosive batting style.'),
 Document(metadata={'team': 'Gujarat Titans'}, page_content='Rashid Khan is one of the top T20 bowlers in the world, feared for his leg-spin and variations in the IPL.')]

In [8]:
# Insert the documents into the collection; the call returns one id per document.
# NOTE(review): no explicit ids are supplied, so re-running this cell against the
# persisted "chroma_db" directory presumably stores another copy of each document — confirm.
vector_store.add_documents(documents=docs)

['4b65d7e8-5ac1-4982-ac27-a436543a9010',
 '7c3af609-adca-49ae-954b-356c3a12d740',
 'ea08ff99-9287-432d-86c8-9474a7704630',
 '9cc94157-c2e8-43b5-9dbc-bcf22cbde2c4',
 '73942844-5b12-4e01-be3c-b84d3b5a63b9']

In [9]:
# Inspect what is stored: ids plus the embeddings, raw text and metadata of each record.
vector_store.get(
    include=["embeddings", "documents", "metadatas"],
)

{'ids': ['4b65d7e8-5ac1-4982-ac27-a436543a9010',
  '7c3af609-adca-49ae-954b-356c3a12d740',
  'ea08ff99-9287-432d-86c8-9474a7704630',
  '9cc94157-c2e8-43b5-9dbc-bcf22cbde2c4',
  '73942844-5b12-4e01-be3c-b84d3b5a63b9'],
 'embeddings': array([[-0.0065006 , -0.00929543,  0.02696914, ..., -0.01507099,
         -0.00479954,  0.00959288],
        [ 0.00778978,  0.00231461,  0.00906435, ..., -0.03071915,
          0.00369432, -0.00407928],
        [-0.02245789,  0.00536337,  0.00697946, ..., -0.01949612,
          0.00875008, -0.01511787],
        [-0.02187739, -0.00310758, -0.00189642, ..., -0.02692603,
         -0.00155539,  0.01132754],
        [-0.00568425,  0.01161151,  0.02845082, ..., -0.0131943 ,
          0.00917493, -0.00208193]]),
 'documents': ['Virat Kohli is one of the greatest batsmen in cricket history. He has scored more than 25,000 international runs and captained Royal Challengers Bangalore in the IPL for several seasons.',
  'MS Dhoni is known for his calm captaincy and fin

In [10]:
# Retrieve the two stored documents most similar to the query text.
vector_store.similarity_search(query="Batsman", k=2)

[Document(metadata={'team': 'Royal Challengers Bangalore'}, page_content='AB de Villiers, also known as Mr. 360, is admired for his unorthodox shots and explosive batting style.'),
 Document(metadata={'team': 'Mumbai Indians'}, page_content="Rohit Sharma, nicknamed 'Hitman', is famous for his ability to score double centuries in ODIs and for captaining Mumbai Indians.")]

In [11]:
# Same query as above, but every hit is returned as a (Document, score) pair.
# NOTE(review): the recorded output lists hits in ascending score order, which
# suggests the score is a distance (lower = closer) — confirm.
vector_store.similarity_search_with_score(query="Batsman", k=2)

[(Document(metadata={'team': 'Royal Challengers Bangalore'}, page_content='AB de Villiers, also known as Mr. 360, is admired for his unorthodox shots and explosive batting style.'),
  0.3107920289039612),
 (Document(metadata={'team': 'Mumbai Indians'}, page_content="Rohit Sharma, nicknamed 'Hitman', is famous for his ability to score double centuries in ODIs and for captaining Mumbai Indians."),
  0.3134373426437378)]

In [12]:
# Metadata filtering: restrict the search to documents whose `team` matches.
# NOTE(review): the query text is empty, so the scores presumably say little about
# relevance and the filter alone decides which documents come back — confirm.
vector_store.similarity_search_with_score(
    query="",
    filter={"team": "Royal Challengers Bangalore"},
)

[(Document(metadata={'team': 'Royal Challengers Bangalore'}, page_content='AB de Villiers, also known as Mr. 360, is admired for his unorthodox shots and explosive batting style.'),
  0.6359187960624695),
 (Document(metadata={'team': 'Royal Challengers Bangalore'}, page_content='Virat Kohli is one of the greatest batsmen in cricket history. He has scored more than 25,000 international runs and captained Royal Challengers Bangalore in the IPL for several seasons.'),
  0.6604252457618713)]