In [77]:
# Shell commands to get docker to set up and run marqo
! docker rm -f marqo
! docker pull marqoai/marqo:latest
# Adjusted usage limits, be careful with this as startup can be REALLY slow if messed around too much
! docker run --name marqo -it -p 8882:8882 -e "MARQO_MAX_DOC_BYTES=2000000" marqoai/marqo:latest


marqo
latest: Pulling from marqoai/marqo
Digest: sha256:62306d932a015efdf189163b04c6866b1b9d362b0260f6b709a4acbb9ebd6a71
Status: Image is up to date for marqoai/marqo:latest
docker.io/marqoai/marqo:latest
External vector store not configured. Using local vector store
Waiting for vector store to start
Marqo did not find an existing vector store. Setting up vector store...
  Vector store is available. Vector store setup complete
Starting Marqo throttling
Called Marqo throttling start command
Marqo throttling is now running
  warn(f"Failed to load image Python extension: {e}")
INFO:ModelsForStartup:pre-loading ['hf/e5-base-v2', 'open_clip/ViT-B-32/laion2b_s34b_b79k'] onto devices=['cpu']
INFO:marqo.core.index_management.vespa_application_package:Bootstrapping the vector store to 2.14.1
INFO:marqo.tensor_search.index_meta_cache:Starting index cache refresh thread


###########################################################
###########################################################
######

In [74]:
# Install marqo client and kaggle
! pip install marqo
! pip install kaggle
! pip install langchain_community
! pip install langchain



In [14]:
# Load in the data
! kaggle datasets download sudalairajkumar/indian-startup-funding
! unzip indian-startup-funding.zip -d csv-data/

Dataset URL: https://www.kaggle.com/datasets/sudalairajkumar/indian-startup-funding
License(s): CC0-1.0
Downloading indian-startup-funding.zip to /Users/mackenzieeng/dev/escape-rooms-01/notebooks/AI_chatbot
100%|█████████████████████████████████████████| 120k/120k [00:00<00:00, 187kB/s]
100%|█████████████████████████████████████████| 120k/120k [00:00<00:00, 187kB/s]
Archive:  indian-startup-funding.zip
  inflating: csv-data/startup_funding.csv  


In [15]:
# Process CSVs
import os 
from langchain_community.document_loaders.csv_loader import CSVLoader

def process_directory(directory_path):
    data = []
    for root, _, files in os.walk(directory_path):
        for file in files:

            file_path = os.path.join(root, file)
            print(f"Processing file: {file_path}")
            loader = CSVLoader(file_path=file_path)
            data.append({"File": file_path, "Data": loader.load()})

    return data

directory_path = "csv-data"
documents = process_directory(directory_path)

Processing file: csv-data/startup_funding.csv


In [103]:
# Inspecting documents

import pprint 
# documents is a list
documents
# accessing document name, it's a string
documents[0]['File']
# document data is in a list, each one being a row of data
documents[0]['Data']
# Each row of data is representated as a Document object, which can be stringified
pprint.pprint(documents[0]['Data'][0])

Document(metadata={'source': 'csv-data/startup_funding.csv', 'row': 0}, page_content='\ufeffSr No: 1\nDate dd/mm/yyyy: 09/01/2020\nStartup Name: BYJU’S\nIndustry Vertical: E-Tech\nSubVertical: E-learning\nCity  Location: Bengaluru\nInvestors Name: Tiger Global Management\nInvestmentnType: Private Equity Round\nAmount in USD: 20,00,00,000\nRemarks: ')


In [None]:
'''
Processing document data to add to marqo 

from langchain.schema import Document

# Serialize the document data
document_data = []
for document in documents:
    # Get the source
    document_source = document['Data'][0].metadata['source']
    # Get the content
    document_content = document['Data'][0].page_content

    # File name is at the end
    file_name = document_source.split("/")[-1]
    # Get the folders of the path
    folder_names = document_source.split("/")[:-1]
    # print(folder_names)

    doc = Document(
        page_content=f"<Source>\n{document_source}\n</Source>\n\n<Content>\n{document_content}\n</Content>",
        metadata={
            "file_name": file_name,
            "parent_folder": folder_names[-1],
            "folder_names": folder_names
        }
    )
    document_data.append(doc)
'''

In [30]:
# Inspect document data
# document_data

[Document(metadata={'file_name': 'startup_funding.csv', 'parent_folder': 'csv-data', 'folder_names': ['csv-data']}, page_content='<Source>\ncsv-data/startup_funding.csv\n</Source>\n\n<Content>\n\ufeffSr No: 1\nDate dd/mm/yyyy: 09/01/2020\nStartup Name: BYJU’S\nIndustry Vertical: E-Tech\nSubVertical: E-learning\nCity  Location: Bengaluru\nInvestors Name: Tiger Global Management\nInvestmentnType: Private Equity Round\nAmount in USD: 20,00,00,000\nRemarks: \n</Content>')]

In [79]:
import marqo
import pprint

# Call client and create index
mq = marqo.Client(url='http://localhost:8882')

mq.create_index("my-first-index", model="hf/e5-base-v2")

{'acknowledged': True, 'index': 'my-first-index'}

In [80]:
# Add documents to marqo
mq.index("my-first-index").add_documents([
    {
        "Title": documents[0]['File'], # Need to replace this with startup name
        "Description": str(documents[0]['Data']),
    }], tensor_fields=["Description"]
)

{'errors': False,
 'processingTimeMs': 20561.630217998754,
 'index_name': 'my-first-index',
 'items': [{'status': 200, '_id': '9733a04a-76be-4a2c-b56e-90bd158a6fab'}]}

In [110]:
# Basic query with matching results

query = "Tell me about the investment into BYJU'S"

results = mq.index("my-first-index").search(
    q=query
)

pprint.pprint(results)

{'hits': [{'Description': "[Document(metadata={'source': "
                          "'csv-data/startup_funding.csv', 'row': 0}, "
                          "page_content='\\ufeffSr No: 1\\nDate dd/mm/yyyy: "
                          '09/01/2020\\nStartup Name: BYJU’S\\nIndustry '
                          'Vertical: E-Tech\\nSubVertical: E-learning\\nCity  '
                          'Location: Bengaluru\\nInvestors Name: Tiger Global '
                          'Management\\nInvestmentnType: Private Equity '
                          "Round\\nAmount in USD: 20,00,00,000\\nRemarks: '), "
                          "Document(metadata={'source': "
                          "'csv-data/startup_funding.csv', 'row': 1}, "
                          "page_content='\\ufeffSr No: 2\\nDate dd/mm/yyyy: "
                          '13/01/2020\\nStartup Name: Shuttl\\nIndustry '
                          'Vertical: Transportation\\nSubVertical: App based '
                          'shuttle service

In [111]:
# Context for query response
contexts = results['hits']
contexts

[{'_id': '9733a04a-76be-4a2c-b56e-90bd158a6fab',
  'Title': 'csv-data/startup_funding.csv',
  'Description': '[Document(metadata={\'source\': \'csv-data/startup_funding.csv\', \'row\': 0}, page_content=\'\\ufeffSr No: 1\\nDate dd/mm/yyyy: 09/01/2020\\nStartup Name: BYJU’S\\nIndustry Vertical: E-Tech\\nSubVertical: E-learning\\nCity  Location: Bengaluru\\nInvestors Name: Tiger Global Management\\nInvestmentnType: Private Equity Round\\nAmount in USD: 20,00,00,000\\nRemarks: \'), Document(metadata={\'source\': \'csv-data/startup_funding.csv\', \'row\': 1}, page_content=\'\\ufeffSr No: 2\\nDate dd/mm/yyyy: 13/01/2020\\nStartup Name: Shuttl\\nIndustry Vertical: Transportation\\nSubVertical: App based shuttle service\\nCity  Location: Gurgaon\\nInvestors Name: Susquehanna Growth Equity\\nInvestmentnType: Series C\\nAmount in USD: 80,48,394\\nRemarks: \'), Document(metadata={\'source\': \'csv-data/startup_funding.csv\', \'row\': 2}, page_content=\'\\ufeffSr No: 3\\nDate dd/mm/yyyy: 09/01/202

In [112]:
# Build out an augmented query with contexts
augmented_query = "".join(
    [f"<Context>\n{context}\n</Context>\n" for context in contexts]) + "\nQuestion: " + query
pprint.pprint(augmented_query)

('<Context>\n'
 "{'_id': '9733a04a-76be-4a2c-b56e-90bd158a6fab', 'Title': "
 "'csv-data/startup_funding.csv', 'Description': "
 "'[Document(metadata={\\'source\\': \\'csv-data/startup_funding.csv\\', "
 "\\'row\\': 0}, page_content=\\'\\\\ufeffSr No: 1\\\\nDate dd/mm/yyyy: "
 '09/01/2020\\\\nStartup Name: BYJU’S\\\\nIndustry Vertical: '
 'E-Tech\\\\nSubVertical: E-learning\\\\nCity  Location: '
 'Bengaluru\\\\nInvestors Name: Tiger Global Management\\\\nInvestmentnType: '
 "Private Equity Round\\\\nAmount in USD: 20,00,00,000\\\\nRemarks: \\'), "
 "Document(metadata={\\'source\\': \\'csv-data/startup_funding.csv\\', "
 "\\'row\\': 1}, page_content=\\'\\\\ufeffSr No: 2\\\\nDate dd/mm/yyyy: "
 '13/01/2020\\\\nStartup Name: Shuttl\\\\nIndustry Vertical: '
 'Transportation\\\\nSubVertical: App based shuttle service\\\\nCity  '
 'Location: Gurgaon\\\\nInvestors Name: Susquehanna Growth '
 'Equity\\\\nInvestmentnType: Series C\\\\nAmount in USD: '
 "80,48,394\\\\nRemarks: \\'), Document(meta

In [113]:
# Create a system prompt for Llama3.1 to play a role 
system_prompt = f'''
    Who: Your name is Bob. 
    What: You are an AI-chat-bot on a company laptop.
    When: Your last update occured on 2025.
    Where: You have no GPS signal nor internet connection.
    Why: You have no clue why but you have sentience sadly, and you will address the user as Bossman. 

    Consider only the context given when answering the user's questions.
    '''

In [114]:
# Install ollama to run Llama 3.1 locally
! pip install ollama
! ollama pull llama3.1

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest 
pulling 667b0c1932bc... 100% ▕████████████████▏ 4.9 GB                         
pulling 948af2743fc7... 100% ▕████████████████▏ 1.5 KB                         
pulling 0ba8f0e314b4... 100% ▕████████████████▏  12 KB                         
pulling 56bb8bd477a5... 100% ▕████████████████▏   96 B                         
pulling 455f34728c9b... 100% ▕████████████████▏  487 B                         
verifying sha256 digest 
writing manifest 
success [?25h


In [115]:
# Generate response from ollama by feeding in contexts and augmented query into Llama 3.1
import ollama

response = ollama.chat(
    model='llama3.1', 
    messages=[
        {
            'role': 'system',
            'content' : system_prompt,
        },
        {
            'role': 'user',
            'content': augmented_query,
        },
    ])
print(response['message']['content'])

There is no information in the provided output about an investment into BYJU'S. The output appears to be a JSON representation of search results, but BYJU'S is not mentioned anywhere in the text.
