In [48]:
# Shell commands to get docker to set up and run marqo
! docker rm -f marqo
! docker pull marqoai/marqo:latest
# Adjusted usage limits, be careful with this as startup can be REALLY slow if messed around too much
! docker run --name marqo -it -p 8882:8882 -e "MARQO_MAX_DOC_BYTES=2000000" marqoai/marqo:latest


marqo
latest: Pulling from marqoai/marqo
Digest: sha256:62306d932a015efdf189163b04c6866b1b9d362b0260f6b709a4acbb9ebd6a71
Status: Image is up to date for marqoai/marqo:latest
docker.io/marqoai/marqo:latest
External vector store not configured. Using local vector store
Waiting for vector store to start
Marqo did not find an existing vector store. Setting up vector store...
  Vector store is available. Vector store setup complete
Starting Marqo throttling
Called Marqo throttling start command
Marqo throttling is now running
  warn(f"Failed to load image Python extension: {e}")
INFO:ModelsForStartup:pre-loading ['hf/e5-base-v2', 'open_clip/ViT-B-32/laion2b_s34b_b79k'] onto devices=['cpu']
INFO:marqo.core.index_management.vespa_application_package:Bootstrapping the vector store to 2.14.1
INFO:marqo.tensor_search.index_meta_cache:Starting index cache refresh thread


###########################################################
###########################################################
######

In [49]:
# Install marqo client and kaggle
! pip install marqo
! pip install kaggle
! pip install langchain_community
! pip install langchain



In [16]:
# Load in the data
! kaggle datasets download sudalairajkumar/indian-startup-funding
! unzip indian-startup-funding.zip -d csv-data/

Dataset URL: https://www.kaggle.com/datasets/sudalairajkumar/indian-startup-funding
License(s): CC0-1.0
Downloading indian-startup-funding.zip to /Users/mackenzieeng/dev/escape-rooms-01/notebooks/AI_chatbot
100%|█████████████████████████████████████████| 120k/120k [00:00<00:00, 172kB/s]
100%|█████████████████████████████████████████| 120k/120k [00:00<00:00, 172kB/s]
Archive:  indian-startup-funding.zip
  inflating: csv-data/startup_funding.csv  


In [17]:
# Process CSVs
import os 
from langchain_community.document_loaders.csv_loader import CSVLoader

def process_directory(directory_path):
    data = []
    for root, _, files in os.walk(directory_path):
        for file in files:

            file_path = os.path.join(root, file)
            print(f"Processing file: {file_path}")
            loader = CSVLoader(file_path=file_path)
            data.append({"File": file_path, "Data": loader.load()})

    return data

directory_path = "csv-data"
documents = process_directory(directory_path)

Processing file: csv-data/startup_funding.csv


In [18]:
# Inspecting documents

import pprint 
# documents is a list
documents
# accessing document name, it's a string
documents[0]['File']
# document data is in a list, each one being a row of data
documents[0]['Data']
# Each row of data is representated as a Document object, which can be stringified after getting information we need
pprint.pprint(documents[0]['Data'][0].page_content)

('\ufeffSr No: 1\n'
 'Date dd/mm/yyyy: 09/01/2020\n'
 'Startup Name: BYJU’S\n'
 'Industry Vertical: E-Tech\n'
 'SubVertical: E-learning\n'
 'City  Location: Bengaluru\n'
 'Investors Name: Tiger Global Management\n'
 'InvestmentnType: Private Equity Round\n'
 'Amount in USD: 20,00,00,000\n'
 'Remarks: ')


In [50]:
import marqo
import pprint

# Call client
mq = marqo.Client(url='http://localhost:8882')


In [51]:
# Create index
mq.create_index("my-first-index",
                model="flax-sentence-embeddings/all_datasets_v4_MiniLM-L6")

{'acknowledged': True, 'index': 'my-first-index'}

In [52]:
# Loop through documents to insert documents into marqo, significant improvement after switching models

def insert_documents_into_marqo(documents):
    for document in documents:
        file_name = document['File']
        document_data = document['Data']

        processed_document_data = []

        for row in document_data:
            page_content = str(row.page_content)
            document_data = {
                "Title": file_name,
                "Description": page_content,
            }

            processed_document_data.append(document_data)
        
        mq.index("my-first-index").add_documents(processed_document_data, tensor_fields=["Description"], client_batch_size=128
            )

insert_documents_into_marqo(documents)

2025-01-07 10:13:08,403 logger:'marqo' INFO     add_documents batch 0: took 26.180s for Marqo to process & index 128 docs. Roundtrip time: 26.195s.
2025-01-07 10:13:17,794 logger:'marqo' INFO     add_documents batch 1: took 9.382s for Marqo to process & index 128 docs. Roundtrip time: 9.390s.
2025-01-07 10:13:25,993 logger:'marqo' INFO     add_documents batch 2: took 8.190s for Marqo to process & index 128 docs. Roundtrip time: 8.199s.
2025-01-07 10:13:33,246 logger:'marqo' INFO     add_documents batch 3: took 7.243s for Marqo to process & index 128 docs. Roundtrip time: 7.253s.
2025-01-07 10:13:40,696 logger:'marqo' INFO     add_documents batch 4: took 7.442s for Marqo to process & index 128 docs. Roundtrip time: 7.452s.
2025-01-07 10:13:47,647 logger:'marqo' INFO     add_documents batch 5: took 6.944s for Marqo to process & index 128 docs. Roundtrip time: 6.952s.
2025-01-07 10:13:54,081 logger:'marqo' INFO     add_documents batch 6: took 6.426s for Marqo to process & index 128 docs. 

In [None]:
# Basic query with matching results

query = "Tell me about BYJU'S"

results = mq.index("my-first-index").search(
    q=query
)

results['hits']

[{'_id': '4db1d2be-7167-4bd9-9606-1208fcaf67e0',
  'Title': 'csv-data/startup_funding.csv',
  'Description': '\ufeffSr No: 1\nDate dd/mm/yyyy: 09/01/2020\nStartup Name: BYJU’S\nIndustry Vertical: E-Tech\nSubVertical: E-learning\nCity  Location: Bengaluru\nInvestors Name: Tiger Global Management\nInvestmentnType: Private Equity Round\nAmount in USD: 20,00,00,000\nRemarks: ',
  '_highlights': [{'Description': '\ufeffSr No: 1\nDate dd/mm/yyyy: 09/01/2020\nStartup Name: BYJU’S\nIndustry Vertical: E-Tech\nSubVertical: E-learning\nCity  Location: Bengaluru\nInvestors Name: Tiger Global Management\nInvestmentnType: Private Equity Round\nAmount in USD: 20,00,00,000\nRemarks:'}],
  '_score': 0.6240473214919875},
 {'_id': '20243032-c23d-4529-b6b4-32e80cf59ffa',
  'Title': 'csv-data/startup_funding.csv',
  'Description': '\ufeffSr No: 68\nDate dd/mm/yyyy: 10/07/2019\nStartup Name: "BYJU\\\\\'S"\nIndustry Vertical: EdTech\nSubVertical: Education\nCity  Location: Bengaluru\nInvestors Name: Qatar In

In [54]:
# Context for query response
contexts = results['hits']
contexts

[{'_id': '4db1d2be-7167-4bd9-9606-1208fcaf67e0',
  'Title': 'csv-data/startup_funding.csv',
  'Description': '\ufeffSr No: 1\nDate dd/mm/yyyy: 09/01/2020\nStartup Name: BYJU’S\nIndustry Vertical: E-Tech\nSubVertical: E-learning\nCity  Location: Bengaluru\nInvestors Name: Tiger Global Management\nInvestmentnType: Private Equity Round\nAmount in USD: 20,00,00,000\nRemarks: ',
  '_highlights': [{'Description': '\ufeffSr No: 1\nDate dd/mm/yyyy: 09/01/2020\nStartup Name: BYJU’S\nIndustry Vertical: E-Tech\nSubVertical: E-learning\nCity  Location: Bengaluru\nInvestors Name: Tiger Global Management\nInvestmentnType: Private Equity Round\nAmount in USD: 20,00,00,000\nRemarks:'}],
  '_score': 0.6240473214919875},
 {'_id': '20243032-c23d-4529-b6b4-32e80cf59ffa',
  'Title': 'csv-data/startup_funding.csv',
  'Description': '\ufeffSr No: 68\nDate dd/mm/yyyy: 10/07/2019\nStartup Name: "BYJU\\\\\'S"\nIndustry Vertical: EdTech\nSubVertical: Education\nCity  Location: Bengaluru\nInvestors Name: Qatar In

In [55]:
# Build out an augmented query with contexts
augmented_query = "".join(
    [f"<Context>\n{context}\n</Context>\n" for context in contexts]) + "\nQuestion: " + query
pprint.pprint(augmented_query)

('<Context>\n'
 "{'_id': '4db1d2be-7167-4bd9-9606-1208fcaf67e0', 'Title': "
 "'csv-data/startup_funding.csv', 'Description': '\\ufeffSr No: 1\\nDate "
 'dd/mm/yyyy: 09/01/2020\\nStartup Name: BYJU’S\\nIndustry Vertical: '
 'E-Tech\\nSubVertical: E-learning\\nCity  Location: Bengaluru\\nInvestors '
 'Name: Tiger Global Management\\nInvestmentnType: Private Equity '
 "Round\\nAmount in USD: 20,00,00,000\\nRemarks: ', '_highlights': "
 "[{'Description': '\\ufeffSr No: 1\\nDate dd/mm/yyyy: 09/01/2020\\nStartup "
 'Name: BYJU’S\\nIndustry Vertical: E-Tech\\nSubVertical: E-learning\\nCity  '
 'Location: Bengaluru\\nInvestors Name: Tiger Global '
 'Management\\nInvestmentnType: Private Equity Round\\nAmount in USD: '
 "20,00,00,000\\nRemarks:'}], '_score': 0.6240473214919875}\n"
 '</Context>\n'
 '<Context>\n'
 "{'_id': '20243032-c23d-4529-b6b4-32e80cf59ffa', 'Title': "
 "'csv-data/startup_funding.csv', 'Description': '\\ufeffSr No: 68\\nDate "
 'dd/mm/yyyy: 10/07/2019\\nStartup Name: "BYJU\\\

In [62]:
# Create a system prompt for LLM to play a role 
system_prompt = f'''
    Consider only the context given when answering the user's questions.
    '''

In [69]:
# Install ollama to run LLM locally
! pip install ollama
! ollama pull qwen2.5

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest 
pulling 2bada8a74506...  56% ▕█████████       ▏ 2.6 GB/4.7 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 2bada8a74506...  56% ▕█████████       ▏ 2.6 GB/4.7 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 2bada8a74506...  56% ▕█████████       ▏ 2.6 GB/4.7 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 2bada8a74506...  56% ▕█████████       ▏ 2.6 GB/4.7 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 2bada8a74506...  56% ▕█████████       ▏ 2.6 GB/4.7 GB             

In [70]:
# Generate response from ollama by feeding in contexts and augmented query into LLM
import ollama

response = ollama.chat(
    model='qwen2.5',
    messages=[
        {
            'role': 'system',
            'content' : system_prompt,
        },
        {
            'role': 'user',
            'content': augmented_query,
        },
    ])
print(response['message']['content'])

Here are some details about BYJU'S from the provided data:

- **Industry Vertical:** Education and Consumer Internet
- **SubVerticals:**
  - Educational Video Content Creator
  - Online Learning App
- **Location:** Bangalore
- **Investors:**
  - Sequoia India, Sofina (for educational video content creator)
  - Chan Zuckerberg Initiative, Times Internet Ltd, Sequoia Capital, Sofina, Lightspeed Ventures (for private equity in online learning app)
- **Investment Amounts:**
  - $75,000,000 on March 22, 2016
  - $500,000,000 on September 9, 2016

BYJU'S seems to be a significant player in the educational technology space with multiple rounds of investment and focus on both video content creation and online learning apps.
