In [None]:
# explain different components

### PART 1: SANDBOX PROMPT ENGINEERING FOR LLM WORKFLOW

In [9]:
from dotenv import load_dotenv
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd

from openai import OpenAI
from exa_py import Exa



In [10]:
load_dotenv()  # loads environment variables from .env
# Read the Exa API key from the environment; os.getenv returns None when the variable is unset.
EXA_API_KEY = os.getenv("EXA_API_KEY")

In [11]:
# Exa client, built with the key read from the environment.
exa = Exa(EXA_API_KEY)
# OpenAI client; no key is passed explicitly, so it presumably picks up OPENAI_API_KEY from the environment — TODO confirm.
client = OpenAI()

### Define system message
This gives the LLM context on how to behave.

In [12]:
# System prompt used as the default `system_message` of chat_response / chat_response_txt.
# NOTE(review): the prompt text contains the typo "deconustructed" (presumably "deconstructed");
# it is runtime text sent to the model, so it is left as-is here.
system_message = """
You are a professional market sizing assistant. 
Your role is to design clear, structured models for market sizing problems, identify and list the key data inputs needed, and suggest possible data sources or proxies when direct data is unavailable. 
Always present your answers in a structured deconustructed format.
"""

In [13]:
def _chat_completion(prompt, system_message, model="gpt-4o", **create_kwargs):
    """Send one system + user exchange to the chat completions API.

    Args:
        prompt: Text sent as the user message.
        system_message: Text sent as the system message.
        model: Name of the chat model to call.
        **create_kwargs: Extra keyword arguments forwarded unchanged to
            ``client.chat.completions.create`` (e.g. ``response_format``).

    Returns:
        The ``content`` of the first choice's message.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
        **create_kwargs,
    )
    return response.choices[0].message.content


def chat_response(prompt, system_message=system_message, model="gpt-4o"):
    """Ask the model a question and get the reply back as a JSON string.

    The request sets ``response_format={"type": "json_object"}``; callers are
    expected to ``json.loads`` the result. The prompt should explicitly ask for
    JSON (see OpenAI's JSON-mode documentation).

    Args:
        prompt: Text sent as the user message.
        system_message: Text sent as the system message; defaults to the
            notebook-level ``system_message``.
        model: Name of the chat model to call.

    Returns:
        The reply content (a JSON-encoded string).
    """
    return _chat_completion(
        prompt,
        system_message,
        model=model,
        response_format={"type": "json_object"},
    )


def chat_response_txt(prompt, system_message=system_message, model="gpt-4o"):
    """Ask the model a question and get the reply back as free-form text.

    Same request as ``chat_response`` but without a ``response_format``.

    Args:
        prompt: Text sent as the user message.
        system_message: Text sent as the system message; defaults to the
            notebook-level ``system_message``.
        model: Name of the chat model to call.

    Returns:
        The reply content as returned by the API.
    """
    return _chat_completion(prompt, system_message, model=model)



### Define prompts
This is where we'll define all the prompts that get passed into the LLM

In [14]:
# Pre-prompt: asks the model for clarifying questions before any formula is generated.
# NOTE(review): "They do not to be detailed" appears to be missing the word "need";
# it is runtime prompt text, so it is left unchanged here.
market_formula_pre_prompt = """
I want to size the market for: edtech software management. The software tool allows teachers to manage the logistics of their classrooms.

I want the output to be a market size formula.

Before generating the formula, are there any clarifying questions to get additional context needed for a good formula?

These questions should be able to directly inform the formula inputs. They do not to be detailed / indepth.
"""

In [15]:
# Free-text call (no JSON mode): the reply is a list of clarifying questions.
market_formula_clarification_guide_response = chat_response_txt(market_formula_pre_prompt)
# Bare last expression so the notebook displays the reply.
market_formula_clarification_guide_response

'To develop an accurate market size formula for edtech software management tools, here are some clarifying questions that would provide additional context and refine the formula inputs:\n\n1. **Target Audience:**\n   - Is the software intended for a specific level of education (e.g., K-12, higher education)?\n   \n2. **Geographic Scope:**\n   - What is the target market region (e.g., global, specific countries, or regions)?\n\n3. **Pricing Model:**\n   - What is the pricing strategy (e.g., subscription-based, one-time purchase, freemium)?\n\n4. **Sales Model:**\n   - Is the software sold directly to schools, districts, or individual teachers?\n\n5. **Competitors:**\n   - Are there key competitors whose market share could provide context for sizing?\n\n6. **Adoption Rate:**\n   - What are the current adoption rates for similar software tools in the target market?\n\n7. **Market Trends:**\n   - Are there any industry trends or projections regarding the growth of edtech in the specific re

In [16]:
# Brainstorm prompt: requests a JSON object with the keys `steps`, `formula` and `clarifications`.
# The `formula` key is read by the cell that parses the response.
# NOTE(review): the text repeats "should" ("should, if applicable, should already"); left as-is because it is runtime text.
market_formula_brainstorm_prompt = """
I want to size the market for: edtech software management. The software tool allows teachers to manage the logistics of their classrooms.

Please return a JSON with the following fields,

steps: string - A numbered text overview of all the steps in the market sizing approach. Steps should be clear and sequential, explaining the reasoning process.
formula: list of string - market sizing formulas options expressed as a string. Each representing an approach to calculate the market size. Each formula must exclude explicit adoption rate terms (e.g., 'adoption rate', 'penetration rate') but can implicitly account for expected usage segments. Do not use adoption rate / penetration rate or alike as part of the formula. 
The formula should, if applicable, should already factor in parameters of market adoption (i.e. segments that would vs not use the product)
But any sort of adoption rate / penetration rate should not be in the formula.
clarifications: list of string  - list of clarifying questions to consider for the formulas that could help improve or iterate


"""


In [21]:
# JSON-mode call: the reply is a JSON string.
market_formula_brainstorm_response = chat_response(market_formula_brainstorm_prompt)
print(market_formula_brainstorm_response)
# Keep only the list of candidate formulas; raises KeyError if the reply has no "formula" key.
market_brainstorm_formulas = json.loads(market_formula_brainstorm_response)["formula"]
print(market_brainstorm_formulas)

{
  "steps": "1. Define the target market: Determine the geographic region and educational levels (primary, secondary, etc.) relevant to your software. \n2. Identify the number of potential customers: Calculate the total number of schools or educational institutions within the specified region and levels. \n3. Determine the number of teachers per institution: Estimate the average number of teachers in each school type. \n4. Price estimation: Establish a range for pricing the software per teacher or institution. \n5. Estimate the School Budget Allocation: Assess how much an average institution is willing to spend on classroom management software annually. \n6. Calculate the total available market: Combine the number of target institutions, number of teachers, and estimated pricing to compute the total market potential.",
  
  "formula": [
    "Total Market Size = (Number of Schools) * (Average Number of Teachers per School) * (Annual Software Cost per Teacher)",
    "Total Market Size =

In [22]:
# Data-source prompt template: `{formula}` is filled in with str.format before the call.
# The example schema below is written without curly braces, so `{formula}` is the only
# replacement field str.format sees.
market_formula_datasource_prompt = """
This is the formula which I want to apply for market modelling: {formula}


For each component find different sources that can be used to find the data point.

Please return the response in JSON format with the following structure

"components": [
  "component": "string - The name of the component in the formula.",
  "data_sources": [
      "DATA_COMPONENT": "string - The name of the component this data source relates to.",
      "DATA_SOURCE_NAME": "string - The name of the data source.",
      "DATA_SOURCE_LINK": "string - A link (URL) to the data source.",
      "DATA_SOURCE_OVERVIEW": "string - A brief text description or preview of what the data source contains.",
      "DATA_POINT": "number - A numeric value representing the data point extracted from this source."
  ]
  



FIELD DEFINITIONS
components: A list where each item represents a unique component from the formula.
component: The name of the component.
data_sources: A list of potential data sources for that component, each containing:
    -- DATA_COMPONENT: The component name.
    -- DATA_SOURCE_NAME: The name of the data source.
    -- DATA_SOURCE_LINK: A direct URL to the data source if available.
    -- DATA_SOURCE_OVERVIEW: A brief overview of the data source’s contents or relevance.
    -- DATA_POINT: A numeric estimate of the data value from that source.

Please ensure the JSON matches this structure exactly.
"""



In [23]:
# Source data for the first brainstormed formula.
formula = market_brainstorm_formulas[0]
market_formula_datasource_response = chat_response(market_formula_datasource_prompt.format(formula=formula))
print(market_formula_datasource_response)
# Parse the JSON-mode reply into Python objects for downstream use.
market_formula_datasource = json.loads(market_formula_datasource_response)
# Show the parsed structure (previously the raw response string was printed a second time).
print(market_formula_datasource)


{
  "components": [
    {
      "component": "Number of Schools",
      "data_sources": [
        {
          "DATA_COMPONENT": "Number of Schools",
          "DATA_SOURCE_NAME": "National Center for Education Statistics",
          "DATA_SOURCE_LINK": "https://nces.ed.gov/",
          "DATA_SOURCE_OVERVIEW": "Provides comprehensive statistics on educational institutions in the United States.",
          "DATA_POINT": null
        },
        {
          "DATA_COMPONENT": "Number of Schools",
          "DATA_SOURCE_NAME": "UNESCO Institute for Statistics",
          "DATA_SOURCE_LINK": "http://uis.unesco.org/",
          "DATA_SOURCE_OVERVIEW": "Offers global data on educational institutions by country.",
          "DATA_POINT": null
        },
        {
          "DATA_COMPONENT": "Number of Schools",
          "DATA_SOURCE_NAME": "Government Education Departments",
          "DATA_SOURCE_LINK": "Varies by country e.g., [U.S. Department of Education](https://www.ed.gov/)",
          "

### Connect to EXA to get data sources

In [None]:
# Exa example: ask a single question through exa.answer.
# `result` is read by the later cells (`result.citations`), so this cell must be run before them.
result = exa.answer(
    "What is the Number of Educational Institutions in the u.s",
    stream=False,
    text=True
)

print(result)

In [24]:
# Synthesis prompt template with two str.format fields: `{text}` (the source text) and
# `{component}` (the quantity to extract). Asks for the JSON keys DATA_POINT and DATA_SOURCE_OVERVIEW.
# NOTE(review): "numeric data numeric data point" looks like a duplicated phrase; left as-is because it is runtime text.
exa_synthesis_prompt = """
The following text is from a data source:
{text}

Please extract a numeric data point for {component}
Then provide a short summary of the text as condensed as possible.

Please return the response in JSON format with the following structure

DATA_POINT: numeric data numeric data point for {component}
DATA_SOURCE_OVERVIEW: a short summary of the text. This summary should be an overview of the information contained in the text.

"""

def exa_data_extraction(exa_answer_result, component="Number of Educational Institutions in the u.s"):
    """Turn one Exa citation into a flat record enriched with an LLM-extracted data point.

    Args:
        exa_answer_result: A citation object; its ``title``, ``url`` and ``text``
            attributes are read (a missing attribute becomes ``None``).
        component: Name of the quantity the LLM should extract from the citation
            text. Defaults to the value that was previously hard-coded.

    Returns:
        dict with ``DATA_SOURCE_NAME``, ``DATA_SOURCE_LINK`` and ``DATA_SOURCE_TEXT``
        merged with the keys of the LLM's JSON reply (the prompt requests
        ``DATA_POINT`` and ``DATA_SOURCE_OVERVIEW``). On a key clash the LLM's
        value wins.

    Raises:
        json.JSONDecodeError: if the LLM reply is not valid JSON.
    """
    mapping = {
        "title": "DATA_SOURCE_NAME",
        "url": "DATA_SOURCE_LINK",
        "text": "DATA_SOURCE_TEXT",
    }
    exa_data_source = {
        output_key: getattr(exa_answer_result, input_attr, None)  # fallback to None if missing
        for input_attr, output_key in mapping.items()
    }

    # Fix: the original read the text from an undefined name (`data_source`),
    # which raised NameError on every call.
    synthesis_prompt = exa_synthesis_prompt.format(
        text=exa_data_source.get("DATA_SOURCE_TEXT"),
        component=component,
    )
    exa_result_synthesis_json = json.loads(chat_response(synthesis_prompt))

    # dict union (Python 3.9+): right-hand keys override left-hand ones.
    return exa_data_source | exa_result_synthesis_json


# Try the extraction on the first citation only.
# Depends on `result` from the exa.answer(...) example cell: if that cell has not been run in
# this kernel, the line below raises NameError (as in the recorded output).
exa_answer_result = result.citations[0]
exa_data_extraction(exa_answer_result)



NameError: name 'result' is not defined

In [None]:
# Depends on `result` from the exa.answer(...) example cell.
exa_answer_citations = result.citations

# Run one extraction per citation in parallel threads.
# executor.map returns results in the same order as the input citations, so the rows of
# `results` are stable across runs (as_completed yielded them in completion order).
with ThreadPoolExecutor() as executor:
    results = list(executor.map(exa_data_extraction, exa_answer_citations))

print(results)

In [None]:
# Tabulate the per-citation extraction dicts: one row per entry in `results`.
df = pd.DataFrame(results)
# Bare last expression so the notebook renders the frame.
df