### APPLY OOP / CLASS STRUCTURE

In [19]:
class PromptFactory:
    """
    Central catalogue of the prompt templates used for market sizing.

    Every template is a static method that takes plain strings and returns the
    finished prompt text. `get_prompt(name, **kwargs)` offers string-based
    dispatch; `name` has to be the exact name of one of the template methods.
    """

    @staticmethod
    def clarifying_questions_prompt(market_description: str) -> str:
        """Build the prompt asking for clarifying questions about a market."""
        paragraphs = (
            f"I want to size the market for: {market_description}.",
            "Before generating the formula, are there any clarifying questions to get additional context needed for a good formula?",
            "These questions should directly inform the formula inputs.",
        )
        return "\n\n".join(paragraphs)

    @staticmethod
    def formula_brainstorm_prompt(market_description: str) -> str:
        """Build the prompt requesting steps, formulas and clarifications as JSON."""
        field_lines = (
            "steps: string - A numbered overview of the steps.",
            "formula: list of strings - Market sizing formulas expressed as strings, excluding explicit adoption/penetration rates.",
            "clarifications: list of strings - Clarifying questions to consider for the formulas.",
        )
        paragraphs = (
            f"I want to size the market for: {market_description}.",
            "Please return a JSON with the following fields:",
            "\n".join(field_lines),
        )
        return "\n\n".join(paragraphs)

    @staticmethod
    def datasource_prompt(formula: str) -> str:
        """Build the prompt requesting candidate data sources per formula component."""
        # The leading whitespace inside these literals is part of the prompt layout.
        schema_lines = (
            "components: list of component_data objects",
            "component_data objects:",
            "  -- component: name of component",
            "  -- data_sources: list of data source objects with fields:",
            "       -- DATA_COMPONENT: component name",
            "       -- DATA_SOURCE_NAME: name of data source",
            "       -- DATA_SOURCE_LINK: link to data source",
            "       -- DATA_SOURCE_OVERVIEW: text description/preview of data source",
            "       -- DATA_POINT: numeric data value for the component",
        )
        paragraphs = (
            "This is the formula which I want to apply for market modeling:",
            f"{formula}",
            "Create a JSON of all the components of the formula.",
            "For each component, find different sources that can be used to find the data point.",
            "Please return a JSON with the following fields:",
            "\n".join(schema_lines),
        )
        return "\n\n".join(paragraphs)

    @staticmethod
    def exa_synthesis_prompt(text: str, component: str) -> str:
        """Build the prompt that extracts a data point and a summary from source text."""
        source_paragraph = f"The following text is from a data source:\n{text}"
        task_paragraph = "\n".join((
            f"Please extract a numeric data point for {component}.",
            "Then provide a short summary of the text as condensed as possible.",
        ))
        structure_paragraph = "\n".join((
            "DATA_POINT: numeric data point for the component",
            "DATA_SOURCE_OVERVIEW: a short summary of the text, providing an overview of the information contained in the text.",
        ))
        return "\n\n".join((
            source_paragraph,
            task_paragraph,
            "Please return the response in JSON format with the following structure:",
            structure_paragraph,
        ))

    @staticmethod
    def decompose_formula_prompt(formula: str) -> str:
        """Build the prompt that splits a formula into its named components."""
        paragraphs = (
            f"I have a market size formula: {formula}",
            "Decompose the formula into each individual component / data.",
            "Return this as a JSON with the following fields:",
            "components : list of strings, each string is the name of a component.",
        )
        return "\n\n".join(paragraphs)

    @classmethod
    def get_prompt(cls, name: str, **kwargs) -> str:
        """
        Look up the template called `name` and render it with `kwargs`.

        Raises ValueError, listing the accepted names, when `name` is not one
        of the template method names.
        """
        template_names = (
            "clarifying_questions_prompt",
            "formula_brainstorm_prompt",
            "datasource_prompt",
            "exa_synthesis_prompt",
            "decompose_formula_prompt",
        )
        router = {
            template_name: getattr(cls, template_name)
            for template_name in template_names
        }

        builder = router.get(name)
        if builder is None:
            raise ValueError(
                f"Unknown prompt name '{name}'. Valid options: {list(router)}"
            )
        return builder(**kwargs)


In [47]:
from dotenv import load_dotenv
import os
import json
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from openai import OpenAI
from exa_py import Exa

class ResearchSequenceTask:
    """
    Drives the market sizing research workflow: clarifying questions, formula
    brainstorming, formula decomposition, data sourcing via Exa, and LLM
    synthesis of each Exa citation. All prompt text comes from PromptFactory.
    """

    def __init__(self, exa_client=None, openai_client=None):
        """
        Load environment variables and set up the API clients.

        Args:
            exa_client: Optional pre-built Exa client. When omitted, one is
                created from the EXA_API_KEY environment variable.
            openai_client: Optional pre-built OpenAI client. When omitted,
                `OpenAI()` is constructed with its default configuration.
        """
        load_dotenv()
        self.EXA_API_KEY = os.getenv("EXA_API_KEY")
        self.client = openai_client or OpenAI()
        self.exa = exa_client or Exa(self.EXA_API_KEY)
        self.system_message = (
            "You are a professional market sizing assistant. "
            "Your role is to design clear, structured models for market sizing problems, "
            "identify and list the key data inputs needed, and suggest possible data sources or proxies "
            "when direct data is unavailable. Always present your answers in a structured deconstructed format."
        )

    def chat_response(self, prompt, response_json=False):
        """
        Send `prompt` to the chat model together with the system message.

        The call goes through `self.client`, so a client injected into
        `__init__` is honoured and no separate API key attribute or HTTP
        library is required.

        Args:
            prompt (str): The user message.
            response_json (bool): When True, request JSON-object output from
                the model.

        Returns:
            str: The content of the first choice. With `response_json=True`
            this is a JSON-encoded string that the caller still has to parse.
        """
        request = {
            "model": "gpt-4o",
            "messages": [
                {"role": "system", "content": self.system_message},
                {"role": "user", "content": prompt},
            ],
        }
        # Only send response_format when JSON was asked for, rather than
        # passing an explicit None to the API.
        if response_json:
            request["response_format"] = {"type": "json_object"}

        response = self.client.chat.completions.create(**request)
        return response.choices[0].message.content

    def get_clarifying_questions(self, market_description):
        """
        Ask the model for clarifying questions about `market_description`.

        Returns:
            str: The model's free-text answer.
        """
        prompt = PromptFactory.clarifying_questions_prompt(market_description)
        return self.chat_response(prompt)

    def generate_market_formulas(self, market_description):
        """
        Run the brainstorm prompt and pull the formulas out of the JSON reply.

        Returns:
            The value stored under the "formula" key of the reply, or an empty
            list when that key is absent.

        Raises:
            json.JSONDecodeError: If the reply is not valid JSON.
        """
        prompt = PromptFactory.formula_brainstorm_prompt(market_description)
        response = self.chat_response(prompt, response_json=True)
        data = json.loads(response)
        return data.get("formula", [])

    def find_data_for_formula(self, formula):
        """
        Ask the model to propose data sources for every component of `formula`.

        Returns:
            The parsed JSON reply.

        Raises:
            json.JSONDecodeError: If the reply is not valid JSON.
        """
        prompt = PromptFactory.datasource_prompt(formula)
        response = self.chat_response(prompt, response_json=True)
        return json.loads(response)

    def get_components_from_formula(self, formula: str) -> list:
        """
        Decompose a formula string into its individual components via the model.

        Args:
            formula (str): The market size formula to decompose.

        Returns:
            list[str]: Component names from the "components" key of the reply.
            An empty list is returned (after printing a warning) when that
            value is not a list.

        Raises:
            json.JSONDecodeError: If the reply is not valid JSON.
        """
        prompt = PromptFactory.get_prompt(
            "decompose_formula_prompt",
            formula=formula
        )

        response = self.chat_response(prompt, response_json=True)
        data = json.loads(response)
        components = data.get("components", [])

        if not isinstance(components, list):
            print(f"[WARNING] Unexpected components format in response: {components}")
            return []

        return components

    def exa_search(self, query):
        """
        Submit `query` to the Exa client's `answer` method (non-streaming,
        with text requested) and return its result unchanged.
        """
        return self.exa.answer(query, stream=False, text=True)

    def exa_data_extraction(self, exa_answer_result, component):
        """
        Turn one Exa citation into a flat record enriched by the model.

        The citation's `title`, `url` and `text` attributes are copied into
        DATA_SOURCE_NAME / DATA_SOURCE_LINK / DATA_SOURCE_TEXT (None when an
        attribute is missing). The text is then sent to the model with the
        synthesis prompt, and the parsed JSON reply is merged in.

        Args:
            exa_answer_result: A citation object from an Exa answer.
            component (str): Name of the component the data point is for.

        Returns:
            dict: `{"component": ...}` merged with the source fields and the
            model's JSON reply; on key clashes the later dict wins.

        Raises:
            json.JSONDecodeError: If the model reply is not valid JSON.
        """
        mapping = {
            "title": "DATA_SOURCE_NAME",
            "url": "DATA_SOURCE_LINK",
            "text": "DATA_SOURCE_TEXT",
        }
        data_source = {
            output_key: getattr(exa_answer_result, input_attr, None)
            for input_attr, output_key in mapping.items()
        }

        # The key is always present (possibly None), so a .get() default would
        # never apply; fall back explicitly so the prompt does not embed "None".
        source_text = data_source["DATA_SOURCE_TEXT"] or ""
        exa_synthesis_prompt = PromptFactory.exa_synthesis_prompt(
            source_text,
            component
        )

        synthesis_response = self.chat_response(exa_synthesis_prompt, response_json=True)
        synthesis_json = json.loads(synthesis_response)
        return {'component': component} | data_source | synthesis_json  # merge dictionaries

    def parallel_exa_extraction(self, exa_results, component):
        """
        Run `exa_data_extraction` over a collection of Exa citations in a
        thread pool.

        Args:
            exa_results: Iterable of citation objects.
            component (str): Name of the component being sourced.

        Returns:
            list[dict]: One record per citation, in the same order as
            `exa_results`, so repeated runs produce identically ordered output.

        Raises:
            Exception: Whatever `exa_data_extraction` raises for a citation is
            propagated to the caller.
        """
        with ThreadPoolExecutor() as executor:
            # Executor.map yields results in input order regardless of which
            # worker finishes first.
            return list(
                executor.map(
                    lambda citation: self.exa_data_extraction(citation, component),
                    exa_results,
                )
            )

    def run_exa_workflow(self, query: str, component: str) -> list:
        """
        Execute the full Exa data sourcing workflow for one component:
        1) Run `exa_search` on the provided query.
        2) Process each returned citation in parallel to extract data points.
        3) Return the compiled records.

        Args:
            query (str): The natural language search query for Exa.
            component (str): The name of the component being sourced (e.g., "Number of Educational Institutions").

        Returns:
            list[dict]: One record per citation. Wrap in `pd.DataFrame(...)`
            for tabular display.
        """
        print(f"[INFO] Running Exa semantic search for query: {query}")
        exa_result = self.exa_search(query)
        print(f"[INFO] Retrieved {len(exa_result.citations)} citations from Exa.")

        print("[INFO] Starting parallel extraction of data points from Exa citations...")
        results = self.parallel_exa_extraction(exa_result.citations, component)
        print("[INFO] Extraction complete. Returning DataFrame.")

        return results

    def run_exa_workflow_for_components_sequential(self, components: list) -> dict:
        """
        Run `run_exa_workflow` for each component, one component at a time
        (citation extraction inside each run is still parallel). The component
        name doubles as the search query.

        Args:
            components (list[str]): List of component names to process.

        Returns:
            dict: Mapping of component name to its list of extracted records,
            or to None when the workflow for that component raised.
        """
        all_results = {}

        for component in components:
            print(f"\n[INFO] Processing component: '{component}' sequentially")
            try:
                records = self.run_exa_workflow(query=component, component=component)
                all_results[component] = records
                print(f"[INFO] Completed workflow for component: '{component}'")
            except Exception as e:
                # Best-effort per component: report the failure and continue
                # with the remaining components.
                print(f"[ERROR] Failed workflow for component '{component}': {e}")
                all_results[component] = None

        print("[INFO] Exa workflows complete for all components (sequential execution).")
        return all_results



In [41]:
# Workflow driver; with no clients passed in, it builds its own OpenAI and Exa clients.
task = ResearchSequenceTask()
# Plain-language description of the market to size; passed verbatim into the prompts.
topic = (
    "edtech software management in the U.S. "
    "The software allows university professors to manage the logistics of their classrooms. "
    "Usually it is one software per teacher and they use that for all their classes. "
    "They are charged by classes they have on the software."
)

In [36]:

# Ask the model which clarifications it would want before drafting a formula.
questions = task.get_clarifying_questions(market_description=topic)
print(questions)

To effectively size the market for edtech software management aimed at university professors, we need clarity on several aspects of the market. Below are some clarifying questions that will help define the inputs necessary for an accurate market sizing model:

1. **Geographic Scope and Segmentation**
   - What is the geographic scope of the market (e.g., global, specific regions, countries)?
   - Are there specific segments or types of universities (e.g., public vs. private, research vs. teaching) you are targeting?

2. **Pricing Model**
   - What is the pricing structure per class for the software?
   - Are there any tiers or package deals available (e.g., discounts for multiple class subscriptions)?

3. **Market Penetration and Competition**
   - What is the current penetration rate of similar edtech software in the target market?
   - Are there existing competitors and what is their market share?

4. **University and Professor Demographics**
   - What is the average number of classe

In [42]:
# Brainstorm candidate market sizing formulas for the topic.
formulas = task.generate_market_formulas(market_description=topic)
print(formulas)

['Total professors in the U.S. * Average number of classes per professor', 'Annual demand = Total licenses per semester * Number of semesters in a year', 'Market size = Annual demand * Price per class']


In [43]:

# Start from an empty list so `components` is always defined, even when the
# brainstorm step produced no formulas (or a stale value lingers from an earlier run).
components = []
if formulas:
    # Only the first brainstormed formula is decomposed.
    components = task.get_components_from_formula(formulas[0])
    print(components)


['Total professors in the U.S.', 'Average number of classes per professor']


In [44]:

# Source data for each formula component via Exa; the component name doubles as the query.
data_research_objects = []
for component in components or ():
    data_sources = task.run_exa_workflow(query=component, component=component)
    print(data_sources)
    data_research_objects.append(data_sources)


[INFO] Running Exa semantic search for query: Total professors in the U.S.
[INFO] Retrieved 8 citations from Exa.
[INFO] Starting parallel extraction of data points from Exa citations...
[INFO] Extraction complete. Returning DataFrame.
[{'DATA_SOURCE_NAME': 'College Professor Demographics and Statistics [2025] - Zippia', 'DATA_SOURCE_LINK': 'https://www.zippia.com/college-professor-jobs/demographics', 'DATA_SOURCE_TEXT': 'Sign In Post Job Explore Jobs Jobs Near Me Remote Jobs Full Time Jobs Part Time Jobs Entry Level Jobs Work From Home Jobs Find Specific Jobs $15 Per Hour Jobs $20 Per Hour Jobs Hiring Immediately Jobs High School Jobs H1b Visa Jobs LGBTQ Jobs Explore Careers Sales Healthcare Business And Financial Architecture And Engineering Computer And Mathematical Explore Professions Salaries What They Do Certifications Skills Education Demographics Best Companies Health Care Media Start-Up Non-profit Fortune 500 Explore Companies Reviews Salaries History Locations CEO And Executi

In [52]:

# Show each component's extracted records as a table.
for data in data_research_objects:
    df = pd.DataFrame(data=data)
    display(df)


Unnamed: 0,DATA_SOURCE_NAME,DATA_SOURCE_LINK,DATA_SOURCE_TEXT,DATA_POINT,DATA_SOURCE_OVERVIEW,component
0,College Professor Demographics and Statistics ...,https://www.zippia.com/college-professor-jobs/...,Sign In Post Job Explore Jobs Jobs Near Me Rem...,,The text appears to be from a job and career e...,Total professors in the U.S.
1,Postsecondary Teachers : Occupational Outlook ...,https://www.bls.gov/ooh/education-training-and...,Summary What They Do \n Work Environment \n Ho...,1397600.0,The text provides detailed information about p...,Total professors in the U.S.
2,Professors in the United States - Wikipedia,https://en.wikipedia.org/wiki/Professors_in_th...,Professors in the United States commonly occup...,824347.0,The text provides an overview of the various f...,Total professors in the U.S.
3,Fast Facts: Race/ethnicity of college faculty ...,https://nces.ed.gov/fastfacts/display.asp?id=61,Race/ethnicity of college faculty \n Question:...,1.5,The text provides a demographic breakdown of f...,Total professors in the U.S.
4,Huge data set shows 80% of US professors come ...,https://www.nature.com/articles/d41586-022-030...,Host: Nick Petrić Howe Welcome back to the Nat...,300000.0,The text is an interview discussing a study th...,Total professors in the U.S.
5,Higher education faculty statistics - Ballotpedia,https://ballotpedia.org/Higher_education_facul...,Monitor evolving ESG laws with Ballotpedia’s f...,800611.0,The text provides information about the change...,Total professors in the U.S.
6,COE - Characteristics of Postsecondary Faculty,https://nces.ed.gov/programs/coe/indicator/csc...,Postsecondary Education \n \n \n \n \n \n \n T...,135300.0,The text provides an overview of postsecondary...,Total professors in the U.S.
7,"Digest of Education Statistics,",https://nces.ed.gov/programs/digest/d23/tables...,Table 315.20. Full-time faculty in degree-gran...,189362.0,The text provides a detailed breakdown of full...,Total professors in the U.S.


Unnamed: 0,DATA_SOURCE_NAME,DATA_SOURCE_LINK,DATA_SOURCE_TEXT,DATA_POINT,DATA_SOURCE_OVERVIEW,component
0,How Many Classes Does a Typical Professor Teac...,https://www.youtube.com/watch?v=rIxqn1ERZKI,#askaprofessor #professorlife #truthaboutteach...,,The text is from a YouTube video or channel as...,Average number of classes per professor
1,Teaching Load and Equivalencies - Limestone Fa...,https://fac-staff-hb.limestone.edu/teaching-lo...,"Contractual Teaching Loads\n \n Full-time, non...",10.0,The document outlines the contractual teaching...,Average number of classes per professor
2,What Is A Student-to-Professor Ratio In College?,https://www.affordablecollegesonline.org/colle...,What is Student Faculty Ratio? \n Prospective ...,1.94,The article discusses the student-to-faculty r...,Average number of classes per professor
3,How Much Do They Really Teach?,https://jamesgmartin.center/2014/06/how-much-d...,A lot of dollars are riding on how many course...,3.7,The text discusses the teaching loads of profe...,Average number of classes per professor
4,How many classes does a full-time professor te...,https://www.quora.com/How-many-classes-does-a-...,"It depends on the university, the field, and t...",6.5,The text outlines teaching loads for professor...,Average number of classes per professor
5,How many courses a prof teaches and why it mat...,https://collegeisforme.com/articles/how-many-c...,How many courses does an adjunct and full-time...,3.0,The text provides an overview of the teaching ...,Average number of classes per professor
6,It's Time for College Professors to Teach | Ma...,https://manhattan.institute/article/its-time-f...,Introduction \n \n Higher education is plagued...,2.0,The text discusses concerns about the low prio...,Average number of classes per professor
7,How many class room hours does the typical uni...,https://academia.stackexchange.com/questions/7...,"I am wondering, for full time university teach...",10.0,The text discusses the teaching load of full-t...,Average number of classes per professor
