In [None]:
from discovery_utils.getters import gtr
from discovery_utils.getters import crunchbase
from discovery_utils.utils import search

from src import PROJECT_DIR
from src import VECTOR_DB_DIR

# Folder under the project directory where this notebook writes its outputs
# (the LLM relevance-check results are stored here).
OUTPUT_DIR = PROJECT_DIR / 'data/2025_01_MS_ahl/'
# Create the folder, including missing parents; do nothing if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
import pandas as pd

In [None]:
from discovery_utils.utils.llm.batch_check import LLMProcessor, generate_system_message


In [None]:
# Crunchbase getter, given the vector database location imported from `src`.
CB = crunchbase.CrunchbaseGetter(vector_db_path=VECTOR_DB_DIR)

In [None]:
# Search helper built from the getter and its enriched organisations table.
# NOTE(review): the third argument looks like a search config file name — confirm
# where SearchDataset resolves "config_MS_glp1.yaml" from.
SearchCB = search.SearchDataset(CB, CB.organisations_enriched, "config_MS_glp1.yaml")

In [None]:
# Run the search; the result is used below as a DataFrame carrying `_score_*` columns.
search_cb_df = SearchCB.do_search()

In [None]:
# Show the identifying columns together with the three `_score_*` columns for each search hit.
search_cb_df[['id', 'name', 'short_description', 'homepage_url', '_score_keywords', '_score_vectors', '_score_avg']]


In [None]:
# Keep only hits whose `_score_avg` is strictly above 0.3; these rows are what
# gets sent to the LLM relevance check.
relevant_df = search_cb_df.query("_score_avg > 0.3")

In [None]:
# System prompt generated from the same config file name that the search uses.
system_message = generate_system_message("config_MS_glp1.yaml")

# The only field requested back from the model: a yes/no relevance verdict.
fields = [
    dict(
        name="is_relevant",
        type="str",
        description="A one-word answer: 'yes' or 'no'.",
    ),
]

# Map each organisation id to the short description the model will judge.
check_data = {
    org_id: description
    for org_id, description in zip(relevant_df["id"], relevant_df["short_description"])
}

# Results are written to a JSONL file inside OUTPUT_DIR.
processor = LLMProcessor(
    output_path=str(OUTPUT_DIR / "output_MS_glp1.jsonl"),
    system_message=system_message,
    session_name="mission_studio",
    output_fields=fields,
)

processor.run(
    check_data,
    batch_size=15,
    sleep_time=0.5,
)

In [None]:
# Load the LLM check results from the JSON-lines file in OUTPUT_DIR (one record per line).
relevant_check_df = pd.read_json(OUTPUT_DIR / "output_MS_glp1.jsonl", lines=True)

In [None]:
# Attach the LLM verdict to every candidate row; the left join keeps candidates
# that have no verdict in the results file.
llm_verdicts = relevant_check_df[["id", "is_relevant"]]
relevant_checked_df = relevant_df.merge(llm_verdicts, on="id", how="left")

# Display only the organisations the LLM answered 'yes' for.
confirmed_columns = ["id", "name", "short_description", "homepage_url", "_score_avg"]
relevant_checked_df.query("is_relevant == 'yes'")[confirmed_columns]

## Number of new companies

In [None]:
from discovery_utils.utils.analysis_crunchbase import get_timeseries
from discovery_utils.utils import charts
from discovery_utils.utils import analysis

# Ids of the organisations the LLM check marked as relevant.
confirmed_orgs = relevant_checked_df.query("is_relevant == 'yes'")
matching_ids = confirmed_orgs["id"].to_list()

# Restrict both Crunchbase tables to those organisations.
matching_orgs = CB.organisations_enriched.query("id in @matching_ids")
matching_funding_rounds = CB.funding_rounds_enriched.query("org_id in @matching_ids")

# Yearly time series for the matched organisations, 2010 to 2025.
ts_df = get_timeseries(
    cb_orgs=matching_orgs,
    cb_funding_rounds=matching_funding_rounds,
    min_year=2010,
    max_year=2025,
    period="year",
)

In [None]:
# Number of organisations marked relevant by the LLM check.
len(matching_ids)

In [None]:
# Display the yearly time series computed for the matched organisations.
ts_df

In [None]:
# Bar chart of the `n_orgs_founded` column of the time series.
charts.ts_bar(
    ts=ts_df,
    variable="n_orgs_founded",
    variable_title="Number of organisations founded",
)

In [None]:
# Growth figure from the project's `smoothed_growth` helper over the 2020 to 2024 window.
analysis.smoothed_growth(ts_df, year_start=2020, year_end=2024)

In [None]:
# NOTE(review): despite the original "remove duplicates" label, nothing is removed
# from CB.funding_rounds_enriched here. The expression de-duplicates a temporary
# result on `funding_round_id` and then counts duplicates on that same key, so
# (assuming a pandas DataFrame) the displayed value is always 0.
# If the aim is to count duplicates in the source table, or to feed a de-duplicated
# table into later cells, this cell needs changing — TODO confirm intent.
CB.funding_rounds_enriched.drop_duplicates(subset=["funding_round_id"]).duplicated("funding_round_id").sum()

## Baseline calculations

In [None]:
# Baseline: the same yearly time series, computed over ALL Crunchbase organisations
# and ALL funding rounds, for comparison against the matched subset.
# The funding rounds must not be filtered by `matching_ids` here: doing so would
# pair every organisation with only the matched subset's funding, which is not a
# baseline for either population.
baseline_df = get_timeseries(
    cb_orgs=CB.organisations_enriched,
    cb_funding_rounds=CB.funding_rounds_enriched,
    min_year=2010,
    max_year=2025,
    period='year',
)

## Market map

In [None]:
from discovery_utils.utils import viz_landscape

In [None]:
# Build an `id in ('a', 'b', ...)` filter string from the matching organisation ids.
joined_ids = "', '".join(matching_ids)
id_condition = f"id in ('{joined_ids}')"

# Pull the rows for those ids out of the vector DB and convert them with `to_pandas()`.
# NOTE(review): the 30000 cap would silently truncate a longer id list — confirm it is high enough.
vector_query = CB.VectorDB.vector_db.search().where(id_condition).limit(30000)
vectors_df = vector_query.to_pandas()

In [None]:
# Number of rows returned from the vector DB; compare with len(matching_ids) to spot missing ids.
len(vectors_df)

In [None]:
# Build the landscape figure from the retrieved vectors; the helper returns the
# figure and an accompanying DataFrame.
fig, cb_viz_df = viz_landscape.generate_crunchbase_landscape(vectors_df, CB)

In [None]:
# Save the landscape figure as an HTML file.
# NOTE(review): this is a relative path, so the file lands in the current working
# directory rather than in OUTPUT_DIR — confirm that is intended.
output_path = "test.html"
fig.save(output_path)