In [None]:
# Setup
from discovery_utils.getters import gtr
from discovery_utils.getters import crunchbase
from discovery_utils.utils import (
    search,
    analysis_crunchbase,
    analysis,
    charts,
    viz_landscape,
)
from discovery_utils.utils.llm.batch_check import LLMProcessor, generate_system_message

## Commented out until the poetry issue is resolved:
## from src import PROJECT_DIR


from pathlib import Path
import pandas as pd
import nltk
import os


# Fetch the NLTK stopword corpus (nltk caches it locally after the first download).
nltk.download('stopwords')

# Paths and directories
# The project root defaults to the original author's checkout but can be
# overridden per machine by setting the DISCOVERY_PROJECT_DIR environment
# variable, so the notebook does not have to be edited to run elsewhere.
PROJECT_DIR = Path(
    os.environ.get(
        "DISCOVERY_PROJECT_DIR",
        "/Users/william.woodward/Documents/discovery_mission_radar_prototyping",
    )
)
from src import VECTOR_DB_DIR
OUTPUT_DIR = PROJECT_DIR / "data/2025_01_MS_ahl/"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Crunchbase setup
CB = crunchbase.CrunchbaseGetter(vector_db_path=VECTOR_DB_DIR)

# List of configuration files: one search configuration per topic; each is
# processed in turn by the loop in the next cell.
config_files = [
    "config_MS_challenger_brands.yaml",
    "config_MS_child_obesity.yaml",
    "config_MS_food_as_medicine.yaml",
    "config_MS_future_of_ag.yaml",
    "config_MS_glp1.yaml",
    "config_MS_hormonal_dysregulation.yaml"
]


In [None]:
# Loop over all config files.
#
# For each configuration this cell:
#   1. searches the enriched Crunchbase organisations,
#   2. keeps hits above a relevance-score threshold,
#   3. asks the LLM processor for a yes/no relevance verdict per organisation,
#   4. keeps only the organisations the LLM marked 'yes',
#   5. exports funding rounds, charts, a market map and an exits table.

# Tunable parameters (previously inlined as magic numbers).
RELEVANCE_THRESHOLD = 0.3  # arbitrary cut-off on the averaged search score
EARLY_STAGE_ROUND_TYPES = ["angel", "pre_seed", "seed", "series_a", "series_b"]
TS_MIN_YEAR = 2014
TS_MAX_YEAR = 2025

# Chart files are written to a sub-directory; create it up front so the
# first save does not fail on a fresh checkout.
CHARTS_DIR = OUTPUT_DIR / "charts"
CHARTS_DIR.mkdir(parents=True, exist_ok=True)

for config_file in config_files:
    # e.g. "config_MS_child_obesity.yaml" -> "child_obesity"
    config_suffix = "_".join(config_file.split('_')[2:]).split('.')[0]
    print(f"Processing config: {config_suffix}")

    SearchCB = search.SearchDataset(CB, CB.organisations_enriched, config_file)
    search_cb_df = SearchCB.do_search()

    # Filter for companies that are above the relevance score and check number
    relevant_df = search_cb_df.query("_score_avg > @RELEVANCE_THRESHOLD")
    print(f"Number of relevant organisations: {len(relevant_df)}")

    # Add LLM processor to check lists
    system_message = generate_system_message(config_file)
    fields = [
        {"name": "is_relevant", "type": "str", "description": "A one-word answer: 'yes' or 'no'."},
    ]

    # Map organisation id -> short description; this is what the LLM judges.
    check_data = dict(zip(relevant_df['id'], relevant_df['short_description']))

    file_path = OUTPUT_DIR / f"llm_check_MS_{config_suffix}.jsonl"
    processor = LLMProcessor(
        output_path=str(file_path),
        system_message=system_message,
        session_name="mission_studio",
        output_fields=fields,
    )

    processor.run(check_data, batch_size=15, sleep_time=0.5)

    # Fail with a clear message instead of letting the read below raise a
    # less obvious error when the processor produced no output file.
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    print(f"File successfully saved: {file_path}")

    relevant_check_df = pd.read_json(file_path, lines=True)
    relevant_checked_df = relevant_df.merge(
        relevant_check_df[['id', 'is_relevant']], on='id', how='left'
    )

    # Keep only organisations the LLM marked as relevant. Rows answered 'no'
    # (or with no verdict after the left merge) are excluded, matching the
    # filter used by the follow-up analysis cells.
    matching_ids = set(relevant_checked_df.query("is_relevant == 'yes'").id)
    print(f"Number of matching IDs: {len(matching_ids)}")

    if not matching_ids:
        # Nothing to analyse or chart for this topic; move on to the next one.
        print(f"No organisations marked relevant for {config_suffix}; skipping outputs.")
        continue

    matchings_orgs_df = CB.organisations_enriched.query("id in @matching_ids")
    funding_rounds_df = CB.select_funding_rounds(
        org_ids=matching_ids, funding_round_types=EARLY_STAGE_ROUND_TYPES
    )

    # Organise investors by each funding round: collapse the enriched table
    # to one row per funding round with a list of investor names.
    investors_df = (
        CB.funding_rounds_enriched
        .query("funding_round_id in @funding_rounds_df.funding_round_id")
        .groupby("funding_round_id")
        .agg(investor_name=("investor_name", list))
        .reset_index()
    )

    # Replace the original investor_name column with the per-round list.
    funding_rounds_df = (
        funding_rounds_df
        .drop(columns=["investor_name"])
        .merge(investors_df, on="funding_round_id", how="left")
    )

    # Save funding rounds as csv (written once; the original wrote it twice).
    funding_rounds_df.to_csv(OUTPUT_DIR / f"{config_suffix}_funding_rounds.csv", index=False)

    # Generate time series
    ts_df = analysis_crunchbase.get_timeseries(
        matchings_orgs_df,
        funding_rounds_df,
        period='year',
        min_year=TS_MIN_YEAR,
        max_year=TS_MAX_YEAR,
    )

    # Breakdown of deal types
    deals_df, deal_counts_df = analysis_crunchbase.get_funding_by_year_and_range(
        funding_rounds_df, TS_MIN_YEAR, TS_MAX_YEAR
    )
    aggregated_funding_types_df = analysis_crunchbase.aggregate_by_funding_round_types(funding_rounds_df)

    # Chart by deal counts and save
    investment_types_counts_fig = analysis_crunchbase.chart_investment_types_counts(aggregated_funding_types_df)
    investment_types_counts_chart_filename = str(CHARTS_DIR / f"{config_suffix}_investment_types_counts.png")
    investment_types_counts_fig.save(investment_types_counts_chart_filename)

    # Chart by deal types (deal size amounts)
    investment_types_fig = analysis_crunchbase.chart_investment_types(aggregated_funding_types_df)
    investment_types_chart_filename = str(CHARTS_DIR / f"{config_suffix}_investment_types.png")
    investment_types_fig.save(investment_types_chart_filename)

    # Chart companies founded
    fig = charts.ts_bar(
        ts=ts_df,
        variable="n_orgs_founded",
        variable_title="Number of companies founded"
    )
    companies_founded_chart_filename = str(CHARTS_DIR / f"{config_suffix}_companies_founded.png")
    fig.save(companies_founded_chart_filename)

    # Generate market map: fetch the stored vectors for the matching ids.
    # sorted() only makes the generated filter string deterministic.
    id_condition = "id in ('{}')".format("', '".join(sorted(matching_ids)))
    vectors_df = CB.VectorDB.vector_db.search().where(id_condition).limit(30000).to_pandas()

    fig, cb_viz_df = viz_landscape.generate_crunchbase_landscape(vectors_df, CB, min_cluster_size=15)
    output_path = str(CHARTS_DIR / f"{config_suffix}_market_map.html")
    fig.save(output_path)

    # Filter companies with exits greater than zero
    companies_with_exits = matchings_orgs_df[matchings_orgs_df['num_exits'] > 0]
    print(companies_with_exits[['name', 'num_exits']])

    companies_with_exits[['name', 'num_exits']].to_csv(OUTPUT_DIR / f"{config_suffix}_exits.csv", index=False)

## Number of new companies

In [None]:
from discovery_utils.utils.analysis_crunchbase import get_timeseries
from discovery_utils.utils import charts
from discovery_utils.utils import analysis

# Ids of the organisations the LLM check marked as relevant
# (taken from the most recently processed configuration).
llm_approved_df = relevant_checked_df.query("is_relevant == 'yes'")
matching_ids = llm_approved_df.id.to_list()

# Restrict both Crunchbase tables to those organisations.
matching_orgs = CB.organisations_enriched.query("id in @matching_ids")
matching_rounds = CB.funding_rounds_enriched.query("org_id in @matching_ids")

# Yearly time series covering 2010-2025 for the relevant organisations.
ts_df = get_timeseries(
    cb_orgs=matching_orgs,
    cb_funding_rounds=matching_rounds,
    min_year=2010,
    max_year=2025,
    period='year',
)

In [None]:
# Number of organisations the LLM check marked as relevant.
len(matching_ids)

In [None]:
# Display the yearly time series returned by get_timeseries.
ts_df

In [None]:
# Bar chart of the n_orgs_founded column of the time series.
founded_chart = charts.ts_bar(
    ts=ts_df,
    variable="n_orgs_founded",
    variable_title="Number of organisations founded",
)
founded_chart

In [None]:
# Growth of the time series between 2020 and 2024.
# NOTE(review): the exact smoothing and growth definition lives in
# analysis.smoothed_growth — confirm before quoting the figure.
analysis.smoothed_growth(
    ts_df,
    year_start=2020,
    year_end=2024,
)

In [None]:
# Count rows in the enriched funding-rounds table that repeat an earlier
# funding_round_id. The previous version de-duplicated first and then counted
# duplicates on the same key, which is always 0 and removed nothing.
# NOTE(review): repeated ids are presumably one row per investor — confirm
# before de-duplicating anything downstream.
CB.funding_rounds_enriched.duplicated("funding_round_id").sum()

## Baseline calculations

In [None]:
# Baseline time series across ALL organisations in the Crunchbase snapshot.
# The funding rounds are deliberately left unfiltered: the previous version
# restricted them to `matching_ids`, which paired every organisation with only
# the topic's funding rounds and so was not a true baseline.
baseline_df = get_timeseries(
    cb_orgs=CB.organisations_enriched,
    cb_funding_rounds=CB.funding_rounds_enriched,
    min_year=2010,
    max_year=2025,
    period='year',
)

## Market map

In [None]:
from discovery_utils.utils import viz_landscape

In [None]:
# Build an "id in ('a', 'b', ...)" filter for the vector store.
quoted_ids = "', '".join(matching_ids)
id_condition = f"id in ('{quoted_ids}')"

# Fetch the stored vectors for those ids (at most 30000 rows) as a DataFrame.
vector_query = CB.VectorDB.vector_db.search().where(id_condition)
vectors_df = vector_query.limit(30000).to_pandas()

In [None]:
# Number of vector rows returned for the matching ids (the search is capped at 30000).
len(vectors_df)

In [None]:
# Build the landscape figure (plus a second returned object, presumably the
# dataframe behind the plot — confirm) from the retrieved vectors.
# NOTE(review): this call relies on the function's default clustering settings,
# whereas the per-config loop passes min_cluster_size=15 — confirm that is intended.
fig, cb_viz_df = viz_landscape.generate_crunchbase_landscape(vectors_df, CB)

In [None]:
# Write the landscape figure to a scratch HTML file; the relative path
# resolves against the current working directory.
output_path = "test.html"
fig.save(output_path)