# Examples of using analysis functionalities

Using the discovery_utils analysis functionality for investment data.

Here, we'll find companies by their Crunchbase categories, but you can also start from the search results produced by the process shown in cybersec_search.ipynb.

In [1]:
from discovery_utils.utils import (
    analysis_crunchbase,
    analysis,
    charts
)

In [2]:
import os

import pandas as pd

# Root folder under which the data/ and charts/ outputs of this notebook are written.
# The path is hard-coded only as a fallback until the poetry issue is resolved;
# set the DISCOVERY_PROJECT_DIR environment variable to run on another machine
# without editing the notebook.
PROJECT_DIR = os.environ.get(
    "DISCOVERY_PROJECT_DIR",
    "/Users/william.woodward/Documents/discovery_mission_radar_prototyping",
)

# Preferred import once the poetry issue is resolved:
# from src import PROJECT_DIR

In [3]:
from discovery_utils.getters import crunchbase
# Getter used throughout the notebook to look up categories, companies and funding rounds.
# Per its log output, instantiating it checks the S3 bucket for the latest Crunchbase snapshot.
CB = crunchbase.CrunchbaseGetter()

2025-01-15 10:41:01,541 - discovery_utils.getters.crunchbase - INFO - Checking for latest version of data in S3 bucket: discovery-iss
2025-01-15 10:41:01,700 - discovery_utils.getters.crunchbase - INFO - Latest Crunchbase version found: Crunchbase_2025-01-13


In [4]:
# Candidate categories for ahl: list the 10 "narrow" Crunchbase categories
# most similar to the query terms, ranked by similarity score.
CB.find_similar_categories("obesity, diabetes, nutrition", category_type="narrow", n_results=10)

2025-01-15 10:41:13,205 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-01-15 10:41:14,649 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-01-15 10:41:15,215 - discovery_utils.getters.crunchbase - INFO - Downloading parquet file: data/crunchbase/Crunchbase_2025-01-13/category_groups.parquet
2025-01-15 10:41:15,373 - discovery_utils.getters.crunchbase - INFO - Successfully downloaded and read parquet file: data/crunchbase/Crunchbase_2025-01-13/category_groups.parquet


Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Unnamed: 0,category,similarity
418,Diabetes,0.67279
341,Nutrition,0.621489
435,Personal Health,0.450007
444,Wellness,0.445706
445,mHealth,0.428885
424,Health Care,0.422943
725,Fitness,0.402077
336,Food and Beverage,0.391738
296,Health Insurance,0.385957
432,Nutraceutical,0.384207


In [20]:
# Fetch every company tagged with at least one of the chosen narrow categories,
# then collect the organisation ids for later lookups.
selected_df = CB.get_companies_in_categories(
    [
        "AgTech",
        "Agriculture",
        "Diabetes",
        "Dietary Supplements",
        "Farmers Market",
        "Food Delivery",
        "Food Processing",
        "Food Trucks",
        "Food and Beverage",
        "Grocery",
        "Nutrition",
        "Organic Food",
        "Recipes",
        "Restaurants",
        "Snack Food",
    ],
    category_type="narrow",
)
matching_ids = set(selected_df["id"])

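Group the categories into named lists, then run the same funding analysis for each group, saving the funding rounds and charts to disk.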

In [42]:
# Category groups to analyse: maps a group label (also used in the output
# file names) to the narrow Crunchbase categories that make up the group.
# Category names are spelled exactly as in the category lookup used earlier
# ("AgTech", not "Agtech") so that a case-sensitive match does not silently
# drop companies from a group.
lists_of_categories = {
    "biological": ["AgTech", "Agriculture"],
    "health_diabetes": ["Diabetes"],
    "health_nutrition": ["Dietary Supplements", "Nutrition"],
    "economic_retail": ["Organic Food", "Grocery", "Snack Food", "Farmers Market"],
    "economic - ooh": ["Food Delivery", "Restaurants"],
    "economic - food proc": ["Food Processing"],
    "Food & bev": ["Food and Beverage"],
    "Social": ["Recipes"],
}

from pathlib import Path

# Settings shared by every category group.
MIN_YEAR = 2014
MAX_YEAR = 2024
FUNDING_ROUND_TYPES = ["angel", "pre_seed", "seed", "series_a", "series_b"]

# Create the output folders up front so that a fresh checkout does not fail
# on the first CSV / chart write.
for output_dir in (
    Path(PROJECT_DIR) / "data" / "2025_01_MS_ahl",
    Path(PROJECT_DIR) / "charts",
):
    output_dir.mkdir(parents=True, exist_ok=True)

# NOTE: this is deliberately a top-level loop rather than a function, because
# later cells read aggregated_funding_types_df, deals_df and deal_counts_df.
# After the loop those names hold the results of the LAST category group only.
for category_group, categories in lists_of_categories.items():
    # Get companies for the current category group
    selected_df = CB.get_companies_in_categories(categories, category_type="narrow")
    matching_ids = set(selected_df.id)

    # Look up the enriched organisation records for the matching ids
    matchings_orgs_df = CB.organisations_enriched.query("id in @matching_ids")

    # Get the funding rounds for the matching companies
    funding_rounds_df = CB.select_funding_rounds(
        org_ids=matching_ids,
        funding_round_types=FUNDING_ROUND_TYPES,
    )

    # Collect the investors of each funding round into a single list per round
    investors_df = (
        CB.funding_rounds_enriched
        .query("funding_round_id in @funding_rounds_df.funding_round_id")
        .groupby("funding_round_id")
        .agg(investor_name=("investor_name", list))
        .reset_index()
    )

    # Swap the original investor_name column for the per-round list of investors
    funding_rounds_df = (
        funding_rounds_df
        .drop(columns=["investor_name"])
        .merge(investors_df, on="funding_round_id", how="left")
    )

    # Bare expressions are not displayed inside a loop, so report progress explicitly
    print(
        f"{category_group}: {len(matchings_orgs_df)} companies, "
        f"{len(funding_rounds_df)} funding rounds"
    )

    # Save funding rounds as csv
    funding_rounds_df.to_csv(
        f"{PROJECT_DIR}/data/2025_01_MS_ahl/{category_group}_funding_rounds.csv",
        index=False,
    )

    # Generate basic yearly time series
    ts_df = analysis_crunchbase.get_timeseries(
        matchings_orgs_df,
        funding_rounds_df,
        period='year',
        min_year=MIN_YEAR,
        max_year=MAX_YEAR,
    )

    # Create raised amount bar chart
    fig = charts.ts_bar(
        ts_df,
        variable='raised_amount_gbp_total',
        variable_title="Raised amount, £ millions",
        category_column="_category",
    )
    # Keep the chart returned by configure_plots: chart-configuration helpers
    # usually return a new chart instead of editing the one passed in, in which
    # case discarding the result would save an untitled chart.
    # NOTE(review): assumes configure_plots follows that convention - confirm;
    # if it returns None (in-place configuration) the original figure is used.
    configured_fig = charts.configure_plots(
        fig, chart_title=f"Funding raised over time for {category_group}"
    )
    if configured_fig is not None:
        fig = configured_fig

    # Save the chart with a unique filename
    chart_filename = f"{PROJECT_DIR}/charts/{category_group}_raised_amount.png"
    fig.save(chart_filename)

    # Break funding down by deal size and by funding round type
    deals_df, deal_counts_df = analysis_crunchbase.get_funding_by_year_and_range(
        funding_rounds_df, MIN_YEAR, MAX_YEAR
    )
    aggregated_funding_types_df = analysis_crunchbase.aggregate_by_funding_round_types(funding_rounds_df)

    # Chart by deal types and save the chart
    investment_types_fig = analysis_crunchbase.chart_investment_types(aggregated_funding_types_df)
    investment_types_chart_filename = f"{PROJECT_DIR}/charts/{category_group}_investment_types.png"
    investment_types_fig.save(investment_types_chart_filename)

    # Chart by deal counts and save the chart
    investment_types_counts_fig = analysis_crunchbase.chart_investment_types_counts(aggregated_funding_types_df)
    investment_types_counts_chart_filename = f"{PROJECT_DIR}/charts/{category_group}_investment_types_counts.png"
    investment_types_counts_fig.save(investment_types_counts_chart_filename)

    # Chart deal sizes and save the chart
    deal_sizes_fig = analysis_crunchbase.chart_deal_sizes(deals_df)
    deal_sizes_chart_filename = f"{PROJECT_DIR}/charts/{category_group}_deal_sizes.png"
    deal_sizes_fig.save(deal_sizes_chart_filename)

    # Chart deal sizes counts and save the chart
    deal_sizes_counts_fig = analysis_crunchbase.chart_deal_sizes_counts(deal_counts_df)
    deal_sizes_counts_chart_filename = f"{PROJECT_DIR}/charts/{category_group}_deal_sizes_counts.png"
    deal_sizes_counts_fig.save(deal_sizes_counts_chart_filename)
    

In [55]:
# Funding aggregated by year and investment type. The loop above reassigns this
# variable on every iteration, so this shows the last category group processed.
aggregated_funding_types_df

Unnamed: 0,year,investment_type,raised_amount_gbp,counts
0,2001,angel,3.465666e+02,2
1,2001,seed,1.800985e+03,2
2,2001,series_a,8.778214e+04,4
3,2001,series_b,7.766404e+03,1
4,2002,seed,3.234500e+02,1
...,...,...,...,...
114,2024,series_b,1.564320e+06,113
115,2025,pre_seed,0.000000e+00,3
116,2025,seed,1.033822e+04,9
117,2025,series_a,2.574418e+04,5


In [56]:
# Raised amounts per year split by deal-size band (first result of
# get_funding_by_year_and_range); holds the last category group processed by the loop.
deals_df

Unnamed: 0,year,n/a,£0-5M,£5-20M,£20-100M,£100M+,total_amount
0,2014,0.001242,428.851038,468.137258,241.827808,862.750964,2001.56831
1,2015,0.001632,626.100537,695.302379,393.879367,0.0,1715.283915
2,2016,0.0,726.877855,750.038929,532.191843,106.016091,2115.124718
3,2017,0.002344,905.489535,1174.373785,945.681462,506.194696,3531.74182
4,2018,0.002352,1033.496169,1406.527005,1135.127638,931.215826,4506.36899
5,2019,0.004351,1193.496357,1724.038073,1755.632398,1273.640894,5946.812072
6,2020,0.001013,1287.3388,2014.159485,1726.656062,1294.422053,6322.577413
7,2021,0.0,1907.280041,3702.801379,3711.932637,3325.199978,12647.214035
8,2022,0.001514,1772.197505,3146.240449,3402.970443,1068.396607,9389.806517
9,2023,0.002432,1248.028828,2061.554159,1094.310671,114.488262,4518.384351


In [57]:
# Number of deals per year split by deal-size band (second result of
# get_funding_by_year_and_range); holds the last category group processed by the loop.
deal_counts_df

Unnamed: 0,year,n/a,£0-5M,£5-20M,£20-100M,£100M+,total_counts
0,2014,244,667,48,7,1,967
1,2015,317,888,75,13,0,1293
2,2016,434,940,84,14,1,1473
3,2017,562,1004,130,27,3,1726
4,2018,680,1188,148,36,6,2058
5,2019,772,1324,179,42,4,2321
6,2020,808,1334,201,45,5,2393
7,2021,909,1862,356,95,15,3237
8,2022,871,1637,306,91,5,2910
9,2023,670,1146,220,31,1,2068


In [58]:
# Display the deal-type chart inline (the loop saves the same chart as *_investment_types.png).
analysis_crunchbase.chart_investment_types(aggregated_funding_types_df)

In [59]:
# Display the deal-count chart inline (the loop saves the same chart as *_investment_types_counts.png).
analysis_crunchbase.chart_investment_types_counts(aggregated_funding_types_df)

In [60]:
# Display the deal-size chart inline (the loop saves the same chart as *_deal_sizes.png).
analysis_crunchbase.chart_deal_sizes(deals_df)

In [61]:
# Display the deal-size counts chart inline (the loop saves the same chart as *_deal_sizes_counts.png).
analysis_crunchbase.chart_deal_sizes_counts(deal_counts_df)