In [43]:
import pandas as pd
from discovery_utils import PROJECT_DIR
from discovery_utils.utils import google
from discovery_utils.enrichment import crunchbase
import os

DATA_DIR = PROJECT_DIR / "src/enrichment"


In [27]:
def get_category_taxonomy() -> pd.DataFrame:
    """Get the category taxonomy from the Google Sheet and return it as a DataFrame."""
    taxonomy_df = []
    for mission in ["ASF", "AHL", "AFS", "X"]:
        taxonomy_df.append(
            google.access_google_sheet(os.environ["SHEET_ID_KEYWORDS"], mission)
            .drop_duplicates(subset=["Category", "Subcategory"])
            .assign(Mission=mission)
        )
        
    taxonomy_df = (
        pd.concat(taxonomy_df, ignore_index=True)
        .sort_values(["Mission", "Category", "Subcategory"])
        [["Mission", "Category", "Subcategory", "Core"]]
    )

    # Remove rows where Category contains "general terms"
    return taxonomy_df[~taxonomy_df["Category"].str.contains("general terms")]    

In [28]:
taxonomy_df = get_category_taxonomy()

In [62]:
taxonomy_df.head(5)

Unnamed: 0,Mission,Category,Subcategory,Core
44,AFS,Content,Content,core
45,AFS,Delivery of childcare,Child care,core
46,AFS,Delivery of childcare,Preschool,core
47,AFS,Development,Cognitive,core
48,AFS,Development,Communication and language,core


In [None]:
organisations_enriched = pd.read_parquet(DATA_DIR / "organisations_enriched.parquet")
funding_rounds_enriched = pd.read_parquet(DATA_DIR / "funding_rounds_enriched.parquet")

## Check investment opportunity threshold

In [74]:
crunchbase._step_function_decay(13, 10)

array(0.50118723)

## Labels
- Overall number of companies
- Number of companies in the UK and in the geo scope
- Number of companies that are investment opportunities (UK, foreign)

In [32]:
organisations_enriched.columns

Index(['id', 'name', 'type', 'permalink', 'cb_url', 'rank', 'created_at',
       'updated_at', 'legal_name', 'roles', 'domain', 'homepage_url',
       'country_code', 'state_code', 'region', 'city', 'address',
       'postal_code', 'status', 'short_description', 'category_list',
       'category_groups_list', 'num_funding_rounds', 'total_funding_usd',
       'total_funding', 'total_funding_currency_code', 'founded_on',
       'last_funding_on', 'closed_on', 'employee_count', 'email', 'phone',
       'facebook_url', 'linkedin_url', 'twitter_url', 'logo_url', 'alias1',
       'alias2', 'alias3', 'primary_role', 'num_exits', 'employee_count_max',
       'investment_funding_gbp', 'num_investment_rounds', 'grant_funding_gbp',
       'num_grants', 'total_funding_gbp', 'potential_investment_opp',
       'investment_opp', 'interesting_foreign_opp', 'investment_opp_metric',
       'smart_money', 'mission_labels', 'topic_labels',
       'smart_money_investor'],
      dtype='object')

In [75]:
org_labels_df = (
    organisations_enriched[["id", "country_code", "mission_labels", "topic_labels", "potential_investment_opp", "investment_opp", "interesting_foreign_opp", "smart_money", "smart_money_investor", "num_grants"]]
    .assign(mission_labels_list = lambda df: df.mission_labels.fillna("").str.split(","))
    .assign(topic_labels_list = lambda df: df.topic_labels.fillna("").str.split(","))
    .assign(has_mission_label = lambda df: df.mission_labels.fillna("").str.contains("|".join(["ASF", "AHL", "AFS"])))
    .assign(is_potential_investment_opp = lambda df: df.potential_investment_opp >= 1)
    .assign(is_investment_opp = lambda df: df.investment_opp >= 0.75)
    .assign(is_interesting_foreign_opp = lambda df: df.interesting_foreign_opp >= 0.75)
    .assign(is_uk = lambda df: df.country_code == "GBR")
    .assign(is_geo_scope = lambda df: df.country_code.isin(crunchbase.COUNTRIES_SCOPE))
    .assign(is_mission_uk = lambda df: df.has_mission_label & df.is_uk)
    .assign(has_grants = lambda df: df.num_grants > 0)
    .assign(has_grants_is_uk = lambda df: df.has_grants & df.is_uk) 
    .assign(has_smart_money_is_uk = lambda df: df.smart_money & df.is_uk)
    .assign(is_investment_opp_with_smart_money_uk = lambda df: df.is_investment_opp & df.has_smart_money_is_uk)
    .assign(is_potential_investment_opp_with_smart_money_uk = lambda df: df.is_potential_investment_opp & df.has_smart_money_is_uk)
    .assign(is_uk_smart_money_investor = lambda df: df.smart_money_investor & df.is_uk)
    .assign(is_foreign_opp_with_smart_money = lambda df: df.is_interesting_foreign_opp & df.smart_money)
)

In [76]:
summary_missions_df = (
    org_labels_df
    .explode("mission_labels_list")
    .fillna({"mission_labels_list": "none"})
    .dropna(subset=["id", "mission_labels_list"])
    .groupby("mission_labels_list")
    .agg(
        total=("id", "count"),
        total_uk=("is_uk", "sum"),
        uk_potential_investment_opp=("is_potential_investment_opp", "sum"),
        uk_investment_opp=("is_investment_opp", "sum"),
        uk_with_grant=("has_grants_is_uk", "sum"),
        uk_has_smart_money=("has_smart_money_is_uk", "sum"),
        uk_investment_opp_with_smart_money=("is_investment_opp_with_smart_money_uk", "sum"),
        uk_potential_investment_opp_with_smart_money=("is_potential_investment_opp_with_smart_money_uk", "sum"),
        total_geo_scope=("is_geo_scope", "sum"),
        foreign_opp=("is_interesting_foreign_opp", "sum"),
        foreign_opp_with_smart_money=("is_foreign_opp_with_smart_money", "sum"),
        uk_smart_money_investor=("is_uk_smart_money_investor", "sum"),
        total_smart_money_investor=("smart_money_investor", "sum"),
    )
    .reset_index()
)

In [77]:
summary_missions_df

Unnamed: 0,mission_labels_list,total,total_uk,uk_potential_investment_opp,uk_investment_opp,uk_with_grant,uk_has_smart_money,uk_investment_opp_with_smart_money,uk_potential_investment_opp_with_smart_money,total_geo_scope,foreign_opp,foreign_opp_with_smart_money,uk_smart_money_investor,total_smart_money_investor
0,,2238994,161577,6015,2529,1299,2600,543,1452,1616347,86288,23571,4,6
1,AFS,38107,2608,307,91,44,135,23,80,28861,3984,1201,3,31
2,AHL,13643,841,133,54,30,76,14,34,9455,1614,502,2,24
3,ASF,38557,2841,256,157,127,176,54,75,28775,3796,1218,3,54
4,X,1007294,74037,4925,2204,1199,2616,573,1381,723600,71081,20536,6,55


In [78]:
summary_topics_df = (
    org_labels_df
    .explode("topic_labels_list")
    .fillna({"topic_labels_list": "none"})
    .dropna(subset=["id", "topic_labels_list"])
    .groupby("topic_labels_list")
    .agg(
        total=("id", "count"),
        total_uk=("is_uk", "sum"),
        uk_potential_investment_opp=("is_potential_investment_opp", "sum"),
        uk_investment_opp=("is_investment_opp", "sum"),
        uk_with_grant=("has_grants_is_uk", "sum"),
        uk_has_smart_money=("has_smart_money_is_uk", "sum"),
        uk_investment_opp_with_smart_money=("is_investment_opp_with_smart_money_uk", "sum"),
        uk_potential_investment_opp_with_smart_money=("is_potential_investment_opp_with_smart_money_uk", "sum"),
        total_geo_scope=("is_geo_scope", "sum"),
        foreign_opp=("is_interesting_foreign_opp", "sum"),
        foreign_opp_with_smart_money=("is_foreign_opp_with_smart_money", "sum"),
        uk_smart_money_investor=("is_uk_smart_money_investor", "sum"),
        total_smart_money_investor=("smart_money_investor", "sum"),
    )
    .reset_index()
    .merge(taxonomy_df, left_on="topic_labels_list", right_on="Subcategory", how="left")
    .sort_values(["Category", "Subcategory"])
)

In [80]:
# summary_topics_df.to_csv(DATA_DIR / "summary_topics.csv", index=False)

In [82]:
organisations_enriched.columns

Index(['id', 'name', 'type', 'permalink', 'cb_url', 'rank', 'created_at',
       'updated_at', 'legal_name', 'roles', 'domain', 'homepage_url',
       'country_code', 'state_code', 'region', 'city', 'address',
       'postal_code', 'status', 'short_description', 'category_list',
       'category_groups_list', 'num_funding_rounds', 'total_funding_usd',
       'total_funding', 'total_funding_currency_code', 'founded_on',
       'last_funding_on', 'closed_on', 'employee_count', 'email', 'phone',
       'facebook_url', 'linkedin_url', 'twitter_url', 'logo_url', 'alias1',
       'alias2', 'alias3', 'primary_role', 'num_exits', 'employee_count_max',
       'investment_funding_gbp', 'num_investment_rounds', 'grant_funding_gbp',
       'num_grants', 'total_funding_gbp', 'potential_investment_opp',
       'investment_opp', 'interesting_foreign_opp', 'investment_opp_metric',
       'smart_money', 'mission_labels', 'topic_labels',
       'smart_money_investor'],
      dtype='object')

In [97]:
last_funding_df = (
    funding_rounds_enriched
    .sort_values("announced_on", ascending=False)
    .drop_duplicates("org_id", keep="first")
)

1164940              seed
1164939              seed
1164938              seed
1164937       undisclosed
1164936              seed
                ...      
248        series_unknown
249              series_a
251        series_unknown
252              series_a
0          series_unknown
Name: investment_type, Length: 316843, dtype: object

In [111]:
df_export = (
    org_labels_df
    .query("has_mission_label")
    .query("is_geo_scope")
    .query("country_code == 'GBR'")
    .merge(
        organisations_enriched[["id", "name", "short_description", "cb_url", "homepage_url", "employee_count", "founded_on", 'investment_funding_gbp', 'num_investment_rounds','grant_funding_gbp', 'total_funding_gbp', 'last_funding_on', 'investment_opp_metric']],
        on="id",
        how="left"
    )
    .merge(
        last_funding_df[['org_id', 'investment_type']]
        .rename(columns={"investment_type": "last_investment_type"}),
        left_on="id",
        right_on="org_id",
        how="left"
    )
    .drop(columns="org_id")
)

In [112]:
len(df_export)

6228

In [113]:
df_export.to_csv(DATA_DIR / "organisations_with_labels.csv", index=False)