# Examples of using analysis functionalities

This notebook uses the analysis functionality in `discovery_utils` to explore investment data.

Here, we'll find companies using their categories, but you can also use search results from the process shown in cybersec_search.ipynb

In [None]:
from discovery_utils.utils import (
    analysis_crunchbase,
    analysis,
    charts
)

In [None]:
import pandas as pd
from src import PROJECT_DIR

In [None]:
from discovery_utils.getters import crunchbase
# Crunchbase getter instance; later cells read organisation data and the vector DB through `CB`
CB = crunchbase.CrunchbaseGetter()

In [None]:
# Point the getter at the vector database and attach a VectorDB handle to it.
# NOTE(review): `vector_db_path` is a relative string while `db_path` below is anchored at
# PROJECT_DIR — confirm both are meant to resolve to the same directory.
CB.vector_db_path = "tmp/vector_db"
CB.VectorDB = crunchbase.embeddings.VectorDB(
    db_path=PROJECT_DIR / "tmp/vector_db",
    db_name="crunchbase-lancedb",
    table_name="company_embeddings",
    model="all-MiniLM-L6-v2",
)

## Selecting companies using our categories

In [None]:
# Boolean mask of organisations whose topic labels mention "Weight".
# na=False makes rows with missing topic_labels evaluate to False; without it
# str.contains returns NaN for them and astype(bool) converts NaN to True,
# which would wrongly select every organisation that has no labels.
has_weight = CB.organisations_enriched.topic_labels.str.contains('Weight', na=False).astype(bool)
orgs_weight_df = CB.organisations_enriched[has_weight]


In [None]:
# Toy lookup: "Publishing" maps to "Yes", any other category falls back to "Nan"
map_dict = {"Publishing": "Yes"}

# First 10 rows, with the category lists expanded to one row per category
exploded_categories = CB.organisation_categories.head(10).explode("category_list")
exploded_categories.category_list.apply(lambda category: map_dict.get(category, "Nan"))

## Selecting companies using CB categories

In [None]:
# Look up 10 narrow categories similar to the obesity / diabetes / nutrition search terms
CB.find_similar_categories("obesity, diabetes, nutrition", category_type="narrow", n_results=10)

In [None]:
# Named groups of category names used to select companies.
# Each key doubles as a label for the group; each value is the list of category names in it.
lists_of_categories = {
    "biological": ["Agtech", "Agriculture"],
    "health_diabetes": ["Diabetes"],
    "health_nutrition": ["Dietary Supplements", "Nutrition"],
    "economic_retail": ["Organic Food", "Grocery", "Snack Food", "Farmers Market"],
    "economic_ooh": ["Food Delivery", "Restaurants"],
    "economic_food_proc": ["Food Processing"],
    "food_beverage": ["Food and Beverage"],
    "social": ["Recipes"]
}

In [None]:
# Key of the category group to analyse; also reused in the output file names
save_name = "social"
list_of_categories = lists_of_categories[save_name]
print(list_of_categories)

# Companies returned for the chosen narrow categories
selected_df = CB.get_companies_in_categories(list_of_categories, category_type="narrow")

# De-duplicated ids of the selected companies
matching_ids = set(selected_df.id.to_list())

In [None]:
# Optional filter (off by default): keep only ids whose organisation has
# last_funding_on or founded_on greater than '2019'
only_recent_companies = False
if only_recent_companies:
    ids_recent = CB.organisations_enriched.query("last_funding_on > '2019'").id.to_list()
    ids_new = CB.organisations_enriched.query("founded_on > '2019'").id.to_list()
    recent_or_new = set(ids_recent) | set(ids_new)
    matching_ids = matching_ids & recent_or_new

In [None]:
# Number of organisation ids selected for the analysis
len(matching_ids)

In [None]:
# Build a SQL-style filter of the form: id in ('<id1>', '<id2>', ...)
quoted_ids = "', '".join(matching_ids)
id_condition = f"id in ('{quoted_ids}')"

# Fetch the matching rows from the vector DB table (capped at 30000 rows)
vectors_df = (
    CB.VectorDB.vector_db
    .search()
    .where(id_condition)
    .limit(30000)
    .to_pandas()
)
len(vectors_df)

In [None]:
import bertopic
import numpy as np
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP
from bertopic.vectorizers import ClassTfidfTransformer

In [None]:
import openai
from bertopic.representation import OpenAI
from bertopic import BERTopic
import os
# Create your representation model
# The API key is read from the OPENAI_API_KEY environment variable (os.getenv returns None when it is unset)
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Representation model backed by gpt-4o-mini in chat mode, with a 1 second delay setting
representation_model = OpenAI(client, model="gpt-4o-mini", delay_in_seconds=1, chat=True)

In [None]:
# Minimum number of documents per cluster/topic (an earlier setting was 15)
MIN_CLUSTER_SIZE = 50

# Initialize BERTopic
# Class-based TF-IDF transformer with the reduce_frequent_words option switched on
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Clustering step handed to BERTopic
custom_hdbscan = HDBSCAN(
    min_cluster_size=MIN_CLUSTER_SIZE,  # Minimum size for clusters
    prediction_data=True,  # Allow prediction for new data points
    cluster_selection_method='leaf',
    metric='euclidean',
)

# Dimensionality reduction step handed to BERTopic: 5 components, fixed random_state
umap_model = UMAP(n_neighbors=15,
            n_components=5,
            min_dist=0.0,
            metric='euclidean',
            low_memory=False,
            random_state=42)

# Assemble the topic model from the custom components defined above
topic_model = BERTopic(
    min_topic_size=MIN_CLUSTER_SIZE,  # Set the minimum size for a cluster
    n_gram_range=(1, 1),  # Set the n-gram range for topic extraction
    verbose=True,  # Enable verbose output for progress tracking
    hdbscan_model=custom_hdbscan,
    ctfidf_model=ctfidf_model,
    umap_model=umap_model,
    representation_model=representation_model,
    nr_topics=10,
)


# Fit the model using precomputed embeddings
# The 'vector' column is stacked into a 2D numpy array, one row per document in 'text'
topics, probs = topic_model.fit_transform(
    vectors_df['text'], 
    embeddings=np.array(vectors_df['vector'].to_list()),
)

In [None]:
# Store the fitted topic id on each row (adds a column to vectors_df in place)
vectors_df['topics'] = topics
# Number of documents per topic id
vectors_df['topics'].value_counts()


In [None]:
# Try to re-assign outlier documents to topics using the precomputed embeddings.
# This step is best-effort: if it fails, the original topic assignments are kept,
# but the reason is reported instead of being silently discarded.
try:
    new_topics = topic_model.reduce_outliers(
        vectors_df['text'].to_list(),
        topics=vectors_df['topics'].to_list(),
        strategy='embeddings',
        embeddings=np.array(vectors_df['vector'].to_list())
    )
    vectors_df['new_topic'] = new_topics
    print("Outliers reduced")
except Exception as e:
    print(f"Outlier reduction failed ({type(e).__name__}: {e}); keeping the original topics")
    vectors_df['new_topic'] = topics
# Make sure the topic ids are plain integers for the merge on topic id later on
vectors_df = vectors_df.assign(new_topic = lambda df: df.new_topic.astype(int))

In [None]:
from umap import UMAP
import altair as alt
# Lift Altair's row limit so charts can be built from the full dataset
alt.data_transformers.disable_max_rows()

In [None]:
# Project the embeddings to 2D for plotting.
# random_state pins the layout so the landscape, and the k-means clusters later fitted on
# these coordinates, can be reproduced across runs; the 5D UMAP used for topic modelling
# is seeded with the same value.
reduced_embeddings = (
    UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42)
    .fit_transform(np.array(vectors_df['vector'].to_list()))
)

In [None]:
# Attach the 2D UMAP coordinates to every row, then left-join BERTopic's topic
# summary table (get_topic_info) on the `new_topic` id
topic_info_df = topic_model.get_topic_info()
viz_df = vectors_df.assign(
    umap_x=reduced_embeddings[:, 0],
    umap_y=reduced_embeddings[:, 1],
).merge(topic_info_df, how='left', left_on='new_topic', right_on='Topic')

In [None]:
# Row count after the merge
len(viz_df)

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import Iterator, Dict
from collections import defaultdict

def simple_tokenizer(x):
    """Split a text string into tokens on whitespace."""
    return x.split()


def cluster_texts(documents: Iterator[str], cluster_labels: Iterator) -> Dict:
    """
    Creates a large text string for each cluster, by joining up the
    text strings (documents) belonging to the same cluster
    Args:
        documents: Text strings (or lists of tokens); any iterable is accepted,
            including lists, pandas Series and generators
        cluster_labels: Cluster labels, one per document and in the same order,
            indicating the membership of the documents
    Returns:
        A dictionary where keys are cluster labels, and values are cluster text documents
        (space-joined strings for string documents, concatenated lists for list documents).
        Empty input gives an empty dictionary.
    """
    # Materialise both inputs so that len() and positional pairing work for any iterable
    # (e.g. a pandas Series whose index does not start at 0, or a generator)
    documents = list(documents)
    cluster_labels = list(cluster_labels)
    assert len(documents) == len(cluster_labels), "documents and cluster_labels differ in length"

    # Nothing to join; also avoids indexing into an empty list below
    if not documents:
        return defaultdict(str)

    # The type of the first document decides how documents are combined
    doc_type = type(documents[0])

    cluster_text_dict = defaultdict(doc_type)
    for label, doc in zip(cluster_labels, documents):
        if doc_type is str:
            cluster_text_dict[label] += doc + " "
        elif doc_type is list:
            cluster_text_dict[label] += doc
    return cluster_text_dict


def cluster_keywords(
    documents: Iterator[str],
    cluster_labels: Iterator[int],
    n: int = 10,
    tokenizer=simple_tokenizer,
    max_df: float = 0.90,
    min_df: float = 0.01,
    Vectorizer=TfidfVectorizer,
) -> Dict:
    """
    Picks the highest-scoring tokens of every cluster, scored by the given Vectorizer
    Args:
        documents: List of (preprocessed) text documents
        cluster_labels: List of integer cluster labels, one per document
        n: Number of top keywords to return per cluster
        tokenizer: Function that turns a document into tokens; by default splits on whitespace
        max_df: Passed to the Vectorizer as max_df
        min_df: Passed to the Vectorizer as min_df
        Vectorizer: Vectorizer class to use (eg, TfidfVectorizer, CountVectorizer)
    Returns:
        Dictionary that maps each cluster label to its list of keywords
    """
    # The identity preprocessor and token_pattern=None leave tokenisation to `tokenizer`
    vectorizer = Vectorizer(
        analyzer="word",
        tokenizer=tokenizer,
        preprocessor=lambda x: x,
        token_pattern=None,
        max_df=max_df,
        min_df=min_df,
        max_features=10000,
    )

    # One combined document per cluster; remember the label of each row
    texts_by_cluster = cluster_texts(documents, cluster_labels)
    labels_in_order = list(texts_by_cluster.keys())

    # Rows are clusters, columns are tokens
    score_matrix = vectorizer.fit_transform(list(texts_by_cluster.values()))

    # Invert the vocabulary (token -> column) into column -> token
    index_to_token = {column: token for token, column in vectorizer.vocabulary_.items()}

    keywords_by_cluster = {}
    for row_number, label in enumerate(labels_in_order):
        row_scores = np.array(score_matrix[row_number, :].todense())
        # Column indices ordered from highest to lowest score
        ranked_columns = list(np.flip(np.argsort(row_scores))[0])
        keywords_by_cluster[label] = [index_to_token[column] for column in ranked_columns[0:n]]

    return keywords_by_cluster

In [None]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
import nltk

# stopwords.words() needs the NLTK "stopwords" corpus and the lemmatizer needs "wordnet".
# Download both up front so this cell also runs on a fresh environment;
# nltk.download does not re-fetch a corpus that is already installed and up to date.
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)

lemmatizer = WordNetLemmatizer()
# English stopword list; append domain-specific words here if needed, e.g. + ["family", "aged", "object"]
full_stopwords = stopwords.words("english")
def preproc(text: str) -> str:
    """Keep only letters and spaces, lowercase, lemmatise each word, then drop stopwords."""
    letters_only = re.sub(r"[^a-zA-Z ]+", "", text).lower()
    lemmas = (lemmatizer.lemmatize(word) for word in letters_only.split())
    return " ".join(lemma for lemma in lemmas if lemma not in full_stopwords)

In [None]:
# Partition the 2D UMAP coordinates into 35 k-means clusters (fixed random_state)
clusterer = KMeans(n_clusters=35, random_state=10)
clusterer.fit(viz_df[["umap_x", "umap_y"]])
# One cluster label per row of viz_df
soft_clusters = list(clusterer.labels_)
# NOTE(review): every element of soft_clusters is a single label, so np.argmax(x) is 0 for
# each row; `soft_cluster` is not used in any cell visible here — looks like a leftover,
# confirm before removing
soft_cluster = [np.argmax(x) for x in soft_clusters]

In [None]:
import nltk
# Download the WordNet corpus for NLTK (the WordNetLemmatizer used in preproc relies on it)
nltk.download('wordnet')

In [None]:
# Clean every text, then concatenate the cleaned texts per k-means cluster
title_texts = viz_df["text"].map(preproc)
_cluster_texts = cluster_texts(documents=title_texts, cluster_labels=soft_clusters)

In [None]:
# Top 2 keywords per cluster: the per-cluster texts are passed as the documents
# and their cluster labels as the labels, so each cluster is a single document
_cluster_keywords = cluster_keywords(
    documents=list(_cluster_texts.values()),
    cluster_labels=list(_cluster_texts.keys()),
    n=2,
    max_df=0.90,
    min_df=0.01,
    Vectorizer=TfidfVectorizer,
)

In [None]:
# Store the k-means label on each row, once as-is and once converted to a string
# (both columns are added to viz_df in place)
viz_df["soft_cluster"] = soft_clusters
viz_df["soft_cluster_"] = [str(x) for x in soft_clusters]


In [None]:
# Mean UMAP position of each k-means cluster
cluster_means = (
    viz_df.groupby("soft_cluster")
    .agg(x_c=("umap_x", "mean"), y_c=("umap_y", "mean"))
    .reset_index()
)

# Add a comma-separated keyword label per cluster
centroids = cluster_means.assign(
    keywords=lambda means_df: means_df.soft_cluster.apply(
        lambda label: ", ".join(_cluster_keywords[label])
    )
)

In [None]:
import altair as alt
# Lift Altair's row limit (the same call is also made in an earlier cell)
alt.data_transformers.disable_max_rows()

In [None]:
# Text layer: one bold label per cluster, placed at the cluster's mean UMAP position
# and showing its keywords; a faint white stroke is drawn around the glyphs
text = (
    alt.Chart(centroids)
    .mark_text(
        fontSize=13.5,
        fontStyle="bold",
        opacity=0.8,
        stroke="white",
        strokeWidth=1,
        strokeOffset=0,
        strokeOpacity=0.4,
    )
    .encode(x=alt.X("x_c:Q"), y=alt.Y("y_c:Q"), text=alt.Text("keywords"))
)

In [None]:
# Inspect the columns available for the chart
viz_df.columns

In [None]:
# Funding columns for the organisations that appear in viz_df
landscape_ids = viz_df.id.to_list()
orgs_df = CB.organisations_enriched.query("id in @landscape_ids")[
    ["id", "last_funding_on", "total_funding_gbp"]
]

In [None]:
# Three-letter country codes grouped by reporting region.
# Every code appears exactly once. The previous flat dict literal listed some codes under
# two regions, where Python silently keeps the last entry; the groups below keep those
# effective assignments:
#   - ARM stays in Asia (it was also listed under Europe)
#   - ARE, KWT, OMN, IRQ, IRN, SYR, JOR, LBN, ISR stay in Middle East (also listed under Asia)
#   - IMN stays in Rest of the World (also listed under Europe)
# NOTE(review): GGY and JEY are in Europe while IMN is not — confirm which region is intended.
_COUNTRIES_BY_REGION = {
    'North America + Australia': ['USA', 'CAN', 'AUS', 'NZL'],
    # Includes Mexico and Central America
    'South + Central America': [
        'VEN', 'ARG', 'BRA', 'CHL', 'COL', 'PER', 'URY', 'PRY', 'ECU', 'BOL',
        'GUY', 'SUR', 'MEX', 'CRI', 'SLV', 'GTM', 'HND', 'PAN', 'NIC',
    ],
    'Europe': [
        'IRL', 'LUX', 'CHE', 'ESP', 'DEU', 'FRA', 'FIN', 'SWE', 'NLD', 'BEL',
        'DNK', 'CZE', 'POL', 'EST', 'AUT', 'ITA', 'ROU', 'CYP', 'NOR', 'PRT',
        'BGR', 'BLR', 'SVN', 'HUN', 'ISL', 'LVA', 'LTU', 'HRV', 'MKD', 'BIH',
        'SRB', 'SVK', 'GEO', 'MDA', 'ALB', 'SMR', 'AND', 'GIB', 'FRO', 'LIE',
        'GGY', 'JEY', 'ALA',
    ],
    'UK': ['GBR'],
    'Asia': [
        'IND', 'HKG', 'RUS', 'KOR', 'SGP', 'JPN', 'CHN', 'PHL', 'IDN', 'THA',
        'TUR', 'MYS', 'TWN', 'PAK', 'ARM', 'BGD', 'VNM', 'MDV', 'LKA', 'KAZ',
        'UZB', 'PSE', 'TJK', 'BTN', 'TLS', 'MAC', 'MMR', 'MNG', 'KHM', 'LAO',
        'BRN',
    ],
    # Add more African countries as needed...
    'Africa': [
        'ZAF', 'MUS', 'EGY', 'GHA', 'KEN', 'NGA', 'MAR', 'CIV', 'ETH', 'TUN',
        'MOZ', 'UGA', 'SEN', 'ZWE', 'RWA', 'SDN',
    ],
    'Middle East': [
        'SAU', 'ARE', 'KWT', 'QAT', 'OMN', 'IRQ', 'IRN', 'SYR', 'JOR', 'LBN',
        'ISR', 'YEM',
    ],
    # None covers rows without a country code; add any small or undefined territories here...
    'Rest of the World': [None, 'BMU', 'TTO', 'GLP', 'CYM', 'IMN'],
}

# Flat lookup: country code -> region name
region_mapping = {}
for _region, _codes in _COUNTRIES_BY_REGION.items():
    for _code in _codes:
        # Guard against a code being listed twice, which would silently keep only the last region
        assert _code not in region_mapping, f"{_code!r} is assigned to more than one region"
        region_mapping[_code] = _region

In [None]:
# Build the table behind the landscape chart; the steps below depend on their order
_viz_df = (
    # Add last_funding_on and total_funding_gbp for each organisation
    viz_df.merge(orgs_df, on="id", how="left")
    # Flag rows whose last_funding_on compares greater than "2019" ...
    .assign(recent_funding = lambda x: x.last_funding_on > "2019")
    # ... and store the flag as text ("True" / "False")
    .astype({"recent_funding": str})
    # Friendlier column names for the chart; 'Name' comes from the topic info table
    .rename(columns={'name': 'title', 'text': 'description', 'Name': 'category'})
    # Treat missing funding as 0
    .fillna({'total_funding_gbp': 0})
    # Divide the funding by 1000 and round to 3 decimals
    # NOTE(review): the resulting unit depends on the unit of the source column — confirm
    .assign(total_funding_gbp = lambda df: df.total_funding_gbp.apply(lambda x: round(x/1e+3,3)))
    # Map each country code to a region; unmapped codes fall back to "Rest of the World"
    .assign(region = lambda x: x.country_code.apply(lambda y: region_mapping.get(y, "Rest of the World")))
)

In [None]:
# How many rows fall under each value of the recent_funding flag
_viz_df.recent_funding.value_counts()

In [None]:
def _dropdown_filter(column, label, param_name):
    """Build a dropdown binding and a point selection that filters on one column of `_viz_df`.

    Args:
        column: Name of the `_viz_df` column to filter on
        label: Text shown next to the dropdown
        param_name: Name given to the selection parameter
    Returns:
        Tuple of (dropdown binding, selection parameter)
    """
    binding = alt.binding_select(
        # None is offered first so the filter can be reset
        options=[None] + sorted(_viz_df[column].unique()),
        name=label,
    )
    selection = alt.selection_point(fields=[column], bind=binding, name=param_name)
    return binding, selection


# Dropdown menus for 'recent_funding', 'category' (topic name) and 'region'
recent_funding_dropdown, recent_funding_selection = _dropdown_filter(
    'recent_funding', "Recent Funding:", "SelectFunding"
)
name_dropdown, name_selection = _dropdown_filter('category', "Category:", "SelectName")
region_dropdown, region_selection = _dropdown_filter('region', "Region:", "SelectRegion")

# Scatterplot with dropdown filters
fig = (
    alt.Chart(_viz_df, width=900, height=750)
    .mark_point(size=30, opacity=0.5)
    .encode(
        x=alt.X("umap_x:Q", axis=None),
        y=alt.Y("umap_y:Q", axis=None),
        tooltip=["title", "description", "country_code", "region", "category", "homepage_url", 'last_funding_on', 'total_funding_gbp'],
        color=alt.Color("category", legend=alt.Legend(title="Category",  labelLimit=300)),
        # The flag is computed as last_funding_on > "2019", which includes funding during 2019,
        # so the legend says "since 2019" to match the data
        shape=alt.Shape("recent_funding", legend=alt.Legend(title="Recent funding (since 2019)")),
        # Points that fail any of the three filters are made fully transparent
        opacity=alt.condition(
            recent_funding_selection & name_selection & region_selection, alt.value(0.5), alt.value(0.0)
        ),
        # Clicking a point opens the company's homepage
        href="homepage_url"
    )
    .add_params(
        recent_funding_selection,
        name_selection,
        region_selection,
    )
    .interactive()
)

In [None]:
# Overlay the cluster keyword labels on the scatterplot
layered_chart = fig + text

# White grid lines, no axis domain line and no view border, with pan/zoom enabled
fig_final = (
    layered_chart
    .configure_axis(gridColor="white", domain=False)
    .configure_view(strokeWidth=0, strokeOpacity=0)
    .interactive()
)

# fig_final


In [None]:
# Write the chart (HTML) and the table behind it (CSV) to the project data folder;
# `save_name` keeps the outputs of different category groups apart.
# Assumes PROJECT_DIR is a pathlib.Path, as its use with the `/` operator suggests.
output_dir = PROJECT_DIR / "data/2025_01_MS_ahl"
# Create the folder if it is missing so the writes below do not fail on a fresh checkout
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / f'landscape_{save_name}.html'
_viz_df.to_csv(output_dir / f"table_{save_name}.csv", index=False)
fig_final.save(str(output_path))

Now let's generate some basic time series

In [None]:
# Time series by year, for the years 2014 to 2024
# NOTE(review): `matchings_orgs_df` and `funding_rounds_df` are not defined in any cell
# visible in this notebook — they must be created before this cell can run on a fresh kernel
ts_df = analysis_crunchbase.get_timeseries(matchings_orgs_df, funding_rounds_df, period='year', min_year=2014, max_year=2024)
ts_df

In [None]:
# Bar chart of the `raised_amount_gbp_total` column of ts_df
# NOTE(review): the axis title says "£ millions" — confirm the column is in those units
fig = charts.ts_bar(
    ts_df,
    variable='raised_amount_gbp_total',
    variable_title="Raised amount, £ millions",
    category_column="_category",
)
charts.configure_plots(fig, chart_title="")

Let's look into the breakdown of deal types

In [None]:
# Funding by year and deal-size range (2014 to 2024), plus an aggregation by funding round type
# NOTE(review): `funding_rounds_df` is not defined in any cell visible in this notebook
deals_df, deal_counts_df = analysis_crunchbase.get_funding_by_year_and_range(funding_rounds_df, 2014, 2024)
aggregated_funding_types_df = analysis_crunchbase.aggregate_by_funding_round_types(funding_rounds_df)

In [None]:
# Inspect the aggregation by funding round type
aggregated_funding_types_df

In [None]:
# Inspect the first table returned by get_funding_by_year_and_range
deals_df

In [None]:
# Inspect the second table returned by get_funding_by_year_and_range
deal_counts_df

In [None]:
# Chart built from the funding-round-type aggregation (presumably amounts per type — confirm)
analysis_crunchbase.chart_investment_types(aggregated_funding_types_df)

In [None]:
# Counts variant of the funding-round-type chart, from the same aggregation
analysis_crunchbase.chart_investment_types_counts(aggregated_funding_types_df)

In [None]:
# Deal-size chart built from deals_df
analysis_crunchbase.chart_deal_sizes(deals_df)

In [None]:
# Counts variant of the deal-size chart, built from deal_counts_df
analysis_crunchbase.chart_deal_sizes_counts(deal_counts_df)