# Examples of using analysis functionalities

Using the `discovery_utils` analysis functionality with investment data.

Here, we find companies by their categories, but you can also start from the search results produced by the process shown in `cybersec_search.ipynb`.

In [2]:
from discovery_utils.utils import (
    viz_landscape
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/karlis.kanders/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
import pandas as pd
from src import PROJECT_DIR

In [4]:
from discovery_utils.getters import crunchbase
# Create the Crunchbase getter, pointing its vector database path at
# tmp/vector_db under the project directory.
CB = crunchbase.CrunchbaseGetter(vector_db_path=PROJECT_DIR / "tmp/vector_db")

2025-01-22 17:36:06,782 - discovery_utils.getters.crunchbase - INFO - Checking for latest version of data in S3 bucket: discovery-iss
2025-01-22 17:36:06,902 - discovery_utils.getters.crunchbase - INFO - Latest Crunchbase version found: Crunchbase_2025-01-20


In [5]:
# Category names grouped under short labels. A label is picked via
# `save_name` and also ends up in the name of the saved output file.
lists_of_categories = dict(
    biological=["Agtech", "Agriculture"],
    health_diabetes=["Diabetes"],
    health_nutrition=["Dietary Supplements", "Nutrition"],
    economic_retail=["Organic Food", "Grocery", "Snack Food", "Farmers Market"],
    economic_ooh=["Food Delivery", "Restaurants"],
    economic_food_proc=["Food Processing"],
    food_beverage=["Food and Beverage"],
    social=["Recipes"],
)

In [6]:
# Choose which category group to analyse; the label is reused later
# when naming the saved output file.
save_name = "social"
list_of_categories = lists_of_categories[save_name]
print(list_of_categories)

# Companies returned for the chosen categories, matched as "narrow" categories.
selected_df = CB.get_companies_in_categories(
    list_of_categories,
    category_type="narrow",
)

# Unique ids of the selected companies.
matching_ids = set(selected_df["id"].to_list())

2025-01-22 17:36:06,916 - discovery_utils.getters.crunchbase - INFO - Downloading parquet file: data/crunchbase/enriched/organizations_full.parquet
2025-01-22 17:36:07,038 - botocore.httpchecksum - INFO - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-01-22 17:36:07,042 - botocore.httpchecksum - INFO - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-01-22 17:36:07,068 - botocore.httpchecksum - INFO - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-01-22 17:36:07,071 - botocore.httpchecksum - INFO - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-01-22 17:36:07,077 - botocore.httpchecksum - INFO - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', '

['Recipes']


2025-01-22 17:36:07,495 - botocore.httpchecksum - INFO - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-01-22 17:36:07,497 - botocore.httpchecksum - INFO - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-01-22 17:36:07,522 - botocore.httpchecksum - INFO - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-01-22 17:36:07,791 - botocore.httpchecksum - INFO - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-01-22 17:36:07,884 - botocore.httpchecksum - INFO - Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
2025-01-22 17:36:07,894 - botocore.httpchecksum - INFO - Skipping checksum validation. Response did not contain one of the followi

In [7]:
# Optional filter (off by default): keep only ids whose `last_funding_on`
# or `founded_on` value compares greater than '2019'.
only_recent_companies = False
if only_recent_companies:
    recently_funded = CB.organisations_enriched.query("last_funding_on > '2019'")
    recently_founded = CB.organisations_enriched.query("founded_on > '2019'")
    ids_recent = recently_funded.id.to_list()
    ids_new = recently_founded.id.to_list()
    # Union of both groups, then restrict the current selection to it.
    recent_or_new = set(ids_recent) | set(ids_new)
    matching_ids = matching_ids & recent_or_new

In [8]:
# Number of unique company ids selected so far.
len(matching_ids)

1719

In [9]:
# Build a SQL-style filter of the form: id in ('<id1>', '<id2>', ...)
quoted_ids = "', '".join(matching_ids)
id_condition = f"id in ('{quoted_ids}')"

# Fetch the vector-database rows for the matching ids (capped at 30000 rows).
vectors_df = (
    CB.VectorDB.vector_db
    .search()
    .where(id_condition)
    .limit(30000)
    .to_pandas()
)
len(vectors_df)

2025-01-22 17:36:35,709 - root - INFO - Folder /Users/karlis.kanders/Code/discovery_mission_radar_prototyping/tmp/vector_db/crunchbase-lancedb/ already exists. Set overwrite=True to download again.
2025-01-22 17:36:35,712 - root - INFO - Connected with database crunchbase-lancedb. Available tables: ['company_embeddings']
2025-01-22 17:36:35,751 - root - ERROR - Error creating FTS index: Index already exists. Use replace=True to overwrite.


1701

In [10]:
# Generate the landscape figure and its accompanying table from the retrieved vectors.
# NOTE(review): the recorded run log shows HTTP requests to the OpenAI chat
# completions API during this call, so it presumably needs network access
# and API credentials — confirm before re-running.
fig, cb_viz_df = viz_landscape.generate_crunchbase_landscape(vectors_df, CB)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-01-22 17:36:45,260 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-01-22 17:36:46,411 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-01-22 17:36:47,479 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-01-22 17:36:48,487 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-01-22 17:36:49,632 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-01-22 17:36:49,673 - root - INFO - Outliers were successfully reduced


In [11]:
# Save the landscape figure as an HTML file named after the selected category group.
output_path = PROJECT_DIR / f'data/2025_01_MS_ahl/_landscape_{save_name}.html'
# Create the output folder first in case it does not exist yet
# (assumes PROJECT_DIR is a pathlib.Path, as its use with `/` suggests).
output_path.parent.mkdir(parents=True, exist_ok=True)
# Optional: also export the underlying table as CSV.
# cb_viz_df.to_csv(PROJECT_DIR / f"data/2025_01_MS_ahl/_table_{save_name}.csv", index=False)
fig.save(str(output_path))