In [1]:
import sys

sys.path.append("..")
import os
from toolkit.detect_entity_networks.api import DetectEntityNetworks
from toolkit.AI.openai_configuration import OpenAIConfiguration
import polars as pl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create the workflow object
den = DetectEntityNetworks()
# Set the AI configuration
ai_configuration = OpenAIConfiguration(
    {
        "api_type": "OpenAI",
        "api_key": os.environ["OPENAI_API_KEY"],
        "model": "gpt-4o",
    }
)
den.set_ai_configuration(ai_configuration)

data_path = "../example_outputs/detect_entity_networks/company_grievances/company_grievances_input.csv"
entity_df = pl.read_csv(data_path)

print("Loaded data")

Loaded data


In [3]:
# set entity-attributes
from toolkit.detect_entity_networks.prepare_model import format_data_columns


entity_id_column = "name"
columns_to_link = ["address", "city", "email", "phone", "owner"]
entity_df = format_data_columns(entity_df, columns_to_link, entity_id_column)
den.add_attribute_links(entity_df, entity_id_column, columns_to_link)

summary = den.get_model_summary_value()
print("Summary")
print(summary)

Summary
Number of entities: 3602, Number of attributes: 18549, Number of flags: 0, Number of groups: 0, Number of links: 41727


In [4]:
# set flags
from toolkit.detect_entity_networks.classes import FlagAggregatorType


entity_id_column = "name"
columns_to_link = [
    "safety_grievances",
    "pay_grievances",
    "conditions_grievances",
    "treatment_grievances",
    "workload_grievances",
]
flag_format = FlagAggregatorType.Count
den.add_flag_links(entity_df, entity_id_column, columns_to_link, flag_format)
summary = den.get_model_summary_value()
print("Summary")
print(summary)

flags shape: (18_010, 4)
┌────────────────────────┬─────────────────────┬─────────────────────┬───────┐
│ entity                 ┆ type                ┆ flag                ┆ count │
│ ---                    ┆ ---                 ┆ ---                 ┆ ---   │
│ str                    ┆ str                 ┆ str                 ┆ i64   │
╞════════════════════════╪═════════════════════╪═════════════════════╪═══════╡
│ BlueWave Marine        ┆ safety_grievances   ┆ safety_grievances   ┆ 0     │
│ Urban Builders Co      ┆ safety_grievances   ┆ safety_grievances   ┆ 18    │
│ Solar Future Corp      ┆ safety_grievances   ┆ safety_grievances   ┆ 0     │
│ Tech Innovators Inc    ┆ safety_grievances   ┆ safety_grievances   ┆ 12    │
│ LearnTech              ┆ safety_grievances   ┆ safety_grievances   ┆ 0     │
│ …                      ┆ …                   ┆ …                   ┆ …     │
│ BuildIt Innovations    ┆ workload_grievances ┆ workload_grievances ┆ 0     │
│ Sports United          ┆ 

In [5]:
# set groups
entity_id_column = "name"
columns_to_link = ["sector", "country"]
den.add_group_links(entity_df, entity_id_column, columns_to_link)

summary = den.get_model_summary_value()
print("Summary")
print(summary)

Summary
Number of entities: 3602, Number of attributes: 18549, Number of flags: 8108, Number of groups: 634, Number of links: 41727


In [6]:
await den.index_nodes(["ENTITY"])
if len(den.embedded_texts) > 0:
    print(f"Number of nodes indexed: {len(den.embedded_texts)}")

Got 3602 existing texts
Got 0 new texts
Number of nodes indexed: 3602


In [None]:
# infer nodes with similar names

threshold = 0.03
den.infer_nodes(threshold)

inferred_links_count = len(den.inferred_links)
if inferred_links_count > 0:
    print(f"Number of links inferred: {inferred_links_count}")
    inferred_df = den.inferred_nodes_df()
    print(inferred_df)
else:
    print("No inferred links")

TypeError: object collections.defaultdict can't be used in 'await' expression

In [None]:
# Remove attributes


In [None]:
den.identify()

[('Enterprise Construction Co', 0, 0, 11, 0, 0, 0.0, 0.0),
 ('Venture Innovations', 0, 0, 11, 0, 0, 0.0, 0.0),
 ('Constructive Enterprises', 0, 0, 11, 0, 0, 0.0, 0.0),
 ('Enterprise Builders Group', 0, 0, 11, 0, 0, 0.0, 0.0),
 ('Constructive Ventures Ltd', 0, 0, 11, 0, 0, 0.0, 0.0),
 ('Balance Health Services', 0, 0, 11, 0, 0, 0.0, 0.0),
 ('ConstructCo Enterprizes', 0, 0, 11, 0, 0, 0.0, 0.0),
 ('Constructive Ventures', 0, 0, 11, 0, 0, 0.0, 0.0),
 ('BuildRight Solutions', 0, 0, 11, 0, 0, 0.0, 0.0),
 ('ConstructCo Enterprises', 0, 0, 11, 0, 0, 0.0, 0.0),
 ('ForestWood Products', 0, 1, 16, 0, 0, 0.0, 0.0),
 ('DesertTech Innovations', 0, 1, 16, 0, 0, 0.0, 0.0),
 ('ForestEdge Lumber Co', 0, 1, 16, 0, 0, 0.0, 0.0),
 ('DesertSun Solar', 0, 1, 16, 0, 0, 0.0, 0.0),
 ('Bark & Leaf Industries', 0, 1, 16, 0, 0, 0.0, 0.0),
 ('Sunshine Power', 0, 1, 16, 0, 0, 0.0, 0.0),
 ('Rays Solar Tech', 0, 1, 16, 0, 0, 0.0, 0.0),
 ('Bark & Branch Co', 0, 1, 16, 0, 0, 0.0, 0.0),
 ('Timberland Crafts', 0, 1, 16, 0

In [None]:
print(den.get_records_summary())
print(f"Attributes removed because of high degree: {len(den.trimmed_attributes)}")
print(den.trimmed_attributes)

Networks identified: 427 (427 with multiple entities, maximum 19)
Attributes removed because of high degree: 443
shape: (443, 2)
┌──────────────────────────┬─────────────────┐
│ Attribute                ┆ Linked Entities │
│ ---                      ┆ ---             │
│ str                      ┆ i64             │
╞══════════════════════════╪═════════════════╡
│ owner==Alex Scholar      ┆ 11              │
│ address==123 Silicon Ave ┆ 11              │
│ city==Breezetown         ┆ 11              │
│ city==ArtCity            ┆ 11              │
│ address==505 Journey Rd  ┆ 11              │
│ …                        ┆ …               │
│ city==Construct City     ┆ 189             │
│ city==Eco City           ┆ 189             │
│ city==Techville          ┆ 223             │
│ phone==5566778899        ┆ 225             │
│ phone==1122334455        ┆ 335             │
└──────────────────────────┴─────────────────┘


In [None]:
# with entities
den.get_entity_df().head()

entity_id,entity_flags,network_id,network_entities,network_flags,flagged,flags/entity,flagged/unflagged
str,i64,i64,i64,i64,i64,f64,f64
"""MediaHub""",10,340,12,86,8,7.17,2.0
"""Broadcast Inno…",7,340,12,86,8,7.17,2.0
"""Media Pioneers…",7,340,12,86,8,7.17,2.0
"""Media Solution…",7,340,12,86,8,7.17,2.0
"""Media Masters …",7,340,12,86,8,7.17,2.0


In [None]:
# with entities and groups
group_df = den.get_grouped_df()
group_df.head()

entity_id,entity_flags,network_id,network_entities,network_flags,flagged,flags/entity,flagged/unflagged,sector,country
str,i64,i64,i64,i64,i64,f64,f64,str,str
"""MediaHub""",10,340,12,86,8,7.17,2.0,"""Media""","""BroadcastLand"""
"""Broadcast Inno…",7,340,12,86,8,7.17,2.0,"""Media""","""Mediatown"""
"""Media Pioneers…",7,340,12,86,8,7.17,2.0,"""Media""","""Broadcastland"""
"""Media Solution…",7,340,12,86,8,7.17,2.0,"""Media""","""Broadcastland"""
"""Media Masters …",7,340,12,86,8,7.17,2.0,"""Media""","""Broadcastland"""


In [None]:
# report
selected_network = 115
selected_entity = ""
report = den.generate_report(selected_network=selected_network)
print(report)

##### Evaluation of Entity Network 115

In this network, we are examining the relationships and potential flag exposure among various entities connected through shared attributes such as phone numbers, email addresses, and physical addresses. The network includes several entities with similar names, which may indicate they are the same real-world entity or closely related entities.

### Entity Connections and Similarities

1. **Wellness Kare Group and Related Entities**:
   - The entity "Wellness Kare Group" is connected to several other entities with similar names, such as "WellCare Clinic," "Wellness Clinic," "Wellness First Clinic," and others. These entities share multiple attributes, including phone numbers, email addresses, and physical addresses, suggesting they may be part of the same organization or network of organizations.
   - The shared phone number 1122334455 and email address contact@wellnesscarecom further strengthen the likelihood that these entities are interconnected