In [1]:
import os
import sys

import polars as pl

# The parent directory is added to sys.path before the toolkit imports below.
sys.path.append("..")
from toolkit.AI.openai_configuration import OpenAIConfiguration
from toolkit.detect_entity_networks.api import DetectEntityNetworks
from toolkit.detect_entity_networks.classes import FlagAggregatorType
from toolkit.detect_entity_networks.prepare_model import format_data_columns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instantiate the Detect Entity Networks workflow.
den = DetectEntityNetworks()

# Build the OpenAI settings (the API key is read from the environment) and
# hand them to the workflow.
ai_configuration = OpenAIConfiguration(
    dict(
        api_type="OpenAI",
        api_key=os.environ["OPENAI_API_KEY"],
        model="gpt-4o",
    )
)
den.set_ai_configuration(ai_configuration)

# Read the example input CSV with polars.
data_path = "../example_outputs/detect_entity_networks/company_grievances/company_grievances_input.csv"
entity_df = pl.read_csv(data_path)

print("Loaded data")

Loaded data


In [3]:
# Register the attribute columns that are used to link entities.
# `format_data_columns` is imported in the notebook's first (imports) cell.
entity_id_column = "name"
columns_to_link = ["address", "city", "email", "phone", "owner"]

# NOTE: `entity_df` is rebound to the formatted frame and is reused by the
# following cells; re-running this cell on its own calls `format_data_columns`
# and `add_attribute_links` again on the already-formatted frame.
entity_df = format_data_columns(entity_df, columns_to_link, entity_id_column)
den.add_attribute_links(entity_df, entity_id_column, columns_to_link)

# Print the model summary after the attribute links have been added.
summary = den.get_model_summary_value()
print("Summary")
print(summary)

Summary
Number of entities: 3602, Number of attributes: 18549, Number of flags: 0, Number of groups: 0, Number of links: 41727


In [4]:
# Register the grievance columns as flag columns, using the Count aggregator.
# `FlagAggregatorType` is imported in the notebook's first (imports) cell.
entity_id_column = "name"
columns_to_link = [
    "safety_grievances",
    "pay_grievances",
    "conditions_grievances",
    "treatment_grievances",
    "workload_grievances",
]
flag_format = FlagAggregatorType.Count
den.add_flag_links(entity_df, entity_id_column, columns_to_link, flag_format)

# Print the model summary after the flags have been added.
summary = den.get_model_summary_value()
print("Summary")
print(summary)

Summary
Number of entities: 3602, Number of attributes: 18549, Number of flags: 8108, Number of groups: 0, Number of links: 41727


In [5]:
# Register the grouping columns (sector, country) for each entity.
entity_id_column, columns_to_link = "name", ["sector", "country"]
den.add_groups(entity_df, entity_id_column, columns_to_link)

# Print the header line followed by the updated model summary.
summary = den.get_model_summary_value()
print("Summary", summary, sep="\n")

Summary
Number of entities: 3602, Number of attributes: 18549, Number of flags: 8108, Number of groups: 634, Number of links: 41727


In [6]:
await den.index_nodes(["ENTITY"])
if len(den.embedded_texts) > 0:
    print(f"Number of nodes indexed: {len(den.embedded_texts)}")

Got 3602 existing texts
Got 0 new texts
Number of nodes indexed: 3602


In [7]:
############REMOVE FN from toolkit.detect_entity_networks.index_and_infer import build_inferred_df


threshold = 0.03
await den.infer_nodes(threshold)

inferred_links_count = len(den.inferred_links)
if inferred_links_count > 0:
    print(f"Number of links inferred: {inferred_links_count}")
    inferred_df = den.inferred_nodes_df()
    print(inferred_df)
else:
    print("No inferred links")

Number of links inferred: 2242
shape: (3_547, 2)
┌───────────────────────┬────────────────────────┐
│ text                  ┆ similar                │
│ ---                   ┆ ---                    │
│ str                   ┆ str                    │
╞═══════════════════════╪════════════════════════╡
│ Adventure Gear        ┆ Adventure Gear Co      │
│ Adventure Gear        ┆ Adventure Gear Company │
│ Adventure Gear        ┆ AdventureGear          │
│ Adventure Gear Co     ┆ Adventure Gear Company │
│ Adventure Gear Co     ┆ AdventureGear Co       │
│ …                     ┆ …                      │
│ WindPower Corp        ┆ WindPower Inc          │
│ WindPower Solutions   ┆ WindPower Solutons     │
│ Windy Heights Limited ┆ Windy Heights Ltd      │
│ WindyCity Energy      ┆ WindyCity Power        │
│ Zephyr Energy Co      ┆ Zephyr Energy Inc      │
└───────────────────────┴────────────────────────┘


In [8]:
# Remove attributes (placeholder: this cell contains no code, so nothing is removed here)


In [9]:
# Run network identification on the model built above; the next cell prints the
# resulting record summary and the attributes trimmed for high degree.
den.identify()

In [10]:
# Summary of the identified networks.
print(den.get_records_summary())

# Attributes that were trimmed for high degree: count first, then the table.
trimmed_attributes = den.trimmed_attributes
print(
    f"Attributes removed because of high degree: {len(trimmed_attributes)}",
    trimmed_attributes,
    sep="\n",
)

Networks identified: 433 (433 with multiple entities, maximum 19)
Attributes removed because of high degree: 443
shape: (443, 2)
┌──────────────────────────┬─────────────────┐
│ Attribute                ┆ Linked Entities │
│ ---                      ┆ ---             │
│ str                      ┆ i64             │
╞══════════════════════════╪═════════════════╡
│ owner==Morgan Build      ┆ 11              │
│ city==ArtCity            ┆ 11              │
│ city==Port City          ┆ 11              │
│ phone==9876543220        ┆ 11              │
│ address==123 Silicon Ave ┆ 11              │
│ …                        ┆ …               │
│ city==Construct City     ┆ 189             │
│ city==Eco City           ┆ 189             │
│ city==Techville          ┆ 223             │
│ phone==5566778899        ┆ 225             │
│ phone==1122334455        ┆ 335             │
└──────────────────────────┴─────────────────┘


In [11]:
# Preview the first rows of the entity-level table; its columns include
# entity_id, entity_flags, network_id and network-level flag statistics.
den.get_entity_df().head()

entity_id,entity_flags,network_id,network_entities,network_flags,flagged,flags/entity,flagged/unflagged
str,i64,i64,i64,i64,i64,f64,f64
"""MediaWave Stud…",6,213,10,93,9,9.3,9.0
"""WaveTech Produ…",7,213,10,93,9,9.3,9.0
"""MediaWave Prod…",16,213,10,93,9,9.3,9.0
"""Creative Visio…",0,213,10,93,9,9.3,9.0
"""Wave Media Gro…",4,213,10,93,9,9.3,9.0


In [12]:
# Entity-level table with the group columns (sector, country) included;
# preview the first rows.
group_df = den.get_grouped_df()
group_df.head()

entity_id,entity_flags,network_id,network_entities,network_flags,flagged,flags/entity,flagged/unflagged,sector,country
str,i64,i64,i64,i64,i64,f64,f64,str,str
"""MediaWave Stud…",6,213,10,93,9,9.3,9.0,"""Media""","""Broadcastland"""
"""WaveTech Produ…",7,213,10,93,9,9.3,9.0,"""Media""","""Broadcastland"""
"""MediaWave Prod…",16,213,10,93,9,9.3,9.0,"""Media""","""Broadcastland"""
"""Creative Visio…",0,213,10,93,9,9.3,9.0,"""Art""","""Artland"""
"""Wave Media Gro…",4,213,10,93,9,9.3,9.0,"""Media""","""Broadcastland"""


In [13]:
# Generate and print the report for a single network.
selected_network = 115
# NOTE(review): `selected_entity` is not passed to `generate_report` in this
# cell — confirm whether it is used elsewhere or can be dropped.
selected_entity = ""
report = den.generate_report(selected_network=selected_network)
print(report)

##### Evaluation of Entity Network 115

In this analysis, we are examining the network of entities connected through shared attributes such as ownership and city location. The network includes several entities, some of which are directly linked to flags, while others are connected indirectly through shared attributes.

**Entities and Connections:**

1. **Farmer Jill** is an owner connected to three entities:
   - **Agri Coop**
   - **AgriLand Produce**
   - A group of entities collectively referred to as **Harvest Co; HarvestTime Co; Harvesters Co; Harvest Corp; HarvestTech Inc; Harvesters Inc; HarvestTech; TechHarvest Ltd; HarvestTech Co**. This group has a flag count of 3.

2. **Agri Coop** is linked to:
   - **Farmer Jill** (owner)
   - **Agri Town** (city)

3. **AgriLand Produce** is linked to:
   - **Farmer Jill** (owner)
   - **Harvest City** (city)

4. The group of entities (Harvest Co, etc.) is linked to:
   - **Farmer Jill** (owner)
   - **Agri Town** (city)
   - **Harvest Cit