In [1]:
import re
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from zipfile import ZipFile
from io import BytesIO
import polars as pl
import multiprocessing
from multiprocessing import Pool, cpu_count
from urllib.parse import urljoin
import concurrent.futures
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import json
from datetime import datetime

In [2]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

pl.Config.set_tbl_cols(100)  # mostra fins a 100 columnes
pl.Config.set_tbl_width_chars(100)  # amplia l'ample màxim de la taula

polars.config.Config

In [3]:
df = pl.read_parquet('data/FILTERED_DATAFRAME.parquet')
initially_imported_df = df.clone()
df.head()

state,date,month_year,year,event_code,quad_class,goldstein_scale,avg_tone,actor1_statecode,actor2_statecode,url,title,full_text
str,i64,i64,i64,i64,i64,f64,f64,str,str,str,str,str
"""USMO""",20210514,202105,2021,16,1,-2.0,-8.934073,"""USMO""","""USMO""","""https://www.natlawreview.com/a…","""State of the Law for Business …","""It’s been a year since COVID-1…"
"""USMO""",20210514,202105,2021,141,3,-6.5,-0.808625,"""USMO""","""USMO""","""https://www.kcur.org/health/20…","""Medicaid Expansion Supporters …","""A day after Missouri Gov. Mike…"
"""USMO""",20210529,202105,2021,13,1,0.4,-6.008584,"""USMO""","""USMO""","""https://www.dailystar.co.uk/ne…","""Elderly woman sucker-punched t…","""Elderly woman sucker-punched t…"
"""USAR""",20200207,202002,2020,16,1,-2.0,-8.0,"""USAR""",,"""https://www.houstonchronicle.c…",,
"""USNH""",20201206,202012,2020,70,2,7.0,0.088106,"""USNH""","""USNH""","""https://www.fosters.com/story/…","""Historically Speaking: Adventu…","""Historically Speaking: Adventu…"


# Resources

## LLM
We are going to use, for now, the free tier of Gemini 1.5 Pro.
Reasons:
* out of all LLMs, Gemini's free tiers is the one with the best conditions
* as per the specific model, 1.5 Pro "has the best blend of reasoning, context length (1M tokens), and accuracy for structured tasks like entity and relation extraction" (quoting chatgpt) -- only con is that it's kinda slow (throughput is 2 requests/minute)
* there's also a 1.5 Lite version - which is faster in processing at the expense of accuracy (difference should be minimal though) -- we will try to stick to the Pro version for now because accuracy is of the utmost important in our use case

⚠️ turns out LangChain doesn't support tool usage for Gemini ⚠️
options:
* not using LangChain and accessing the llm through the API - this was our original intention but it seems less straightforward
* doing it in Prompt-Based mode - this means prompting the LLM to set up the nodes and relationships

We are going to do the first option.

Things to know:
* LangChain organises the data using `pydantic`, a python library created exactly for this purpose ([more info here](https://www.geeksforgeeks.org/introduction-to-python-pydantic-library/)). What pydantic does is offer a set of classes (to be inherited by the classes created by the user). The goal of this is to have a base structure for the data so that all objects belong to one of these classes and so that all objects of the same type behave the same way. It is commonly used when building applications.
* The way to do this without LangChain is to create the classes in JSON format, so that the LLM (Gemini) understands them. 

we are gonna do both, paralelly, for now.

### **About Pydantic and LangChain classes:**
LangChain has the parent class BaseModel.
Pydantic has Node and Relationship (here we call them BaseNode and BaseRelationship to avoid confusion with our classes). Both of them inherit from [Serializable](https://python.langchain.com/api_reference/core/load/langchain_core.load.serializable.Serializable.html#langchain_core.load.serializable.Serializable), which is a class that allows serialization of objects to JSON, optionally (`is_lc_serializable: bool`).

* The [BaseNode](https://python.langchain.com/api_reference/community/graphs/langchain_community.graphs.graph_document.Node.html) class which has attributes `id`, `type`, and `properties`. We specify them below because we include their descriptions so they are passed to the LLM, as the whole point of defining these classes is for the LLM to understand our desired schema for the graph.

* Simlarly, the [BaseRelationship](https://python.langchain.com/api_reference/community/graphs/langchain_community.graphs.graph_document.Relationship.html) has attributes `source`, `target`, `type`, and `properties`, where source and target are instances of the Node class.


* [Field()](https://docs.pydantic.dev/latest/concepts/fields/) - a function used to add specifications to an attribute beyond its type, including constraints and description.
    * `...` - means that attribute is required - aka must be defined (you can also write *default* instead of ...)
    * `gt`, `lt`, `ge`, `le` -- numeric constraints >, <, >=, <=
    * `min_length`, `max_length`
    * `example`

In [4]:
# !pip install -U langchain-community
# !pip install neo4j
# !pip install google-generativeai

In [5]:
from pydantic import BaseModel, Field # base model from pydantic
from enum import Enum
from typing import List, Optional, Dict, Any

# base classes from LangChain
from langchain_community.graphs.graph_document import (Node as BaseNode,
                                                       Relationship as BaseRelationship,
                                                       GraphDocument,)

#### **NODES**

In JSON there's no such thing as parent-child classes but we can try to mimic is. First we do the Node parent class.

"parameters":
* id - human-readable name i.e. Barack Obama
* label - type of entitie i.e. Person/Actor
    * this is a closed list called that goes into the parameter `enum` 
    * i.e. ['Actor', 'Event', 'Place']
* properties - extra info/metadata

In [6]:
### LISTS OF POTENTIAL VALUES 
# enum possible values for each class    
enum_nodes = ["Actor", "Place", "Event"]
enum_properties = ['actor_type']

# actor types (CAMEO)
cameo_actor_dict = {
    "COP": "Police Forces",
    "GOV": "Government",
    "INS": "Insurgents",
    "JUD": "Judiciary",
    "MIL": "Military",
    "OPP": "Political Opposition",
    "REB": "Rebels",
    "SEP": "Separatist Rebels",
    "SPY": "State Intelligence",
    "UAF": "Unaligned Armed Forces",
    "AGR": "Agriculture",
    "BUS": "Business",
    "CRM": "Criminal",
    "CVL": "Civilian",
    "DEV": "Development",
    "EDU": "Education",
    "ELI": "Elites",
    "ENV": "Environmental",
    "HLH": "Health",
    "HRI": "Human Rights",
    "LAB": "Labor",
    "LEG": "Legislature",
    "MED": "Media",
    "REF": "Refugees",
    "MOD": "Moderate",
    "RAD": "Radical",
    "AMN": "Amnesty International",
    "IRC": "Red Cross",
    "GRP": "Greenpeace",
    "UNO": "United Nations",
    "PKO": "Peacekeepers",
    "UIS": "Unidentified State Actor",
    "IGO": "Inter-Governmental Organization",
    "IMG": "International Militarized Group",
    "INT": "International/Transnational Generic",
    "MNC": "Multinational Corporation",
    "NGM": "Non-Governmental Movement",
    "NGO": "Non-Governmental Organization",
    "SET": "Settler"
}

cameo_actor_names = list(cameo_actor_dict.values())
cameo_actor_codes = list(cameo_actor_dict.keys())

In [None]:
##### PARENT CLASS #####

## pydantic ##

class Property(BaseModel):
    """A single property consisting of key and value.
    Mainly used for models that do not support dictionaries as object values (OpenAI)"""
    key: str = Field(..., description=f"Available options are {enum_properties}. If the type is Actor, include 'actor_type' as a property.")
    value: str = Field(..., description="If key is 'actor_type', value must be one of: " +  ", ".join(f"{code} ({label})" for code, label in cameo_actor_names)
    )

class Node(BaseNode):
    """ Inherits from LangChain's Node class. We specify id and type for the LLM"""
    id: str = Field(..., description="Name or human-readable unique identifier")
    type: str = Field(..., description=f"Type of entity. Available options are {enum_nodes}") # enum_values to be defined
    properties: Optional[List[Property]]

## json ##

json_node = {
    "type": "object",
    "properties": {
        "id": {
            "type": "string",
            "description": "A unique, human-readable name for the node, e.g., 'Barack Obama' or 'New York City'"
        },
        "label": {
            "type": "string",
            "description": "The type of node. One of: ['Person', 'Organization', 'Place', 'Event']",
            "enum": ["Person", "Organization", "Place", "Event"]  # <-- this limits node types. equivalent to enum_values
        },
        "properties": {
            "type": "array",
            "description": "Optional metadata about the node",
            "items": {
                "type": "object",
                "properties": {
                    "key": {"type": "string"},
                    "value": {"type": "string"}
                },
                "required": ["key", "value"]
            }
        }
    },
    "required": ["id", "label"]
}


#### **RELATIONSHIPS**

As explain above has attributes source, target, type and properties.

Source and target *must* be instances of the Node class. However, some LLMs do not handle well nested structures. The solution to this would be to flatten these attributes so we would have the following:

`source_node_id: str` 

`source_node_label: str`

`target_node_id: str`

`target_node_label: str`

Below, we set up the *json_relationship* with the same nested structure (so inside the field "source" we put all the same things as in the json_node variable) to mimic the nested structure of the classes. 

**Making them temporal** \\
The temporal nature of a TKG manifests in the _relationships_. To make them temporal we include a *start_date* and *end_date* as attributes of the class. Here we leave a static Relationship class and make the TemporalRelationship as a child class.

In [7]:
import pydantic
print(pydantic.__version__)


2.11.3


In [8]:
## pydantic ##
    
class Relationship(BaseRelationship):
    """
    A single relationship consisting of a source node, target node, and type.
    """
    source: Node = Field(..., description="Source node object participating in the relationship")
    target: Node = Field(..., description="Target node object participating in the relationship")
    type: str = Field(..., description="Type of relationship such as 'is_a', 'located_in', etc.")
    properties: Optional[List[Property]] = Field(None, description="List of relationship properties")


class TemporalRelationship(Relationship):
    """
    A single temporal relationship consisting of a source node, target node, type, start time, and end time.
    """
    start_time: Optional[datetime] = Field(
        None, description="The start time, meaning the time at which the relationship became valid."
    )
    end_time: Optional[datetime] = Field(
        None, description="The end time, meaning the time at which the relationship ceased to be valid."
    )


## json ##

json_relationship = {
  "$defs": {
    "Node": {
      "description": "Inherits from LangChain's Node class. We specify id and label for the LLM",
      "properties": {
        "id": {
          "description": "Name or human-readable unique identifier",
          "title": "Id",
          "type": "string"
        },
        "type": {
          "description": "Available options are ['Person', 'Organization', 'Place', 'Event']",
          "title": "Type",
          "type": "string"
        },
        "properties": {
          "anyOf": [
            {
              "items": {
                "$ref": "#/$defs/Property"
              },
              "type": "array"
            },
            {
              "type": "null"
            }
          ],
          "title": "Properties"
        }
      },
      "required": [
        "id",
        "type",
        "properties"
      ],
      "title": "Node",
      "type": "object"
    },
    "Property": {
      "description": "A single property consisting of key and value.\nMainly used for models that do not support dictionaries as object values (OpenAI)",
      "properties": {
        "key": {
          "description": "Available options are []",
          "title": "Key",
          "type": "string"
        },
        "value": {
          "description": "value",
          "title": "Value",
          "type": "string"
        }
      },
      "required": [
        "key",
        "value"
      ],
      "title": "Property",
      "type": "object"
    }
  },
  "description": "A single relationship consisting of a source node, target node, and type.",
  "properties": {
    "source": {
      "$ref": "#/$defs/Node",
      "description": "Source node object participating in the relationship"
    },
    "target": {
      "$ref": "#/$defs/Node",
      "description": "Target node object participating in the relationship"
    },
    "type": {
      "description": "Type of relationship such as 'is_a', 'located_in', etc.",
      "title": "Type",
      "type": "string"
    },
    "properties": {
      "anyOf": [
        {
          "items": {
            "$ref": "#/$defs/Property"
          },
          "type": "array"
        },
        {
          "type": "null"
        }
      ],
      "default": 'null',
      "description": "List of relationship properties",
      "title": "Properties"
    }
  },
  "required": [
    "source",
    "target",
    "type"
  ],
  "title": "Relationship",
  "type": "object"
}


You can print the schema by doing this below. 

* The `$defs` section holds reusable schema definitions. If a schema contains nested models (like Node or Property), those sub-models will be defined once inside `$defs`.
* A `$ref` is a pointer to one of those definitions in $defs.

Benefits:
* It helps avoid duplication and circular references when nesting models.
* Makes complex schemas modular and efficient
* Helps validators, code generators, and other tools understand and reuse components without duplication

In [9]:
# print(json.dumps(Relationship.model_json_schema(), indent=2))  # Pydantic v2

#### **KNOWLEDGE GRAPH**

In [51]:
class KnowledgeGraph(BaseModel):
    """A knowledge graph composed of entities (nodes) and their relationships."""
    
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph"
    )
    
    relationships: List[Relationship] = Field(
        ..., description="List of relationships between the nodes"
    )
    def model_post_init(self, __context) -> None:
        node_ids = {node.id for node in self.nodes}

        # for rel in self.relationships:
        #     source_id = rel.source.id if hasattr(rel.source, "id") else rel.source
        #     target_id = rel.target.id if hasattr(rel.target, "id") else rel.target

        #     if source_id not in node_ids:
        #         raise ValueError(f"Relationship source '{source_id}' not found in nodes")

        #     if target_id not in node_ids:
        #         raise ValueError(f"Relationship target '{target_id}' not found in nodes")
            
        #     if isinstance(rel, TemporalRelationship):
        #         if rel.start_time and rel.end_time and rel.start_time > rel.end_time:
        #             raise ValueError(f"Start time {rel.start_time} is after end time {rel.end_time} in relationship {rel}")


In [None]:
print(json.dumps(KnowledgeGraph.model_json_schema(), indent=2))  # Pydantic v2

# Example KG

In [None]:
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai


  from .autonotebook import tqdm as notebook_tqdm


In [14]:
os.getenv("GEMINI_API_KEY") is not None

True

In [27]:
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("GEMINI_API_KEY not set in environment.")

llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",  # use actual available model name
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    google_api_key = api_key
)

schema = KnowledgeGraph.model_json_schema()
document = initially_imported_df[1,-1]

prompt = f"""
You are an expert in knowledge representation. 
Read the input text and extract a structured knowledge graph.

The output must strictly conform to the following JSON schema:

{schema}

Text to analyze:
{document}

Return only the JSON.
"""

response = llm.invoke(prompt)
response.content

'```json\n{"nodes": [{"id": "Mike Parson", "type": "Person", "properties": [{"key": "position", "value": "Governor of Missouri"}, {"key": "party", "value": "Republican"}]}, {"id": "Missouri", "type": "Place", "properties": []}, {"id": "Medicaid Expansion", "type": "Event", "properties": []}, {"id": "Kansas City", "type": "Place", "properties": []}, {"id": "Health Care Advocacy Groups", "type": "Organization", "properties": []}, {"id": "Medicaid", "type": "Event", "properties": []}, {"id": "Trinity United Methodist Church", "type": "Place", "properties": [{"key": "location", "value": "Kansas City"}]}, {"id": "Rev. Vernon Howard", "type": "Person", "properties": [{"key": "position", "value": "President of the Kansas City chapter of the Southern Christian Leadership Conference"}]}, {"id": "Jameson Wells", "type": "Person", "properties": [{"key": "position", "value": "Jobs With Justice organizer"}]}, {"id": "Centers For Medicare and Medicaid Services", "type": "Organization", "properties":

In [None]:
import pprint

pprint.pprint(response.content)

In [21]:
response

AIMessage(content='```json\n{"nodes": [{"id": "Mike Parson", "type": "Person", "properties": [{"key": "position", "value": "Governor of Missouri"}, {"key": "party", "value": "Republican"}]}, {"id": "Missouri", "type": "Place", "properties": []}, {"id": "Medicaid Expansion", "type": "Event", "properties": []}, {"id": "Kansas City", "type": "Place", "properties": []}, {"id": "Health Care Advocacy Groups", "type": "Organization", "properties": []}, {"id": "Medicaid", "type": "Event", "properties": []}, {"id": "Trinity United Methodist Church", "type": "Place", "properties": [{"key": "location", "value": "Kansas City"}]}, {"id": "Rev. Vernon Howard", "type": "Person", "properties": [{"key": "position", "value": "President of the Kansas City chapter of the Southern Christian Leadership Conference"}]}, {"id": "Jameson Wells", "type": "Person", "properties": [{"key": "position", "value": "Jobs With Justice organizer"}]}, {"id": "Centers For Medicare and Medicaid Services", "type": "Organizati

In [38]:
df_selected = df[:100, -1]
df_selected

full_text
str
"""It’s been a year since COVID-1…"
"""A day after Missouri Gov. Mike…"
"""Elderly woman sucker-punched t…"
""
"""Historically Speaking: Adventu…"
…
""
"""Aides to the Duke of York and …"
"""A Martin City man has been sen…"
"""John Stoehr, who has been one …"


## doing it for a set of documents

In [32]:
# Store all nodes and relationships across documents
all_nodes = []
all_relationships = []

# Loop over each article
for i, text in enumerate(initially_imported_df[:101, -1]):
    
    try:
        response = llm.invoke(prompt)
        result_json = response.content.strip().strip("```json").strip("```")
        result = json.loads(result_json)

        all_nodes.extend(result.get("nodes", []))
        all_relationships.extend(result.get("relationships", []))
    except Exception as e:
        print(f"[Warning] Skipped article {i} due to error: {e}")
        continue

# Optional: Deduplicate nodes and relationships
def dedup_list(dicts, key_fields):
    seen = set()
    unique = []
    for d in dicts:
        key = tuple(d.get(k) for k in key_fields)
        if key not in seen:
            seen.add(key)
            unique.append(d)
    return unique

all_nodes = dedup_list(all_nodes, key_fields=["id", "type"])
all_relationships = dedup_list(all_relationships, key_fields=["source", "target", "type"])

# Final merged knowledge graph
joint_knowledge_graph = {
    "nodes": all_nodes,
    "relationships": all_relationships
}

In [36]:
joint_knowledge_graph

{'nodes': [{'id': 'Mike Parson',
   'type': 'Person',
   'properties': [{'key': 'position', 'value': 'Governor of Missouri'},
    {'key': 'party', 'value': 'Republican'}]},
  {'id': 'Missouri', 'type': 'Place', 'properties': []},
  {'id': 'Medicaid Expansion', 'type': 'Event', 'properties': []},
  {'id': 'Kansas City', 'type': 'Place', 'properties': []},
  {'id': 'Health Care Advocacy Groups',
   'type': 'Organization',
   'properties': []},
  {'id': 'Medicaid', 'type': 'Event', 'properties': []},
  {'id': 'Trinity United Methodist Church',
   'type': 'Place',
   'properties': [{'key': 'location', 'value': 'Kansas City'}]},
  {'id': 'Rev. Vernon Howard',
   'type': 'Person',
   'properties': [{'key': 'position',
     'value': 'President of the Kansas City chapter of the Southern Christian Leadership Conference'}]},
  {'id': 'Jameson Wells',
   'type': 'Person',
   'properties': [{'key': 'position',
     'value': 'Jobs With Justice organizer'}]},
  {'id': 'Centers For Medicare and Medic


## Neo4j
username: neo4j \
password: 3qP0uol2yFB9pTMDocp450ROzPBbxX1nS5uKd7onLqc

In [23]:
## CONNECTION TO NEO4J

from langchain_community.graphs import Neo4jGraph


graph = Neo4jGraph(
    url="neo4j+s://d750d001.databases.neo4j.io",
    username="neo4j",
    password="3qP0uol2yFB9pTMDocp450ROzPBbxX1nS5uKd7onLqc",
    refresh_schema=False
)

  graph = Neo4jGraph(


In [40]:
import json

# CLEAN GRAPH IF NECESSARY
graph.query("MATCH (n) DETACH DELETE n")

# Strip out the Markdown syntax if necessary
#raw_json_str = joint_knowledge_graph.content.strip("```json\n").strip("```")
data = json.loads(joint_knowledge_graph)

nodes = data["nodes"]
relationships = data["relationships"]

for node in nodes:
    node_id = node["id"]
    label = node["type"]
    properties = {prop["key"]: prop["value"] for prop in node.get("properties", [])}
    # Ensure 'id' is part of the properties for matching later
    properties["id"] = node_id

    # Dynamically create Cypher query
    props_str = ", ".join(f"{key}: ${key}" for key in properties.keys())
    query = f"MERGE (n:{label} {{id: $id}}) SET n += {{{props_str}}}"

    graph.query(query, params=properties)

for rel in relationships:
    source_id = rel["source"]
    target_id = rel["target"]
    rel_type = rel["type"]
    properties = {prop["key"]: prop["value"] for prop in rel.get("properties", [])}

    # Build Cypher query for relationship
    props_str = ", ".join(f"{key}: ${key}" for key in properties.keys())
    if props_str:
        query = (
            f"""
            MATCH (a {{id: $source_id}}), (b {{id: $target_id}})
            MERGE (a)-[r:{rel_type}]->(b)
            SET r += {{{props_str}}}
            """
        )
    else:
        query = (
            f"""
            MATCH (a {{id: $source_id}}), (b {{id: $target_id}})
            MERGE (a)-[r:{rel_type}]->(b)
            """
        )

    params = {"source_id": source_id, "target_id": target_id, **properties}
    graph.query(query, params=params)


TypeError: the JSON object must be str, bytes or bytearray, not dict

# QUESTIONS THAT COME UP

1. how should we filter news that are "anecdotes"/isolated events unrelated to the country's general state (see example below)

In [18]:
# 1 - ANECDOTES
initially_imported_df[2,-2:]

title,full_text
str,str
"""Elderly woman sucker-punched t…","""Elderly woman sucker-punched t…"
