In [None]:
import urllib.parse        
import requests
from bs4 import BeautifulSoup
import re

import numpy as np
from tqdm import tqdm_notebook
from tqdm import tqdm

import json

from utils.structured_aligner import Aligner
from utils.openai_utils import LLMTripletExtractor
from utils.structured_inference_with_db import extract_triplets

import pandas as pd
import networkx as nx
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

from pymongo import MongoClient
import os
from dotenv import load_dotenv, find_dotenv

from utils.structured_aligner import Aligner

In [None]:
# --- Mongo Setup ---
# Load variables from the nearest .env file into the process environment,
# then connect with whatever MONGO_URI resolves to.
_ = load_dotenv(find_dotenv())
mongo_client = MongoClient(host=os.environ.get("MONGO_URI"))
db = mongo_client.get_database(name="wikidata_ontology")

# --- Extractor Setup ---
# Only the aligner is built here; it works on top of the ontology database.
aligner = Aligner(db)

In [None]:
def get_wiki_paragraphs_by_entity(entity_name, timeout=10):
    """Fetch the lead paragraphs of an English Wikipedia article.

    The lead is taken to be every ``<p>`` that precedes the first ``<h2>``
    found inside the article body. When the body has no ``<h2>`` at all,
    every paragraph of the body is returned.

    Args:
        entity_name: Raw (not percent-encoded) article title, e.g. "Steve Jobs".
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``(content, external_entities)`` tuples in document order.
        ``content`` is the stripped paragraph text and ``external_entities``
        holds the non-empty ``title`` attribute of each link in that
        paragraph. Paragraphs whose text is empty are skipped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the article body cannot be located in the page.
    """
    # Wikipedia uses underscores for spaces; quoting keeps characters such as
    # "?" or "#" from being read as a query string or a fragment.
    page_title = urllib.parse.quote(entity_name.strip().replace(" ", "_"))
    url = f"https://en.wikipedia.org/wiki/{page_title}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    text_div = soup.find("div", class_='mw-content-ltr mw-parser-output')
    if text_div is None:
        # The lookup above needs the class attribute to match exactly; fall
        # back to the single class in case the attribute is written differently.
        text_div = soup.find("div", class_='mw-parser-output')
    if text_div is None:
        raise ValueError(f"Could not find the article body for {entity_name!r}")

    first_heading = text_div.find("h2")
    if first_heading is None:
        # No section headings: treat the whole body as the lead.
        paragraphs = list(text_div.find_all('p'))
    else:
        # find_all_previous yields nearest-first, so flip to document order.
        paragraphs = list(reversed(first_heading.find_all_previous('p')))

    text_metadata = []
    for paragraph in paragraphs:
        content = paragraph.text.strip()
        if not content:
            continue
        external_entities = [
            link['title'] for link in paragraph.find_all("a") if link.get('title')
        ]
        text_metadata.append((content, external_entities))

    return text_metadata


In [None]:
# Quick check: display the lead paragraphs scraped for a single article.
entity_name = "Steve Jobs"
get_wiki_paragraphs_by_entity(entity_name)

In [None]:
texts_metadata = get_wiki_paragraphs_by_entity(entity_name)

# Flatten the per-paragraph link titles into one list (duplicates are kept).
wiki_entities = []
for text, metadata in texts_metadata:
    wiki_entities += metadata

wiki_entities

In [None]:
# Persist the collected link titles as a JSON array.
with open("wiki_entities.json", 'w') as f:
    f.write(json.dumps(wiki_entities))

## Inference

In [None]:
# Model name handed to the triplet extractor.
model_name = 'gpt-4.1'
extractor = LLMTripletExtractor(model=model_name)
# Builds a new Aligner on the `db` handle and binds it to `aligner`.
aligner = Aligner(db)

In [None]:
entity_name = "Steve Jobs"
jobs_paragraphs = get_wiki_paragraphs_by_entity(entity_name)

# Run the extraction pipeline on every lead paragraph and print the refined
# and filtered triplets (the initial ones are computed but not shown).
# NOTE(review): each `paragraph` is the (content, external_entities) tuple
# produced by get_wiki_paragraphs_by_entity, not a bare string — confirm that
# extract_triplets expects this shape.
for paragraph in jobs_paragraphs:
    initial_triplets, refined_triplets, filtered_triplets = extract_triplets(paragraph, sample_id='wiki-texts', aligner=aligner, extractor=extractor)
    print("Refined triplets: ", refined_triplets)
    print("Filtered triplets: ", filtered_triplets)

## Neo4j

In [None]:
from neo4j import GraphDatabase

# Connection settings come from the environment so that no credentials live
# in the notebook. With nothing set, this connects to the local instance
# without authentication.
uri = os.getenv("NEO4J_URI", "neo4j://localhost:7688")
neo4j_user = os.getenv("NEO4J_USERNAME")
neo4j_password = os.getenv("NEO4J_PASSWORD")
# Only send credentials when both parts are configured.
neo4j_auth = (neo4j_user, neo4j_password) if neo4j_user and neo4j_password else None
driver = GraphDatabase.driver(uri, auth=neo4j_auth)

In [None]:
def fetch_neo4j_triplets(limit=100):
    """Return up to ``limit`` (subject, predicate, object) triplets from Neo4j.

    Uses the module-level ``driver`` to open a session.

    Args:
        limit: Maximum number of relationships to fetch. Defaults to 100.

    Returns:
        A list of ``(subject, predicate, object)`` tuples, where subject and
        object are the ``name`` properties of the two nodes and predicate is
        the relationship type.
    """
    query = """
        MATCH (s)-[r]->(o)
        RETURN s.name AS subject, type(r) AS predicate, o.name AS object
        LIMIT $limit
        """
    with driver.session() as session:
        result = session.run(query, limit=int(limit))
        # Build the list while the session is still open.
        return [(r["subject"], r["predicate"], r["object"]) for r in result]

# Display the triplets returned by the query above.
fetch_neo4j_triplets()

In [None]:

def find_any_node(tx):
    """Return one node from the graph, or None when the query yields nothing.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        The first field of the single returned record, or None.
    """
    result = tx.run("MATCH (n) RETURN n LIMIT 1")
    # Check for a record first so an empty result maps to None.
    if not result.peek():
        return None
    return result.single()[0]

def find_some_nodes(tx, limit=5):
    """Return up to ``limit`` nodes from the graph.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        limit: Maximum number of nodes to return. Defaults to 5.

    Returns:
        A list with the ``n`` value of every returned record.

    Raises:
        ValueError: If ``limit`` cannot be converted to an integer.
    """
    # Pass the limit as a query parameter instead of formatting it into the
    # Cypher text, so the value can never alter the statement itself.
    result = tx.run("MATCH (n) RETURN n LIMIT $limit", limit=int(limit))
    return [record["n"] for record in result]

# Read one node and a small sample of nodes within a single session.
with driver.session() as session:
    # execute_read supersedes the deprecated read_transaction.
    any_node = session.execute_read(find_any_node)
    node_collection = session.execute_read(find_some_nodes, 10)

    print("Any single node:")
    print(any_node)

    print("\nCollection of nodes:")
    for node in node_collection:
        print(node)

# The driver is deliberately left open here: the cells below still open
# sessions on it, and it is closed once at the end of the Neo4j section.

In [None]:
# NOTE(review): find_some_nodes is also defined in an earlier cell; this
# definition replaces it when the cell runs.
def find_some_nodes(tx, limit=5):
    """Return up to ``limit`` nodes from the graph.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        limit: Maximum number of nodes to return. Defaults to 5.

    Returns:
        A list with the ``n`` value of every returned record.

    Raises:
        ValueError: If ``limit`` cannot be converted to an integer.
    """
    # Pass the limit as a query parameter instead of formatting it into the
    # Cypher text, so the value can never alter the statement itself.
    result = tx.run("MATCH (n) RETURN n LIMIT $limit", limit=int(limit))
    return [record["n"] for record in result]


In [None]:
def add_node(tx, name):
    """Create a ``Person`` node whose ``name`` property is ``name``.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        name: Value stored in the node's ``name`` property.
    """
    cypher = "CREATE (n:Person {name: $name})"
    tx.run(cypher, name=name)

with driver.session() as session:
    # execute_write supersedes the deprecated write_transaction.
    session.execute_write(add_node, "Alla")

In [None]:
def get_triples(tx):
    """Return every directed relationship in the graph as an (a, r, b) tuple.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        A list of ``(a, r, b)`` tuples taken from the ``a``, ``r`` and ``b``
        fields of each record: start node, relationship, end node.
    """
    result = tx.run("""
            MATCH (a)-[r]->(b)
            RETURN a, r, b
            """)
    # Build the list inside the transaction so the caller gets data back.
    return [(record["a"], record["r"], record["b"]) for record in result]
    

# Run get_triples in a read transaction and print what it returns.
with driver.session() as session:
    print(session.execute_read(get_triples))

In [None]:
def add_relation(tx, head, tail, relation):
    """Create a ``relation``-typed relationship from node ``head`` to node ``tail``.

    Both nodes are looked up by their ``name`` property. If either is
    missing, the MATCH yields no rows and nothing is created.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        head: ``name`` of the start node.
        tail: ``name`` of the end node.
        relation: Relationship type to create.

    Raises:
        ValueError: If ``relation`` is empty.
    """
    if not relation:
        raise ValueError("relation must be a non-empty string")

    # The relationship type is embedded in the query text rather than sent as
    # a `$param`, so backtick-quote it (doubling any embedded backticks) to
    # keep an arbitrary string from breaking out of the type position.
    escaped_relation = str(relation).replace("`", "``")
    query = f"""
        MATCH (a {{name: $head}}), (b {{name: $tail}})
        CREATE (a)-[r:`{escaped_relation}`]->(b)
        RETURN type(r)
        """
    tx.run(query, head=head, tail=tail)

# Both nodes must already exist: add_relation only MATCHes them by name, so a
# missing node means no relationship is created.
with driver.session() as session:
    # execute_write supersedes the deprecated write_transaction.
    session.execute_write(add_relation, "DevOps", "Practice", "is")


In [None]:
def add_node(tx, node_name):
    """Create a ``Node``-labelled node whose ``name`` property is ``node_name``.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        node_name: Value stored in the node's ``name`` property.
    """
    cypher = "CREATE (n:Node {name: $node_name})"
    tx.run(cypher, node_name=node_name)

def add_relation(tx, head, tail, relation):
    """Create a ``relation``-typed relationship from node ``head`` to node ``tail``.

    Both nodes are looked up by their ``name`` property. If either is
    missing, the MATCH yields no rows and nothing is created.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        head: ``name`` of the start node.
        tail: ``name`` of the end node.
        relation: Relationship type to create.

    Raises:
        ValueError: If ``relation`` is empty.
    """
    if not relation:
        raise ValueError("relation must be a non-empty string")

    # The relationship type is embedded in the query text rather than sent as
    # a `$param`, so backtick-quote it (doubling any embedded backticks) to
    # keep an arbitrary string from breaking out of the type position.
    escaped_relation = str(relation).replace("`", "``")
    query = f"""
        MATCH (a {{name: $head}}), (b {{name: $tail}})
        CREATE (a)-[r:`{escaped_relation}`]->(b)
        RETURN type(r)
        """
    tx.run(query, head=head, tail=tail)

def get_node(tx, name):
    """Look up ``Node``-labelled nodes by their ``name`` property.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        name: Value to match against the ``name`` property.

    Returns:
        A list with the ``name`` field of every matching record; empty when
        nothing matches.
    """
    cypher = "MATCH (n:Node {name: $name}) RETURN n.name AS name"
    matches = []
    for record in tx.run(cypher, name=name):
        matches.append(record["name"])
    return matches


# for i, row in full_df.iterrows():
#     head = row['subject']
#     tail = row['object']
#     relation = "_".join(row['relation'].replace("-", "").replace(".", "").split())
#     # print(head, tail, relation)
#     with driver.session() as session:
#         if not session.read_transaction(get_node, head):
#             session.write_transaction(add_node, head)

#         if not session.read_transaction(get_node, tail):
#             session.write_transaction(add_node, tail)
            
#         session.write_transaction(add_relation, head, tail, relation)


In [None]:
def get_person(tx, name1="Steven Paul Jobs", name2="Next"):
    """Return the ``name`` of every RELATION edge from ``name1`` to ``name2``.

    Despite its name, this returns relationship names, not a person.

    NOTE(review): the query matches relationships of type ``RELATION`` and
    reads their ``name`` property, whereas add_relation stores the label as
    the relationship type and sets no ``name`` — confirm which schema the
    graph actually uses.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        name1: ``name`` of the subject node. Defaults to "Steven Paul Jobs".
        name2: ``name`` of the object node. Defaults to "Next".

    Returns:
        A list with the ``relation`` field of every returned record.
    """
    result = tx.run(
        "MATCH (subject:Node {name: $name1}) -[r:RELATION]-> (object:Node {name: $name2}) RETURN r.name as relation",
        name1=name1,
        name2=name2,
    )
    return [rel["relation"] for rel in result]

with driver.session() as session:
    # execute_read supersedes the deprecated read_transaction.
    names = session.execute_read(get_person)

names

In [None]:
def delete_all(tx):
    """Delete every node in the graph together with its relationships.

    NOTE(review): the result object is handed back out of the transaction
    callback — confirm it is still usable by the caller at that point.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        Whatever ``tx.run`` returns for the delete statement.
    """
    return tx.run("MATCH (n) OPTIONAL MATCH (n)-[r]-() DELETE r, n")

# Destructive: removes every node and relationship from the database.
with driver.session() as session:
    # execute_write supersedes the deprecated write_transaction.
    # NOTE(review): the return value is still bound to `names`, replacing
    # whatever that name held before; kept so the notebook state is unchanged.
    names = session.execute_write(delete_all)
    

In [None]:
# Close the Neo4j driver now that the graph section is finished.
driver.close()

## Post-analysis

In [None]:
import pandas as pd
from collections import defaultdict

# Load the triplets table, using the first CSV column as the index.
# The cells below read its 'subject', 'relation' and 'object' columns.
df = pd.read_csv('full_triplets.csv', index_col=0)
df

In [None]:
# Keep only the rows whose relation is exactly 'instance of'.
df_instance_of = df[df['relation'] == 'instance of']

In [None]:
# Example: every relation other than 'instance of' whose subject is 'Apple II'.
df[(df['subject'] == 'Apple II') & (df['relation'] != 'instance of')].relation.to_list()

In [None]:
# For every relation, collect the 'instance of' classes seen on its subject
# side (relation2head) and on its object side (relation2tail).
relation2head = defaultdict(set)
relation2tail = defaultdict(set)

# Filter and group once up front instead of re-scanning the whole frame twice
# for every 'instance of' row.
df_other = df[df['relation'] != 'instance of']
relations_by_subject = df_other.groupby('subject', sort=False)['relation'].agg(list).to_dict()
relations_by_object = df_other.groupby('object', sort=False)['relation'].agg(list).to_dict()

# Each 'instance of' row maps an entity (subject) to its class (object).
for entity, entity_class in zip(df_instance_of['subject'], df_instance_of['object']):
    for rel in relations_by_subject.get(entity, ()):
        relation2head[rel].add(entity_class)

    for rel in relations_by_object.get(entity, ()):
        relation2tail[rel].add(entity_class)
        

In [None]:
# Classes observed on the object side of each relation.
relation2tail

In [None]:
# Classes observed on the subject side of each relation.
relation2head

In [None]:
# Display the 'instance of' rows that the two mappings were built from.
df_instance_of