In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import openai

from utils import *

import KG_full as AKG
from dotenv import load_dotenv
import base64
import json
import requests

In [None]:
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Read all config from environment variables; the model names fall back to defaults.
COMPLETIONS_MODEL = os.getenv("OPENAI_API_MODEL", "gpt-4")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")

# The API key has no usable default. Fail here with a clear message instead of
# the opaque "str expected, not NoneType" TypeError that assigning None into
# os.environ would raise.
my_api_key = os.getenv("OPENAI_API_KEY")
if not my_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

openai.api_key = my_api_key
# Re-export so code that reads the key from the environment sees the same value.
os.environ['OPENAI_API_KEY'] = my_api_key
OPENAI_API_KEY = my_api_key

In [None]:
# Create the autoKG object without any corpus attached (texts/source/main_topic
# are all None); the saved graph is loaded into it in the next cell.
# NOTE(review): embedding=False presumably skips computing embeddings at
# construction time - confirm against KG_full.autoKG.
autokg_options = {
    "texts": None,
    "source": None,
    "embedding_model": EMBEDDING_MODEL,
    "llm_model": COMPLETIONS_MODEL,
    "openai_api_key": OPENAI_API_KEY,
    "main_topic": None,
    "embedding": False,
}
KG_class_chat = AKG.autoKG(**autokg_options)

In [None]:
# Load the saved knowledge graph (including its texts) from KG_data/ref_KG.npy.
kg_data_path = os.path.join('KG_data', 'ref_KG.npy')
KG_class_chat.load_data(kg_data_path, include_texts=True)

## Inspect keywords and visualize a larger graph
Use the cells below to print keywords and render a larger labeled graph snapshot.

In [None]:
# Preview the graph's keywords: total count, then at most the first 100 entries
# (change the 100 below to see more/less). Best-effort: any error is reported
# rather than raised.
try:
    keyword_list = getattr(KG_class_chat, 'keywords', []) or []
    total = len(keyword_list)
    print(f"Total keywords: {total}")
    shown = min(100, total)
    for position, keyword in enumerate(keyword_list[:shown], start=1):
        print(f"{position}. {keyword}")
    hidden = total - shown
    if hidden > 0:
        print(f"... (+{hidden} more)")
except Exception as err:
    print('Failed to print keywords:', err)

In [None]:
# Query the graph with a broad prompt, then render the resulting record as a
# labeled figure saved under KG_outputs/. Best-effort: failures are printed.
try:
    large_query = "overview of the graph with details"
    large_search = {
        "search_nums": (30, 15, 8, 6, 4),
        "search_mtd": 'pair_dist',
        "use_u": False,
    }
    record_large = KG_class_chat.KG_prompt(large_query, **large_search)

    # The output directory must exist before the figure is written.
    os.makedirs('KG_outputs', exist_ok=True)

    large_style = {
        "node_colors": ([0, 1, 1], [0, 1, 0.5], [1, 0.7, 0.75]),
        "node_shape": 'o',
        "edge_color": 'black',
        "edge_widths": (2, 0.5),
        "node_sizes": (700, 220, 80),
        "font_color": 'black',
        "font_size": 7,
        "show_text": True,
        "save_fig": True,
        "save_path": 'KG_outputs/Subgraph_vis_large.png',
    }
    KG_class_chat.draw_graph_from_record(record_large, **large_style)
    print("Saved larger labeled subgraph to KG_outputs/Subgraph_vis_large.png")
except Exception as err:
    print('Failed to draw larger graph:', err)

## Export to Neo4j (HTTPS default)
Exports use Neo4j's HTTPS Query API by default. Set these env vars:
- NEO4J_HTTP_HOST (e.g. 3dc7fb65.databases.neo4j.io)
- NEO4J_DATABASE (e.g. neo4j)
- NEO4J_USER, NEO4J_PASSWORD

### Check Neo4j connectivity
Run this once to confirm your credentials and URI are correct before exporting.

### HTTP Query API (Default path)
If bolt routing fails or is unavailable, use Neo4j Aura over HTTPS using Basic auth and the Query API endpoint. This notebook now uses this path by default.

In [None]:
# Export the KG over HTTPS in small batches
import math
assert hasattr(KG_class_chat, 'keywords'), 'Load KG first (run above cells).'

# 1) Upsert Keyword nodes
kw_batch = 200
for start in range(0, len(KG_class_chat.keywords), kw_batch):
    chunk = KG_class_chat.keywords[start:start+kw_batch]
    stmt = """UNWIND $names AS name MERGE (k:Keyword {name: name})"""
    payload = {"statement": stmt, "parameters": {"names": chunk}}
    r = requests.post(endpoint, headers=headers, data=json.dumps(payload))
    if r.status_code >= 300:
        print('Keyword batch failed:', r.status_code, r.text[:300])
        break
    else:
        print(f'Upserted keywords {start}..{start+len(chunk)-1}')

# 2) Upsert Text nodes (if present)
txts = getattr(KG_class_chat, 'texts', None)
srcs = getattr(KG_class_chat, 'sources', None)
if txts is not None:
    text_batch = 50  # keep payload small
    for i in range(0, len(txts), text_batch):
        names = list(range(i, min(i+text_batch, len(txts))))
        records = []
        for idx in names:
            rec = {"id": int(idx), "content": str(txts[idx])}
            if srcs is not None and idx < len(srcs):
                rec["source"] = str(srcs[idx])
            else:
                rec["source"] = None
            records.append(rec)
        stmt = """UNWIND $rows AS row MERGE (t:Text {id: row.id}) SET t.content = row.content, t.source = row.source"""
        payload = {"statement": stmt, "parameters": {"rows": records}}
        r = requests.post(endpoint, headers=headers, data=json.dumps(payload))
        if r.status_code >= 300:
            print('Text batch failed:', r.status_code, r.text[:300])
            break
        else:
            print(f'Upserted texts {i}..{i+len(records)-1}')

# 3) Create Keyword-Keyword edges from adjacency
A = getattr(KG_class_chat, 'A', None)
if A is not None:
    A_dense = np.array(A.todense()) if hasattr(A, 'todense') else np.array(A)
    n = A_dense.shape[0]
    edge_batch = 2000
    buffer = []
    for i in range(n):
        for j in range(i+1, n):
            w = float(A_dense[i, j])
            if w > 0:
                buffer.append({"ai": KG_class_chat.keywords[i], "bj": KG_class_chat.keywords[j], "w": w})
            if len(buffer) >= edge_batch:
                stmt = """UNWIND $edges AS e MATCH (a:Keyword {name: e.ai}), (b:Keyword {name: e.bj}) MERGE (a)-[r:RELATES_TO]->(b) SET r.weight = e.w"""
                payload = {"statement": stmt, "parameters": {"edges": buffer}}
                r = requests.post(endpoint, headers=headers, data=json.dumps(payload))
                print('Edge batch status:', r.status_code)
                if r.status_code >= 300:
                    print('Edge batch failed:', r.text[:300])
                    buffer = []
                    break
                buffer = []
    if buffer:
        stmt = """UNWIND $edges AS e MATCH (a:Keyword {name: e.ai}), (b:Keyword {name: e.bj}) MERGE (a)-[r:RELATES_TO]->(b) SET r.weight = e.w"""
        payload = {"statement": stmt, "parameters": {"edges": buffer}}
        r = requests.post(endpoint, headers=headers, data=json.dumps(payload))
        print('Final edge batch status:', r.status_code)
print('HTTPS export complete. In Neo4j Browser, run: MATCH (n) RETURN n LIMIT 50;')

In [None]:
# Build Basic auth header from env credentials
http_host = os.getenv('NEO4J_HTTP_HOST', '3dc7fb65.databases.neo4j.io')
http_db = os.getenv('NEO4J_DATABASE', 'neo4j')
http_user = os.getenv('NEO4J_USER', 'neo4j')
http_pass = os.getenv('NEO4J_PASSWORD', 'neo4j')
endpoint = f"https://{http_host}/db/{http_db}/query/v2"
creds = f"{http_user}:{http_pass}".encode('utf-8')
auth_header = "Basic " + base64.b64encode(creds).decode('utf-8')
headers = {
    "Authorization": auth_header,
    "Content-Type": "application/json"
}
print('HTTP endpoint:', endpoint)
print('Authorization:', auth_header[:20] + '...')

In [None]:
# OPTIONAL: Delete all existing nodes and relationships before export
# Set WIPE_NEO4J=true in your .env to enable this
if os.getenv('WIPE_NEO4J', 'false').lower() == 'true':
    wipe_stmt = "MATCH (n) DETACH DELETE n"
    payload = {"statement": wipe_stmt}
    r = requests.post(endpoint, headers=headers, data=json.dumps(payload))
    if r.status_code >= 300:
        print('Wipe failed:', r.status_code, r.text[:300])
    else:
        print('Neo4j database wiped clean.')

In [None]:
# Quick summary + mini-visualization of saved KG.
# Best-effort preview: any failure is printed rather than raised.
try:
    # Ensure data is loaded (Cell 4 does this already). The fallback reads the
    # same file as that cell so both paths show the same graph.
    if not hasattr(KG_class_chat, 'A') or KG_class_chat.A is None:
        KG_class_chat.load_data(os.path.join('KG_data', 'ref_KG.npy'), include_texts=True)

    # Basic summary
    num_texts = len(getattr(KG_class_chat, 'texts', []) or [])
    num_keywords = len(getattr(KG_class_chat, 'keywords', []) or [])
    A = getattr(KG_class_chat, 'A', None)
    connected = None
    if A is not None:
        # Accept both scipy sparse matrices and plain arrays, and densify once.
        A_dense = np.array(A.todense()) if hasattr(A, 'todense') else np.array(A)
        connected = A_dense > 0
    # Each undirected edge appears twice in the matrix, hence the halving.
    num_edges = int(np.sum(connected) // 2) if connected is not None else 0
    print(f"Texts: {num_texts}\nKeywords: {num_keywords}\nEdges: {num_edges}")

    # Degree histogram
    if connected is not None:
        deg_mat = np.sum(connected, axis=0)
        plt.figure(figsize=(6,3))
        plt.hist(deg_mat, bins=20)
        plt.title('Node degree distribution')
        plt.xlabel('degree')
        plt.ylabel('count')
        plt.show()

    # Mini subgraph draw from a query term (optional)
    test_query = "overview of the graph"
    record = KG_class_chat.KG_prompt(test_query,
                                     search_nums=(10, 5, 3, 3, 2),
                                     search_mtd='pair_dist',
                                     use_u=False)
    KG_class_chat.draw_graph_from_record(record,
                                         node_colors=([0, 1, 1], [0, 1, 0.5], [1, 0.7, 0.75]),
                                         node_shape='o',
                                         edge_color='black',
                                         edge_widths=(2, 0.5),
                                         node_sizes=(500, 150, 50),
                                         font_color='black',
                                         font_size=6,
                                         show_text=False,
                                         save_fig=False)
except Exception as e:
    print('Preview failed:', e)

In [None]:
# Ask the graph a question: first retrieve a record of relevant nodes for the
# query, then generate a completion from that record.
query = (
    "\n"
    "Please tell me how to use pre-trained language models to construct a knowledge graph."
    "\n"
)

retrieval_options = {
    "search_nums": (15, 7, 3, 4, 2),
    "search_mtd": 'pair_dist',
    "use_u": False,
}
record = KG_class_chat.KG_prompt(query, **retrieval_options)

completion_options = {
    "output_tokens": 1024,
    "prompt_language": 'English',
    "show_prompt": False,
    "prompt_keywords": False,
    "include_source": False,
}
completion_result = KG_class_chat.completion_from_record(record, **completion_options)
response, keywords_info, ref_info, all_tokens = completion_result

In [None]:
# Show the token count reported by the completion call, then the answer text,
# one per line.
print(all_tokens, response, sep="\n")


In [None]:
# Render the record retrieved for the query (no text labels) and save it.
# Create the output directory here rather than relying on an earlier cell:
# that cell only creates it inside a try block and may not have reached it.
os.makedirs('KG_outputs', exist_ok=True)
KG_class_chat.draw_graph_from_record(record,
                                     node_colors=([0, 1, 1], [0, 1, 0.5], [1, 0.7, 0.75]),
                                     node_shape='o',
                                     edge_color='black',
                                     edge_widths=(2, 0.5),
                                     node_sizes=(500, 150, 50),
                                     font_color='black',
                                     font_size=6,
                                     show_text=False,
                                     save_fig=True,
                                     save_path='KG_outputs/Subgraph_vis.png')


In [None]:
# Render the same record again, this time with text labels, and save it.
# Create the output directory here rather than relying on an earlier cell:
# that cell only creates it inside a try block and may not have reached it.
os.makedirs('KG_outputs', exist_ok=True)
KG_class_chat.draw_graph_from_record(record,
                                     node_colors=([0, 1, 1], [0, 1, 0.5], [1, 0.7, 0.75]),
                                     node_shape='o',
                                     edge_color='black',
                                     edge_widths=(2, 0.5),
                                     node_sizes=(500, 150, 50),
                                     font_color='black',
                                     font_size=6,
                                     show_text=True,
                                     save_fig=True,
                                     save_path='KG_outputs/Subgraph_vis_text.png')