# Intro

This notebook shows how to use HeXtractor to turn text data into a heterogeneous graph using LLM structured output, produced by `LLMGraphTransformer` from LangChain.

# Lib imports

In [22]:
import autoroot #noqa
from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from collections import defaultdict

from langchain_openai import ChatOpenAI

import hextractor.integrations.langchain_graphdoc as langchain2graph
import hextractor.structures as structures
import hextractor.visualization as viz

# LLM config

You can configure any LangChain-compatible LLM here. It must be a model that can be used with the `LLMGraphTransformer`.

Example config below:

```python
base_url = "YOUR_BASE_URL"
api_key = "YOUR_API_KEY"

llm = ChatOpenAI(base_url=base_url, api_key=api_key)
llm_graph_transformer = LLMGraphTransformer(llm=llm)
```


Example usage:

```python
text = """Filip Wójcik and Marcin Malczewski are data scientists, who developed HeXtractor. It is a library
that helps in extracting heterogeneous knowledge graphs from various data source.
Heterogeneous knowledge graphs are graphs that contain different types of nodes and edges."""

docs = [Document(page_content=text)]

data = await llm_graph_transformer.aconvert_to_graph_documents(docs)
```

# GraphDocument extraction

Below we manually construct a `GraphDocument` for the sake of the example. To use real LLM output instead, set `use_real_llm = True` in the cell below and fill in your endpoint and API key.

In [23]:
# Sample passage that serves as the input text for graph extraction.
text = """Filip Wójcik and Marcin Malczewski are data scientists, who developed HeXtractor. It is a library
that helps in extracting heterogeneous knowledge graphs from various data source.
Heterogeneous knowledge graphs are graphs that contain different types of nodes and edges."""

# Wrap the passage in a one-element list of Documents; `docs` is what the graph
# transformer receives, and docs[0] is used as the GraphDocument source below.
docs = [Document(page_content=text)]

In [24]:
use_real_llm = False
if use_real_llm:
    base_url = "YOUR_BASE_URL"
    api_key = "YOUR_API_KEY"

    llm = ChatOpenAI(base_url=base_url, api_key=api_key)
    llm_graph_transformer = LLMGraphTransformer(llm=llm)

    data = await llm_graph_transformer.aconvert_to_graph_documents(docs)
else:
    fw_node = Node(type="Person", id="Filip Wójcik")
    mm_node = Node(type="Person", id="Marcin Malczewski")
    hx_node = Node(type="Library", id="HeXtractor")
    kg_node = Node(type="Graph", id="Heterogeneous knowledge graph")

    fw_developed_hx = Relationship(source=fw_node, target=hx_node, type="Developed")
    mm_developer_hx = Relationship(source=mm_node, target=hx_node, type="Developed")
    hx_extracts_kg = Relationship(source=hx_node, target=kg_node, type="Extracts")

    data = [
        GraphDocument(
            nodes=[fw_node, mm_node, hx_node, kg_node], 
            relationships=[fw_developed_hx, mm_developer_hx, hx_extracts_kg],
            source=docs[0]
            )
        ]


In [25]:
# Take the first entry of `data` as the graph document to inspect.
graph_doc = data[0]
# Print each node of the graph document on its own line.
for node in graph_doc.nodes:
    print(node)

id='Filip Wójcik' type='Person' properties={}
id='Marcin Malczewski' type='Person' properties={}
id='HeXtractor' type='Library' properties={}
id='Heterogeneous knowledge graph' type='Graph' properties={}


In [26]:
# Print each relationship of the graph document on its own line.
for rel in graph_doc.relationships:
    print(rel)

source=Node(id='Filip Wójcik', type='Person', properties={}) target=Node(id='HeXtractor', type='Library', properties={}) type='Developed' properties={}
source=Node(id='Marcin Malczewski', type='Person', properties={}) target=Node(id='HeXtractor', type='Library', properties={}) type='Developed' properties={}
source=Node(id='HeXtractor', type='Library', properties={}) target=Node(id='Heterogeneous knowledge graph', type='Graph', properties={}) type='Extracts' properties={}


# Automatic hetero data extraction

In [27]:
# Convert the LangChain GraphDocument into heterogeneous graph data.
# The call returns a pair: the hetero data object and a node mapping whose keys
# unpack as (node type, node name) pairs in the visualization section; the values
# are presumably per-type node indices (TODO confirm against the converter).
hetero_data, node_mapping = langchain2graph.convert_graph_document_to_hetero_data(graph_doc)
# Bare last expression so the notebook displays the resulting structure.
hetero_data

HeteroData(
  Person={ x=[2, 1] },
  Library={ x=[1, 1] },
  Graph={ x=[1, 1] },
  (Library, Extracts, Graph)={ edge_index=[2, 1] },
  (Person, Developed, Library)={ edge_index=[2, 2] }
)

## Visualization

In [28]:
# Invert the converter's node mapping into {node type: {node id: node name}};
# the result is passed to the visualization builder below.
# The loop variables deliberately avoid the names `type` and `id`: this loop runs
# at notebook top level, so those names would overwrite the Python builtins for
# every cell executed afterwards.
mapped_node_names = defaultdict(dict)
for (node_type, node_name), node_id in node_mapping.items():
    mapped_node_names[node_type][node_id] = node_name

In [29]:
# Visualization settings: one colour per node type and one per
# (source type, relation, target type) edge type.
simple_cfg = {
    "node_types_to_colors": {
        "Person": "red",
        "Library": "green",
        "Graph": "blue",
    },
    "edge_type_to_colors": {
        ("Library", "Extracts", "Graph"): "black",
        ("Person", "Developed", "Library"): "purple",
    },
    # Per node type: attribute name and index that presumably select the node
    # label source (TODO confirm against VisualizationConfig).
    "node_type_label_attr_name": {"Person": "x", "Library": "x", "Graph": "x"},
    "node_type_label_attr_idx": {"Person": 0, "Library": 0, "Graph": 0},
    "notebook_visualization": True,
}

simple_viz_cfg = structures.VisualizationConfig(**simple_cfg)
simple_net_viz = viz.VisualizationBuilder.build_visualization(
    simple_viz_cfg, hetero_data, mapped_node_names
)
simple_net_viz.show(name="llm_graph.html")

llm_graph.html


# Example of invalid graph data from an LLM

In [30]:
# The three people mentioned in the sentence below.
node_michael = Node(id="Michael Scott", type="Person")
node_pam = Node(id="Pam", type="Person")
node_elon_musk = Node(id="Elon Musk", type="Person")
doc = Document(page_content="Michael Scott knows Pam and Elon Musk.")

# `node_elon_musk` is the target of a relationship but is left out of `nodes`;
# that mismatch is what makes this graph document invalid.
invalid_gd = GraphDocument(
    source=doc,
    nodes=[node_michael, node_pam],
    relationships=[
        Relationship(type="knows", source=node_michael, target=node_pam),
        Relationship(type="knows", source=node_michael, target=node_elon_musk),
    ],
)

In [32]:
# Converting the invalid document is expected to fail: the converter raises a
# ValueError because a relationship targets a node that is missing from `nodes`.
try:
    # The return value is not needed here; only the raised error matters.
    langchain2graph.convert_graph_document_to_hetero_data(invalid_gd)
except ValueError as e:
    # The exception is the demonstration, so show its message instead of a traceback.
    print(e)
else:
    # Make an unexpected success visible instead of finishing silently.
    print("Conversion unexpectedly succeeded - no validation error was raised")

Unknown target node: Elon Musk of type Person
