## Run KGC with LLM + Ontology

In [1]:
import os
import re
import sys
import json
import torch
import spacy
from pathlib import Path
from openai import OpenAI
from fuzzywuzzy import fuzz
from datetime import datetime
# from dataloc import directory_path
from collections import defaultdict

# Make modules located in the current working directory importable.
sys.path.append(os.getcwd())

# Use the GPU for torch when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# spaCy pipeline, loaded with the "parser" and "ner" components disabled.
# One-time setup if the model is not installed yet:
#   pip install spacy
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# ChatGPT credentials: load_dotenv() runs first, then API_KEY is read from
# the process environment (None when the variable is not set).
from dotenv import load_dotenv
load_dotenv()
api_key = os.environ.get("API_KEY")


#### Run LLM function (chatgpt)

In [3]:

# models:
#   - gpt-3.5-turbo
#   - gpt-4
#   - gpt-4-turbo
#   - gpt-4o-mini
def query(prompt, model='gpt-4o-mini', temperature=0):
    """Send ``prompt`` to an OpenAI chat model as a single user message.

    A new ``OpenAI`` client is created from the module-level ``api_key`` on
    every call.

    Args:
        prompt: Text used as the ``content`` of the one user message.
        model: Name of the chat model passed to the API.
        temperature: Sampling temperature passed to the API.

    Returns:
        The object returned by ``client.chat.completions.create``, unmodified.
    """
    user_message = {"role": "user", "content": prompt}
    return OpenAI(api_key=api_key).chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[user_message],
    )


In [80]:
# Load the ontology (concepts and relations) that is injected into the LLM prompt.
# JSON is read explicitly as UTF-8 so the result does not depend on the OS locale.
with open('LLM_Ontologies/ontology/geo_ontology.json', 'r', encoding='utf-8') as file:
    ontology = json.load(file)

print(ontology)

# Comma-separated concept labels, e.g. "Rock, Mineral" (no trailing separator).
# Single quotes are used inside the f-strings so the cell also parses on
# Python versions older than 3.12, where reusing the outer quote is a SyntaxError.
ontology_entities = ", ".join(f"{concept['label']}" for concept in ontology["concepts"])
print(ontology_entities)

relations_list = []
relation_signatures = []

for relation in ontology["relations"]:
    # The (domain, range) pair is formatted as a tuple, so a signature looks
    # like "contains('Rock', 'Mineral'), " (tuple repr, quotes included).
    signature = f"{relation['label']}{(relation['domain'], relation['range'])}, "
    relation_signatures.append(signature)
    relations_list.append(relation["label"])
    print(signature)

# Each signature keeps its trailing ", ", so the joined string ends with one
# as well; this keeps the prompt text identical to the previous version.
ontology_relations = "".join(relation_signatures)

print(ontology_relations)

print("___________________________________________________________________________________________________________")

# Sample sentence used to preview the prompt.
text = "Mapping and geochronology by the Geological Society of Australia (Arriens, 1971) reveal that the granitic rocks in the western part of the Yalgoo 1:250,000 map sheet are in the order of 2,800 to 3,000 Ma."
def ontology_prompt(text, ontology_entities, ontology_relations, entities):
    """Build the triple-extraction prompt for one sentence.

    Args:
        text: Sentence to extract triples from ("Test Sentence" in the prompt).
        ontology_entities: Text inserted after "Ontology Concepts:".
        ontology_relations: Text inserted after "Ontology Relations:".
        entities: Text inserted under "Provided entities:".

    Returns:
        The prompt string: instructions, the ontology context, one worked
        example, the provided entities and the test sentence. It ends with
        "Test Output:" followed by a newline and four spaces.
    """
    prompt_lines = [
        "Given the following ontology and sentences, please extract the triples from the sentence according to the relations in the ontology. If the sentence suggests relations not in the ontology but fit the entity types, include them as well, you are not restricted to the provided relations. In the output, only include the triples in the given output format.",
        "",
        "context:",
        f"Ontology Concepts: {ontology_entities}",
        f"Ontology Relations: {ontology_relations}",
        "",
        "Example Sentence: The stratigraphy of the area includes the Triassic Sandstone, which is approximately 250 Ma old and contains quartz and feldspar.",
        "Example Output:",
        "hasAge(Sandstone (Rock), Triassic (GeologicalTimescale))",
        "hasAge(Sandstone (Rock), 250 Ma (GeologicalTimescale))",
        "contains(Sandstone (Rock), quartz (Mineral))",
        "contains(Sandstone (Rock), feldspar (Mineral))",
        "",
        "Now, using the following provided entities as a guide for which entities to find, extract triples for the sentence:",
        "Provided entities:",
        f"{entities}",
        "",
        f"Test Sentence: {text}",
        "Test Output:",
        # The final line of the prompt is four spaces with no newline after it.
        "    ",
    ]
    return "\n".join(prompt_lines)

# Preview the prompt for the sample sentence; the literal "yes" is passed in
# the position where the provided-entities text normally goes.
prompt = ontology_prompt(
    text=text,
    ontology_entities=ontology_entities,
    ontology_relations=ontology_relations,
    entities="yes",
)

print(prompt)

{'title': 'GeologicalSurveys Ontology', 'id': 'ont_geologicalsurveys', 'concepts': [{'qid': 'GeologicalTimescale', 'label': 'GeologicalTimescale'}, {'qid': 'Location', 'label': 'Location'}, {'qid': 'Mineral', 'label': 'Mineral'}, {'qid': 'OreDeposit', 'label': 'OreDeposit'}, {'qid': 'Rock', 'label': 'Rock'}, {'qid': 'Stratigraphy', 'label': 'Stratigraphy'}, {'qid': 'Date', 'label': 'Date'}], 'relations': [{'pid': 'contains', 'label': 'contains', 'domain': 'Rock', 'range': 'Mineral'}, {'pid': 'contains', 'label': 'contains', 'domain': 'Mineral', 'range': 'OreDeposit'}, {'pid': 'belongsTo', 'label': 'belongsTo', 'domain': 'Stratigraphy', 'range': 'Rock'}, {'pid': 'foundIn', 'label': 'foundIn', 'domain': 'Rock', 'range': 'Location'}, {'pid': 'foundIn', 'label': 'foundIn', 'domain': 'Mineral', 'range': 'Location'}, {'pid': 'foundIn', 'label': 'foundIn', 'domain': 'OreDeposit', 'range': 'Location'}, {'pid': 'foundAt', 'label': 'foundAt', 'domain': 'Rock', 'range': 'Date'}, {'pid': 'foundAt'

In [81]:
def get_entities(doc, iob_tags):
    """Group IOB-tagged tokens into ``(entity_text, entity_type)`` tuples.

    Tokens and tags are paired positionally with ``zip``, so any surplus
    tokens or tags are ignored. An entity starts at a ``B-<type>`` tag and is
    extended by following ``I-<type>`` tags of the same type; any other tag
    closes it. An ``I-`` tag that does not continue the open entity is dropped.

    The entity type is everything after the first hyphen, so hyphenated
    labels such as ``B-GEO-TIME`` are kept whole ("GEO-TIME") instead of
    being cut at the second hyphen.

    Args:
        doc: Iterable of tokens exposing a ``.text`` attribute.
        iob_tags: Iterable of tag strings, one per token.

    Returns:
        List of ``(text, type)`` tuples in order of appearance; the text is
        the entity's token texts joined with single spaces.
    """
    entities = []
    current_tokens = []
    current_type = None

    for token, tag in zip(doc, iob_tags):
        # "B-GEO-TIME" -> ("B", "-", "GEO-TIME"); a tag without "-" has sep == "".
        prefix, sep, label = tag.partition('-')

        if sep and prefix == 'B':
            if current_tokens:
                # Save the previous entity before starting a new one
                entities.append((" ".join(current_tokens), current_type))
            current_tokens = [token.text]
            current_type = label
        elif sep and prefix == 'I' and label == current_type:
            # Same type as the open entity: extend it
            current_tokens.append(token.text)
        elif current_tokens:
            # 'O' or an unrelated tag closes the open entity
            entities.append((" ".join(current_tokens), current_type))
            current_tokens = []
            current_type = None

    # Catch an entity that runs to the end of the sequence
    if current_tokens:
        entities.append((" ".join(current_tokens), current_type))

    return entities

In [82]:
# Tagged sentences consumed by the cells below.
# NOTE: "testing_tags.json" is a temporary input -- replace it later with the
# final tag files (an earlier, disabled variant read "../Results/all_tags.json").
all_tagged = json.loads(Path("../Results/testing_tags.json").read_text())

In [4]:
def extract_temporal_relations(all_tagged, target_file="a092458_e09_1213_2011_a_12624596.json"):
    """Ask the LLM for ontology triples for each tagged sentence of a report.

    For every sentence record ``all_tagged[file][sentence]`` the IOB tags in
    ``combine_tags[0]`` are grouped into entities with ``get_entities``. When
    at least one entity is found, a prompt is built with ``ontology_prompt``
    and sent through ``query``; the stripped reply text is stored under the
    record's ``"llm_results"`` key. Records without entities (including those
    whose tag string is ``'error'``) get ``"llm_results" = None``.

    Args:
        all_tagged: Mapping of file name -> mapping of sentence key -> record.
            Each record is read for ``"preprocess"`` (text given to spaCy) and
            ``"combine_tags"`` (first element: space-separated IOB tags).
        target_file: Only this file name is processed. Defaults to the single
            report the notebook was restricted to; pass ``None`` to process
            every file in ``all_tagged``.

    Returns:
        The same ``all_tagged`` object, updated in place.
    """
    for file_name, sentences in all_tagged.items():
        if target_file is not None and file_name != target_file:
            continue

        for sentence, record in sentences.items():
            print(sentence)
            record["llm_results"] = None

            iob_tags = record["combine_tags"][0].split(" ")
            # get_entities only opens an entity on a 'B-' tag, so without one
            # there is nothing to extract. This also covers the 'error' marker
            # and all-'O' sentences, and -- unlike counting distinct tags --
            # does not skip a sentence whose tags are all the same 'B-' tag.
            if not any(tag.startswith("B-") for tag in iob_tags):
                continue

            # spaCy is only run for sentences that can yield entities.
            doc = nlp(record["preprocess"])
            entities = get_entities(doc, iob_tags)
            print(entities)

            if entities:
                entity_text = "\n".join(f"- {name} ({label})" for name, label in entities)
                # NOTE(review): the prompt receives the sentence key, while the
                # entities come from the "preprocess" text -- confirm intended.
                prompt = ontology_prompt(sentence, ontology_entities, ontology_relations, entity_text)
                results = query(prompt).choices[0].message.content.strip()
                print(results)
                record["llm_results"] = results
            print()

    return all_tagged

# Perform relationship extraction
# all_tagged = extract_temporal_relations(all_tagged)

In [39]:
# Function to extract entities and relation from the input string and convert to knowledge graph triple
def parse_and_convert_to_kg_types(input_string):
    """Parse one ``relation(head (HeadType), tail (TailType))`` line.

    Surrounding whitespace (indentation, a trailing ``\\r``) is ignored so
    lines taken from a multi-line LLM reply still parse. Entity names must
    not contain parentheses; the relation and type names must be word
    characters only.

    Args:
        input_string: A single line of LLM output.

    Returns:
        ``(head_entity, head_type, relation, tail_entity, tail_type)`` when
        the line starts with a well-formed triple, otherwise ``None``
        (also for non-string input).
    """
    if not isinstance(input_string, str):
        return None

    # Groups: relation, head entity, head type, tail entity, tail type
    pattern = r'(\w+)\(([^()]+)\s\((\w+)\),\s([^()]+)\s\((\w+)\)\)'
    # re.match anchors at the start of the string, so leading whitespace has
    # to be removed first or an otherwise valid line is rejected.
    match = re.match(pattern, input_string.strip())
    if match is None:
        return None

    relation, head_entity, head_type, tail_entity, tail_type = match.groups()
    return (head_entity.strip(), head_type, relation, tail_entity.strip(), tail_type)
    
# Function to insert the triple into the knowledge graph
def add_to_knowledge_graph(kg, triple):
    """Record ``triple`` in the nested knowledge-graph mapping ``kg``.

    Args:
        kg: Mapping indexed as ``kg[(head, head_type)][relation]``, whose
            innermost value supports ``.add`` (e.g. a set).
        triple: ``(head_entity, head_type, relation, tail_entity, tail_type)``.
    """
    subject, subject_type, predicate, obj, obj_type = triple
    outgoing_edges = kg[(subject, subject_type)]
    outgoing_edges[predicate].add((obj, obj_type))


# Sample LLM output line used to try out the parser
input_string = "foundIn(Rainy RocksE2 (Rock), 18 April 2011 (Location))"

# Parse it into a (head, head_type, relation, tail, tail_type) tuple
kg_triple = parse_and_convert_to_kg_types(input_string)

# Show the triple as "(head (type), relation, tail (type))", or report a parse failure
if not kg_triple:
    print("Invalid format")
else:
    print("({} ({}), {}, {} ({}))".format(*kg_triple))

(Rainy RocksE2 (Rock), foundIn, 18 April 2011 (Location))


### Clean KG with NER Results
- If the two entities in a relation are not both tagged in NER, then the triple is removed

In [75]:
"a092458_e09_1213_2011_a_12624596.json"

# Load the NER tagged data
# with open("../Results/testing_tags.json", 'r') as file:
#     llm_results = json.load(file)

# The in-memory tagged data is reused, so llm_results and all_tagged are the
# same object.
llm_results = all_tagged

# Triples that survive the NER check, as
# (head_entity, head_type, relation, tail_entity, tail_type) tuples.
model_kg = []

for a in llm_results:
    # Only this one report is cleaned.
    if a != "a092458_e09_1213_2011_a_12624596.json":
        continue
    for b in llm_results[a]:
        tag_string = all_tagged[a][b]["combine_tags"][0]
        # .get(): a record that never went through the LLM step has no
        # "llm_results" key and is skipped instead of raising KeyError.
        raw_triples = llm_results[a][b].get("llm_results")

        # Decide whether the sentence is usable before running spaCy, so the
        # pipeline is not executed for sentences that are discarded anyway.
        if tag_string == 'error' or not raw_triples:
            continue

        iob_tags = tag_string.split(" ")
        doc = nlp(all_tagged[a][b]["preprocess"])
        entities = get_entities(doc, iob_tags)

        triples = raw_triples.split("\n")
        for triple in triples:
            # Strip indentation / stray "\r" so well-formed lines are not lost.
            kg_triple = parse_and_convert_to_kg_types(triple.strip())
            if not kg_triple:
                continue

            left_entity_type = []
            right_entity_type = []

            # An NER entity matches when either text contains the other.
            for entity in entities:
                if kg_triple[0] in entity[0] or entity[0] in kg_triple[0]:
                    left_entity_type.append(entity[1])
                if kg_triple[3] in entity[0] or entity[0] in kg_triple[3]:
                    right_entity_type.append(entity[1])

            # The head must map to exactly one NER type (no match or
            # conflicting types -> rejected).
            # NOTE(review): the tail only needs at least one match; its types
            # are not checked for consistency -- confirm this is intended.
            if len(set(left_entity_type)) != 1:
                left_entity_type = None
            if left_entity_type and right_entity_type:
                print(kg_triple)
                model_kg.append(kg_triple)

('Mango Bore Project', 'Location', 'foundIn', 'Gascoyne Junction', 'Location')
('Mango Bore Project', 'Location', 'foundIn', 'Gascoyne Complex', 'Stratigraphy')
('Gascoyne Junction', 'Location', 'LocatedIn', 'Gascoyne Complex', 'Location')
('Wabli Creek', 'Location', 'LocatedIn', 'Wabli Creek project E09/1178', 'Location')
('Wabli Creek', 'Location', 'hasCommonBoundaries', 'Wabli Creek project E09/1178', 'Location')
('Mango Bore', 'Location', 'foundIn', 'Wajarri-Yamatji Claim', 'Location')
('Wajarri-Yamatji Claim', 'Location', 'foundAt', '1st December 2005', 'Date')
('Mango Bore', 'Location', 'foundIn', 'Gascoyne Complex', 'Stratigraphy')
('Gascoyne Complex', 'Stratigraphy', 'foundIn', 'Capricorn', 'Location')
('Gascoyne Complex', 'Stratigraphy', 'foundAt', '2007', 'Date')
('Gascoyne Complex', 'Stratigraphy', 'limitedTo', 'Phanerozoic', 'Stratigraphy')
('Gascoyne Complex', 'Stratigraphy', 'foundIn', 'Carnarvon basin', 'Location')
('Phanerozoic', 'Stratigraphy', 'foundIn', 'Carnarvon ba

In [77]:
# Show the cleaned triples, then on the next line how many were kept.
print(model_kg, len(model_kg), sep="\n")

[('Mango Bore Project', 'Location', 'foundIn', 'Gascoyne Junction', 'Location'), ('Mango Bore Project', 'Location', 'foundIn', 'Gascoyne Complex', 'Stratigraphy'), ('Gascoyne Junction', 'Location', 'LocatedIn', 'Gascoyne Complex', 'Location'), ('Wabli Creek', 'Location', 'LocatedIn', 'Wabli Creek project E09/1178', 'Location'), ('Wabli Creek', 'Location', 'hasCommonBoundaries', 'Wabli Creek project E09/1178', 'Location'), ('Mango Bore', 'Location', 'foundIn', 'Wajarri-Yamatji Claim', 'Location'), ('Wajarri-Yamatji Claim', 'Location', 'foundAt', '1st December 2005', 'Date'), ('Mango Bore', 'Location', 'foundIn', 'Gascoyne Complex', 'Stratigraphy'), ('Gascoyne Complex', 'Stratigraphy', 'foundIn', 'Capricorn', 'Location'), ('Gascoyne Complex', 'Stratigraphy', 'foundAt', '2007', 'Date'), ('Gascoyne Complex', 'Stratigraphy', 'limitedTo', 'Phanerozoic', 'Stratigraphy'), ('Gascoyne Complex', 'Stratigraphy', 'foundIn', 'Carnarvon basin', 'Location'), ('Phanerozoic', 'Stratigraphy', 'foundIn', 