### Text2KG with ontologies + LLM

In [1]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Key handed to the OpenAI client inside `query`; os.getenv returns None when
# API_KEY is not set.
api_key = os.getenv("API_KEY")

In [2]:
import os
import sys
import json

# Make modules located in the current working directory importable.
sys.path.append(os.getcwd())
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced in the cells visible here — confirm it is still needed.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# from huggingface_hub import notebook_login
# notebook_login()

In [4]:
from openai import OpenAI

# Initialize OpenAI API

def query(prompt, model='gpt-4o-mini', temperature=0):
    """Send ``prompt`` as a single user message to the chat-completions API.

    Args:
        prompt: Text used as the ``content`` of the one user message.
        model: Model name forwarded to the API call.
        temperature: Sampling temperature forwarded to the API call.

    Returns:
        The object returned by ``client.chat.completions.create``, unmodified.
        The same object is also printed, prefixed with ``'output is: '``.
    """
    # A fresh client is created on every call, using the module-level `api_key`.
    request = {
        "model": model,
        "temperature": temperature,
        "messages": [{"role": "user", "content": prompt}],
    }
    response = OpenAI(api_key=api_key).chat.completions.create(**request)

    print('output is: ', response)
    return response

# # Use your custom query function
# completed_prompt = query(prompt)
# print(completed_prompt)

In [5]:
import json

# Load the ontology definition (concepts and relations) from JSON.
with open('ontology/geo_ontology.json', 'r') as file:
    ontology = json.load(file)

print(ontology)

# Comma-separated concept labels, e.g. "Rock, Mineral".
# str.join is used instead of "append then slice off the last two characters".
ontology_entities = ", ".join(concept["label"] for concept in ontology["concepts"])
print(ontology_entities)

# Labels of every relation; used later to check whether an LLM reply mentions
# at least one ontology relation.
relations_list = [relation["label"] for relation in ontology["relations"]]

# Each relation is rendered as its label followed by the repr of the
# (domain, range) tuple, e.g. "contains('Rock', 'Mineral')".
# Single quotes are used inside the f-string because reusing the enclosing
# double quote is only valid syntax from Python 3.12 onwards.
# Joining also drops the dangling ", " the previous loop left at the end,
# which makes this string consistent with `ontology_entities`.
ontology_relations = ", ".join(
    f"{relation['label']}{(relation['domain'], relation['range'])}"
    for relation in ontology["relations"]
)

print(ontology_relations)

print("___________________________________________________________________________________________________________")

text = "Mapping and geochronology by the Geological Society of Australia (Arriens, 1971) reveal that the granitic rocks in the western part of the Yalgoo 1:250,000 map sheet are in the order of 2,800 to 3,000 Ma."
def ontology_prompt(text, ontology_entities, ontology_relations):
    """Build the ontology-constrained triple-extraction prompt for ``text``.

    Args:
        text: Sentence (or passage) inserted after "Test Sentence:".
        ontology_entities: String inserted after "Ontology Concepts:".
        ontology_relations: String inserted after "Ontology Relations:".

    Returns:
        The full prompt: instruction, ontology context, one worked example,
        then the test sentence followed by an open "Test Output:" section.
    """
    # Plain template filled with str.format; the inserted values are not
    # re-interpreted, so braces inside them are kept literally.
    template = """Given the following ontology and sentences, please extract the triples from the sentence according to the relations in the ontology. In the output, only include the triples in the given output format.

context:
Ontology Concepts: {concepts}
Ontology Relations: {relations}

Example Sentence: The stratigraphy of the area includes the Triassic Sandstone, which is approximately 250 Ma old and contains quartz and feldspar.
Example Output:
belongsTo(Triassic (Stratigraphy), Sandstone (Rock))
contains(Sandstone (Rock), quartz (Mineral))
contains(Sandstone (Rock), feldspar (Mineral))

Test Sentence: {sentence}
Test Output:
    """

    return template.format(
        concepts=ontology_entities,
        relations=ontology_relations,
        sentence=text,
    )

prompt = ontology_prompt(text, ontology_entities, ontology_relations)

print(prompt)

{'title': 'GeologicalSurveys Ontology', 'id': 'ont_geologicalsurveys', 'concepts': [{'qid': 'GeologicalTimescale', 'label': 'GeologicalTimescale'}, {'qid': 'Location', 'label': 'Location'}, {'qid': 'Mineral', 'label': 'Mineral'}, {'qid': 'OreDeposit', 'label': 'OreDeposit'}, {'qid': 'Rock', 'label': 'Rock'}, {'qid': 'Stratigraphy', 'label': 'Stratigraphy'}, {'qid': 'Date', 'label': 'Date'}], 'relations': [{'pid': 'contains', 'label': 'contains', 'domain': 'Rock', 'range': 'Mineral'}, {'pid': 'contains', 'label': 'contains', 'domain': 'Mineral', 'range': 'OreDeposit'}, {'pid': 'belongsTo', 'label': 'belongsTo', 'domain': 'Stratigraphy', 'range': 'Rock'}, {'pid': 'foundIn', 'label': 'foundIn', 'domain': 'Rock', 'range': 'Location'}, {'pid': 'foundIn', 'label': 'foundIn', 'domain': 'Mineral', 'range': 'Location'}, {'pid': 'foundAt', 'label': 'foundAt', 'domain': 'Rock', 'range': 'Date'}, {'pid': 'foundAt', 'label': 'foundAt', 'domain': 'Mineral', 'range': 'Date'}, {'pid': 'foundAt', 'labe

In [6]:
# Read the tagged sentences from disk into `all_tagged`.
with open('../Results/all_tags.json', 'r') as tags_file:
    all_tagged = json.loads(tags_file.read())

In [7]:
print(all_tagged["a075237_ep_a_all_2007_10582280.json"])

{'Managed By: GME Resources Ltd Level 2 907 Canning Highway Mt.': {'preprocess': 'Managed By : GME Resources Ltd Level 2 907 Canning Highway Mt .', 'time_tagged': 'Managed By : GME Resources Ltd Level 2 907 Canning Highway Mt .', 'ner_tagged': ['O O O O O O O O O B-LOCATION I-LOCATION I-LOCATION O O'], 'combine_tags': ['error']}, 'Pleasant WA 6153 Distribution: Department of Industry and Resources - Perth GME Resources Limited NiWest June 2007 1': {'preprocess': 'Pleasant WA 6153 Distribution : Department of Industry and Resources - Perth GME Resources Limited NiWest June 2007 1', 'time_tagged': 'Pleasant WA 6153 Distribution : Department of Industry and Resources - Perth GME Resources Limited NiWest <TIMEX3 tid="t1" type="DATE" value="">June 2007</TIMEX3> 1', 'ner_tagged': ['B-LOCATION I-LOCATION O O O O O O O O O B-LOCATION O O O O O O O O O'], 'combine_tags': ['error']}, 'The Eucalyptus Bore Project is located 85km ENE of Kookynie and 45km southeast of the Murrin Murrin mine site.':

In [8]:
# Read the previously stored LLM results from disk into `text2kg`.
with open('LLM_Results.json', 'r') as results_file:
    text2kg = json.loads(results_file.read())

def get_triples(all_tagged, text2kg, relations_list):
    """Query the LLM for every tagged sentence and record the reply in ``text2kg``.

    For each document key in ``all_tagged`` and each sentence key under it, an
    ontology prompt is built (from the module-level ``ontology_entities`` and
    ``ontology_relations``), sent through ``query``, and the reply text is
    stored at ``text2kg[document][sentence]["LLM_Results"]``. A reply that
    mentions none of the labels in ``relations_list`` is stored as the string
    ``"None"`` instead.

    Args:
        all_tagged: Mapping of document key -> iterable of sentence keys.
        text2kg: Mapping of document key -> sentence -> result dict. Missing
            document or sentence entries are created on demand.
        relations_list: Relation labels searched for (as substrings) in each reply.

    Returns:
        The same ``text2kg`` object, updated in place.
    """
    for doc_name in all_tagged:
        for sentence in all_tagged[doc_name]:
            prompt = ontology_prompt(sentence, ontology_entities, ontology_relations)
            # The API may return no text content; normalise that to an empty
            # string so the substring checks below cannot raise TypeError.
            results = query(prompt).choices[0].message.content or ""
            print(results)

            mentions_relation = any(label in results for label in relations_list)

            # setdefault avoids a KeyError for documents/sentences that are not
            # present in the stored results yet.
            record = text2kg.setdefault(doc_name, {}).setdefault(sentence, {})
            record["LLM_Results"] = results if mentions_relation else "None"

    return text2kg

# text2kg = get_triples(all_tagged, text2kg, relations_list)

In [9]:
# Show every sentence whose stored LLM reply contains at least one triple.
for doc_sentences in text2kg.values():
    for sentence, record in doc_sentences.items():
        # Entries without a stored reply are skipped explicitly; the previous
        # bare `except: pass` also hid unrelated errors and KeyboardInterrupt.
        llm_results = record.get("LLM_Results") if isinstance(record, dict) else None
        if not isinstance(llm_results, str) or "None" in llm_results:
            continue

        print(sentence)
        print(llm_results)
        print("___________________________________________________________________________________________________________")

The Eucalyptus Bore Project is located 85km ENE of Kookynie and 45km southeast of the Murrin Murrin mine site.
foundIn(Eucalyptus Bore Project (Location), Kookynie (Location))
foundIn(Eucalyptus Bore Project (Location), Murrin Murrin mine site (Location))
___________________________________________________________________________________________________________
Access is via the Kookynie - Mt.
foundIn(Kookynie (Location), Mt. (Location))
___________________________________________________________________________________________________________
Several relatively steep N-S striking ridges of weathered ultramafic rocks occasionally capped by a yellowish brown silcrete and deep red ferruginous laterite and small incised watercourses has made access and drilling in certain areas difficult.
contains(ultramafic (Rock), silcrete (Mineral))
contains(ultramafic (Rock), laterite (Mineral))
___________________________________________________________________________________________________________

In [10]:
# Save json
# NOTE(review): left commented out, presumably so that re-running this cell
# does not overwrite the stored results — confirm before enabling.
# with open('LLM_Results.json', 'w') as file:
#     json.dump(text2kg, file)

# Load json
# Replaces the in-memory `text2kg` with the contents of LLM_Results.json.
with open('LLM_Results.json', 'r') as file:
    text2kg = json.load(file)

In [11]:
# Show every sentence whose stored LLM reply contains at least one triple.
for doc_sentences in text2kg.values():
    for sentence, record in doc_sentences.items():
        # Entries without a stored reply are skipped explicitly; the previous
        # bare `except: pass` also hid unrelated errors and KeyboardInterrupt.
        llm_results = record.get("LLM_Results") if isinstance(record, dict) else None
        if not isinstance(llm_results, str) or "None" in llm_results:
            continue

        print(sentence)
        print(llm_results)
        print("___________________________________________________________________________________________________________")

The Eucalyptus Bore Project is located 85km ENE of Kookynie and 45km southeast of the Murrin Murrin mine site.
foundIn(Eucalyptus Bore Project (Location), Kookynie (Location))
foundIn(Eucalyptus Bore Project (Location), Murrin Murrin mine site (Location))
___________________________________________________________________________________________________________
Access is via the Kookynie - Mt.
foundIn(Kookynie (Location), Mt. (Location))
___________________________________________________________________________________________________________
Several relatively steep N-S striking ridges of weathered ultramafic rocks occasionally capped by a yellowish brown silcrete and deep red ferruginous laterite and small incised watercourses has made access and drilling in certain areas difficult.
contains(ultramafic (Rock), silcrete (Mineral))
contains(ultramafic (Rock), laterite (Mineral))
___________________________________________________________________________________________________________

### Convert to KG

In [29]:
import re

# Matches one triple line of the form
#   relation(head entity (HeadType), tail entity (TailType))
# Compiled once at definition time. Whitespace around the separators is
# optional so that minor formatting variations in LLM output still parse.
_TRIPLE_PATTERN = re.compile(
    r'(\w+)\(\s*([^()]+?)\s*\((\w+)\)\s*,\s*([^()]+?)\s*\((\w+)\)\s*\)'
)


# Function to extract entities and relation from the input string and convert to knowledge graph triple
def parse_and_convert_to_kg(input_string):
    """Parse one ``relation(head (Type), tail (Type))`` line into a 5-tuple.

    Args:
        input_string: A single line of LLM output. Leading and trailing
            whitespace is ignored. The triple must start the (stripped) line;
            any text after the closing parenthesis is ignored.

    Returns:
        ``(head_entity, head_type, relation, tail_entity, tail_type)`` when the
        line matches, otherwise ``None``. ``None`` is also returned for input
        that is not a string. Entity names are stripped of surrounding
        whitespace; entity names containing parentheses are not supported.
    """
    if not isinstance(input_string, str):
        return None

    # Strip first: lines split out of a multi-line reply often carry
    # indentation or trailing spaces, and the pattern is anchored at the start.
    match = _TRIPLE_PATTERN.match(input_string.strip())
    if match is None:
        return None

    relation, head_entity, head_type, tail_entity, tail_type = match.groups()

    # Return the extracted data as a triple
    return (head_entity.strip(), head_type, relation, tail_entity.strip(), tail_type)
    
# Function to insert the triple into the knowledge graph
def add_to_knowledge_graph(kg, triple):
    """Record one parsed triple as an edge in ``kg``.

    Args:
        kg: Nested mapping indexed as ``kg[(head, head_type)][relation]``,
            where the innermost value supports ``.add`` (a set in this notebook).
        triple: ``(head_entity, head_type, relation, tail_entity, tail_type)``.

    Returns:
        None; ``kg`` is updated in place.
    """
    head_entity, head_type, relation, tail_entity, tail_type = triple

    head_node = (head_entity, head_type)
    tail_node = (tail_entity, tail_type)

    # Look up the head node's outgoing edges, then add the tail under the relation.
    outgoing = kg[head_node]
    outgoing[relation].add(tail_node)


# Sample line in the LLM output format, used to exercise the parser
input_string = "foundIn(Rainy RocksE2 (Rock), 18 April 2011 (Location))"

# Parse it into (head, head_type, relation, tail, tail_type)
kg_triple = parse_and_convert_to_kg(input_string)

# Report the parsed triple, or that the line did not match
if not kg_triple:
    print("Invalid format")
else:
    print("({} ({}), {}, {} ({}))".format(*kg_triple))

(Rainy RocksE2 (Rock), foundIn, 18 April 2011 (Location))


In [30]:
from collections import defaultdict

# Initialize the knowledge graph with sets to avoid duplicates
knowledge_graph = defaultdict(lambda: defaultdict(set))

# Parse every stored LLM reply line by line and add each valid triple to the graph.
for doc_sentences in text2kg.values():
    for sentence, record in doc_sentences.items():
        # Entries without a stored reply are skipped explicitly; the previous
        # bare `except: pass` would also have hidden real parsing/insertion bugs.
        llm_results = record.get("LLM_Results") if isinstance(record, dict) else None
        if not isinstance(llm_results, str) or "None" in llm_results:
            continue

        print(sentence)
        print(llm_results)
        for line in llm_results.split("\n"):
            kg_triple = parse_and_convert_to_kg(line)
            if kg_triple:
                add_to_knowledge_graph(knowledge_graph, kg_triple)
            # Printed even when None so unparsable lines stay visible.
            print(kg_triple)
        print("___________________________________________________________________________________________________________")

The Eucalyptus Bore Project is located 85km ENE of Kookynie and 45km southeast of the Murrin Murrin mine site.
foundIn(Eucalyptus Bore Project (Location), Kookynie (Location))
foundIn(Eucalyptus Bore Project (Location), Murrin Murrin mine site (Location))
('Eucalyptus Bore Project', 'Location', 'foundIn', 'Kookynie', 'Location')
('Eucalyptus Bore Project', 'Location', 'foundIn', 'Murrin Murrin mine site', 'Location')
___________________________________________________________________________________________________________
Access is via the Kookynie - Mt.
foundIn(Kookynie (Location), Mt. (Location))
('Kookynie', 'Location', 'foundIn', 'Mt.', 'Location')
___________________________________________________________________________________________________________
Several relatively steep N-S striking ridges of weathered ultramafic rocks occasionally capped by a yellowish brown silcrete and deep red ferruginous laterite and small incised watercourses has made access and drilling in certain 

In [32]:
# Print the knowledge graph: one header line per head node, followed by one
# indented line per outgoing (relation, tail) edge.
for head_node, outgoing in knowledge_graph.items():
    head_name, head_kind = head_node
    print(f"{head_name} ({head_kind})")
    for predicate in outgoing:
        for tail_name, tail_kind in outgoing[predicate]:
            print(f"  - {predicate} -> {tail_name} ({tail_kind})")

Eucalyptus Bore Project (Location)
  - foundIn -> Murrin Murrin mine site (Location)
  - foundIn -> Mt (Location)
  - foundIn -> Kookynie (Location)
Kookynie (Location)
  - foundIn -> Mt. (Location)
ultramafic (Rock)
  - contains -> magnesite (Mineral)
  - contains -> olivine mesocumulate (Mineral)
  - contains -> asbestos (Mineral)
  - contains -> silcrete (Mineral)
  - contains -> cap rock (Mineral)
  - contains -> magnetite (Mineral)
  - contains -> silica-rich (Mineral)
  - contains -> laterite (Mineral)
  - contains -> granitic (Rock)
  - contains -> olivine orthocumulate (Mineral)
  - contains -> silica (Mineral)
  - alteredBy -> Proterozoic Dyke (GeologicalProcess)
  - alteredBy -> granite (GeologicalProcess)
  - foundIn -> Gossan Island (Location)
  - foundIn -> narrow belt (Location)
  - foundIn -> YALGOO IRON ORE PROJECT (Location)
  - foundIn -> Yalgoo (Location)
  - occursIn -> Emerald mine (OreDeposit)
RC holes (Rock)
  - foundIn -> Camelback (Location)
Camelback (Rock)
  

In [12]:
# Concatenate every sentence of the document at position 5 into a single
# string (each sentence followed by one space), then preview the ontology
# prompt built from it.
text = ""
for doc_index, doc_name in enumerate(all_tagged):
    if doc_index != 5:
        continue
    for sentence in all_tagged[doc_name]:
        text += sentence + " "
print(text)
print(len(text))

prompt = ontology_prompt(text, ontology_entities, ontology_relations)
print(prompt)
# The LLM call itself is left disabled here:
# results = query(prompt).choices[0].message.content
# print(results)

An extensive review of all historical databases, publicly available and remotely sensed data generated exploration targets which were followed up with rock chip sampling and soil sampling programs. The geochemical anomalies identified have been prioritised resulting in the voluntary partial surrender of exploration tenements 08/1717, 08/1718 and 08/1725 of the Capricorn Project. Coodardoo Formation - The formation comprises a series of prominent sandstone dominated ridges that define the Wanna Syncline to the southwest of the Capricorn Project area. The formation ranges in thickness from 150 to 350 metres and is dominated by lithic quartz arenites and quartz arenites with angular to rounded quartz comprising 60-80% of the rock. It is the youngest stratigraphic unit in the Edmund Group. The lithologies present are thought to represent a turbidite fan deposit with a source most likely in the northwest. The formation stands out with distinct sub rounded boulders forming scree slopes and a

### Create Manual KGC for Evaluation 

In [33]:
# List each sentence key of this document, one per line.
for sentence in all_tagged["a092458_e09_1213_2011_a_12624596.json"]:
    print(sentence)

The Mango Bore Project (E09/1213) is situated approximately 105 km east-northeast of Gascoyne Junction (Figure 1) in the Gascoyne Complex and covers a total area of roughly 99 km2.
The project is accessed from the Dairy Creek to the Cobra Station road and a station track approximately 2 km to the south of Yinnietharra Homestead.
The project area lies on the Yinnietharra and Mooloo Downs Pastoral leases (Figure 2).
2 TENURE 2.1 Tenement Details The Mango Bore Project is owned by Skytone Pty Ltd and operated by U3O8 Limited.
It has common boundaries with U3O8 Limiteds Wabli Creek project E09/1178 (Figure 4).
Figure 1 shows the boundaries of the project.
2.2 Native Title The Mango Bore Project is situated within the Wajarri-Yamatji Claim (WC04-010) area, which was registered and passed the registration test on the 1st December 2005.
2 3 4 3 GEOLOGY 3.1 Regional Geology The Mango Bore project lies near the south-western margin of the Gascoyne Complex.
The Gascoyne Complex comprises Palaeop

In [35]:
text_list = ["The Mango Bore Project (E09/1213) is situated approximately 105 km east-northeast of Gascoyne Junction (Figure 1) in the Gascoyne Complex and covers a total area of roughly 99 km2.",
             "The project is accessed from the Dairy Creek to the Cobra Station road and a station track approximately 2 km to the south of Yinnietharra Homestead.",
             "The project area lies on the Yinnietharra and Mooloo Downs Pastoral leases (Figure 2).",
             "2 TENURE 2.1 Tenement Details The Mango Bore Project is owned by Skytone Pty Ltd and operated by U3O8 Limited.",
             "It has common boundaries with U3O8 Limiteds Wabli Creek project E09/1178 (Figure 4).",
             "2.2 Native Title The Mango Bore Project is situated within the Wajarri-Yamatji Claim (WC04-010) area, which was registered and passed the registration test on the 1st December 2005.",
             "2 3 4 3 GEOLOGY 3.1 Regional Geology The Mango Bore project lies near the south-western margin of the Gascoyne Complex.",
             "The Gascoyne Complex comprises Palaeoproterozoic granitic units as well as medium to high-grade meta-sedimentary units that form the high grade core of the Capricorn Orogen (Sheppard et al., 2007; Figure 3).",
             "The Gascoyne Complex is limited to the west by the Phanerozoic successions of the Carnarvon basin.",
             "(2007), recent dating suggests that three separate orogenic events took place throughout the district.",
             "A fourth one, the Glenburgh Orogeny (2005-1960 Ma), is only known from the southern end of the Gascoyne Province.",
             "The Capricorn Orogeny (1830-1780 Ma) followed the deposition of the protoliths of the medium-grade primarily siliciclastic metasedimentary rocks of the Morrissey Metamorphics.",
             "Maximum depositional age derived from detrital zircons indicate ~1840 Ma for the Morrissey Metamorphics (Varvell, 2001).",
             "The Orogeny is marked by the deformation and intrusion of the Morrissey Metamorphics by the granites of the Moorarie Supersuite, comprising primarily some monzogranite and granodiorite, with minor syenogranite, tonalite and quartz diorite.",
             "The Mangaroon Orogeny (1680-1620 Ma) is best developed in the northern part of the Gascoyne Province (in the Mangaroon zone, Figure 3) and related structures are only poorly developed in the Morrissey Metamorphics.",
             "The lowto medium-grade metasedimentary rocks of the Pooranoo Metamorphics correlate with rocks of the fluviatile Mt James Formation, marking a series of fault-bounded basins deposited on the Gascoyne Complex (Hunter, 1990).",
             "The latter comprises primarily low-grades ~1,700 Ma meta-conglomerates and coarse metasandstones, overlying the Mount Morrissey Metamorphics.",
             "The Edmundian Orogeny (1030-950 Ma) reactivated shear and fault zones.",
             "Based on dates obtained on syn-metamorphic monazite and xenotime, this event has been associated with the peak regional metamorphism (greenschist to amphibolite facies), followed by pegmatite intrusion (Sheppard et al., 2007).",
             "These pegmatites show a regional association with beryllium and tantalumniobium occurrences The Mango Bore project is located close to the boundary between the northern domain of the Glenburgh Terrane and the southern boundary of the Mutherbukin zone of the Gascoyne Complex and extends to the Chalba shear zone to the north.",
             "5 3.2 Project Geology The stratigraphy at Mango Bore comprises the lowermost units of the Morrissey Metamorphics, in contact with reworked Archaean granitoids and Proterozoic pegmatites.",
             "The regolith environment is dominated by an erosional regime with minor relict domains preserved and belongs to a zone of limited U-Ce-La-Pb-Th enrichment in the regolith.",
             "A part of the tenement lies upstream of the Minindi Creek prospect which is known for calcrete hosted uranium mineralisation (Figure 4).",
             "A reverse circulation drilling programme had been planned previously to follow up the calcrete hosted uranium mineralisation south of the Minindi prospect, however, this has been delayed until the next reporting period.",
             "There remains potential for a small, shallow, calcrete hosted uranium resource in the area, but rig availability for a small programme of approximately 50, ten metre deep holes is difficult."
]

In [40]:
def manual_prompt(sentence):
    """Build the free-form (no ontology) triple-extraction prompt for ``sentence``.

    Args:
        sentence: Sentence inserted after "Sentence:" at the end of the prompt.

    Returns:
        The prompt text: task description, instructions, one worked example
        and finally the sentence to process.
    """
    # Plain template filled with str.format; the inserted sentence is not
    # re-interpreted, so braces inside it are kept literally.
    template = """
    Task: Extract information from the following geological survey sentence and represent it as subject-predicate-object triples using the format predicate(subject, object). The sentence may contain information about geological timescales, locations, minerals, ore deposits, rocks, stratigraphy, and temporal entities such as real-time and geological timescales (e.g., 1000 Ma).

    Instructions:

    Identify the key entities from the sentence (e.g., rocks, minerals, geological timescales, locations).
    For each pair of entities, describe the relationship between them using the format predicate(subject, object).
    If a geological timescale or temporal entity is mentioned, relate it to the corresponding rock or stratigraphy.
    Ensure that each triple accurately reflects the relationship between the entities in the sentence.
    Example sentence: "Mapping and geochronology by the Geological Society of Australia reveal that the granitic rocks in the western part of the Yalgoo 1:250,000 map sheet are in the order of 2,800 to 3,000 Ma."

    Extracted triples:

    foundIn(Granitic rocks, western part of Yalgoo map sheet)
    hasTimescale(Granitic rocks, 2,800 to 3,000 Ma)
    conductedBy(Geological Society of Australia, mapping and geochronology)
    Your task: Extract triples from the following sentence:

    Extract the triples from the following sentence:
    Sentence: {sentence}
    """

    return template.format(sentence=sentence)

In [41]:
# Run the free-form extraction prompt over every sentence in `text_list` and
# keep the reply text keyed by the sentence it came from.
trips = {}

for sentence in text_list:
    print(sentence)
    prompt = manual_prompt(sentence)
    reply = query(prompt)
    results = reply.choices[0].message.content
    print(results)
    trips[sentence] = results

The Mango Bore Project (E09/1213) is situated approximately 105 km east-northeast of Gascoyne Junction (Figure 1) in the Gascoyne Complex and covers a total area of roughly 99 km2.
output is:  ChatCompletion(id='chatcmpl-A5V12UTCQjrZCMprg7rtzQvfytAj4', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Extracted triples:\n\nsituatedIn(Mango Bore Project, Gascoyne Complex)  \nlocatedApproximately(Mango Bore Project, 105 km east-northeast of Gascoyne Junction)  \ncoversArea(Mango Bore Project, 99 km²)  ', role='assistant', function_call=None, tool_calls=None, refusal=None))], created=1725874228, model='gpt-4o-mini-2024-07-18', object='chat.completion', system_fingerprint='fp_483d39d857', usage=CompletionUsage(completion_tokens=54, prompt_tokens=381, total_tokens=435))
Extracted triples:

situatedIn(Mango Bore Project, Gascoyne Complex)  
locatedApproximately(Mango Bore Project, 105 km east-northeast of Gascoyne Junction)  
coversArea(Mang

In [43]:
# Print only the stored reply for each sentence; the sentence text itself is omitted.
for extracted in trips.values():
    print(extracted)

Extracted triples:

situatedIn(Mango Bore Project, Gascoyne Complex)  
locatedApproximately(Mango Bore Project, 105 km east-northeast of Gascoyne Junction)  
coversArea(Mango Bore Project, 99 km²)  
Extracted triples:

accessedFrom(Project, Dairy Creek to Cobra Station road)
accessedFrom(Project, station track)
locatedApproximately(Project, 2 km to the south of Yinnietharra Homestead)
Extracted triples:

liesOn(Project area, Yinnietharra and Mooloo Downs Pastoral leases)
Extracted triples:

ownedBy(Mango Bore Project, Skytone Pty Ltd)  
operatedBy(Mango Bore Project, U3O8 Limited)  
Extracted triples:

hasCommonBoundaries(Wabli Creek project, U3O8 Limited)
isPartOf(E09/1178, Wabli Creek project)
Extracted triples:

situatedIn(Mango Bore Project, Wajarri-Yamatji Claim area)
registeredOn(Wajarri-Yamatji Claim area, 1st December 2005)
passedRegistrationTest(Wajarri-Yamatji Claim area, registration test)
Extracted triples:

liesNear(Mango Bore project, south-western margin of Gascoyne Comp