### Text2KG with ontologies + LLM

In [14]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# API key handed to the OpenAI client inside query(); os.getenv returns None
# when API_KEY is not set.
api_key = os.getenv("API_KEY")

In [15]:
import os
import sys
import json

# Make modules located in the current working directory importable.
sys.path.append(os.getcwd())
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by any cell visible in this excerpt — confirm it is still needed.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
# from huggingface_hub import notebook_login
# notebook_login()

In [17]:
from openai import OpenAI

# Initialize OpenAI API

# One client per API key, shared across calls so HTTP connections are reused.
_OPENAI_CLIENTS = {}


def _get_client(key):
    """Return the cached OpenAI client for `key`, creating it on first use."""
    if key not in _OPENAI_CLIENTS:
        _OPENAI_CLIENTS[key] = OpenAI(api_key=key)
    return _OPENAI_CLIENTS[key]


def query(prompt, model='gpt-4o-mini', temperature=0):
    """Send `prompt` as a single user message and return the raw completion.

    Args:
        prompt: Text used as the content of the one user message.
        model: Model name forwarded to the chat completions endpoint.
        temperature: Sampling temperature forwarded to the endpoint.

    Returns:
        The response object returned by ``client.chat.completions.create``.
        Callers read the generated text from ``.choices[0].message.content``.
    """
    # This function is called once per sentence, so building a new client on
    # every call would set up a fresh connection pool each time. The cache is
    # keyed on the current value of the module-level `api_key`, so re-running
    # the cell that loads the key still takes effect.
    client = _get_client(api_key)

    res = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    print('output is: ', res)
    return res

# # Use your custom query function
# completed_prompt = query(prompt)
# print(completed_prompt)

In [23]:
import json

# Load the ontology (concepts and relations) that constrains triple extraction.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open('ontology/geo_ontology.json', 'r', encoding='utf-8') as file:
    ontology = json.load(file)

print(ontology)

# Comma-separated concept labels, e.g. "Rock, Mineral".
# Single quotes inside the f-strings/expressions keep this cell valid on
# Python versions older than 3.12, which reject nested double quotes.
ontology_entities = ", ".join(concept['label'] for concept in ontology['concepts'])
print(ontology_entities)

# Relation labels; used later to check whether an LLM answer mentions a relation.
relations_list = [relation['label'] for relation in ontology['relations']]

# Relation signatures written as "label(domain, range)".
# The domain and range are interpolated separately: formatting them as a tuple
# would leak Python's tuple repr (quotes included) into the prompt. Joining
# also avoids the dangling ", " that a running concatenation leaves behind.
ontology_relations = ", ".join(
    f"{relation['label']}({relation['domain']}, {relation['range']})"
    for relation in ontology['relations']
)

print(ontology_relations)

print("___________________________________________________________________________________________________________")

# Sample sentence used to preview the prompt.
text = "Mapping and geochronology by the Geological Society of Australia (Arriens, 1971) reveal that the granitic rocks in the western part of the Yalgoo 1:250,000 map sheet are in the order of 2,800 to 3,000 Ma."
def ontology_prompt(text, ontology_entities, ontology_relations):
    """Build the one-shot triple-extraction prompt for a test sentence.

    Args:
        text: Sentence (or passage) to extract triples from.
        ontology_entities: Ontology concepts, already formatted as text.
        ontology_relations: Ontology relations, already formatted as text.

    Returns:
        The prompt string: instruction, ontology context, one worked example,
        then the test sentence followed by an open "Test Output:" section.
    """
    prompt_lines = [
        "Given the following ontology and sentences, please extract the triples from the sentence according to the relations in the ontology. In the output, only include the triples in the given output format.",
        "",
        "context:",
        f"Ontology Concepts: {ontology_entities}",
        f"Ontology Relations: {ontology_relations}",
        "",
        "Example Sentence: The stratigraphy of the area includes the Triassic Sandstone, which is approximately 250 Ma old and contains quartz and feldspar.",
        "Example Output:",
        "belongsTo(Triassic (Stratigraphy), Sandstone (Rock))",
        "contains(Sandstone (Rock), quartz (Mineral))",
        "contains(Sandstone (Rock), feldspar (Mineral))",
        "",
        f"Test Sentence: {text}",
        "Test Output:",
        # The prompt ends with a line of four spaces after "Test Output:".
        "    ",
    ]

    return "\n".join(prompt_lines)

# Build the prompt for the sample sentence held in `text` and show it for inspection.
prompt = ontology_prompt(text, ontology_entities, ontology_relations)

print(prompt)

{'title': 'GeologicalSurveys Ontology', 'id': 'ont_geologicalsurveys', 'concepts': [{'qid': 'GeologicalTimescale', 'label': 'GeologicalTimescale'}, {'qid': 'Location', 'label': 'Location'}, {'qid': 'Mineral', 'label': 'Mineral'}, {'qid': 'OreDeposit', 'label': 'OreDeposit'}, {'qid': 'Rock', 'label': 'Rock'}, {'qid': 'Stratigraphy', 'label': 'Stratigraphy'}, {'qid': 'Date', 'label': 'Date'}], 'relations': [{'pid': 'contains', 'label': 'contains', 'domain': 'Rock', 'range': 'Mineral'}, {'pid': 'contains', 'label': 'contains', 'domain': 'Mineral', 'range': 'OreDeposit'}, {'pid': 'belongsTo', 'label': 'belongsTo', 'domain': 'Stratigraphy', 'range': 'Rock'}, {'pid': 'foundIn', 'label': 'foundIn', 'domain': 'Rock', 'range': 'Location'}, {'pid': 'foundIn', 'label': 'foundIn', 'domain': 'Mineral', 'range': 'Location'}, {'pid': 'foundAt', 'label': 'foundAt', 'domain': 'Rock', 'range': 'Date'}, {'pid': 'foundAt', 'label': 'foundAt', 'domain': 'Mineral', 'range': 'Date'}, {'pid': 'foundAt', 'labe

In [19]:
# Load the tagged sentences. Judging by how later cells index it, the layout
# is report file name -> sentence -> tag record.
# JSON is read as UTF-8 explicitly rather than with the platform default.
with open('../Results/all_tags.json', 'r', encoding='utf-8') as file:
    all_tagged = json.load(file)

In [20]:
# Inspect the tagged sentences of a single report.
print(all_tagged["a075237_ep_a_all_2007_10582280.json"])

{'Managed By: GME Resources Ltd Level 2 907 Canning Highway Mt.': {'preprocess': 'Managed By : GME Resources Ltd Level 2 907 Canning Highway Mt .', 'time_tagged': 'Managed By : GME Resources Ltd Level 2 907 Canning Highway Mt .', 'ner_tagged': ['O O O O O O O O O B-LOCATION I-LOCATION I-LOCATION O O'], 'combine_tags': ['error']}, 'Pleasant WA 6153 Distribution: Department of Industry and Resources - Perth GME Resources Limited NiWest June 2007 1': {'preprocess': 'Pleasant WA 6153 Distribution : Department of Industry and Resources - Perth GME Resources Limited NiWest June 2007 1', 'time_tagged': 'Pleasant WA 6153 Distribution : Department of Industry and Resources - Perth GME Resources Limited NiWest <TIMEX3 tid="t1" type="DATE" value="">June 2007</TIMEX3> 1', 'ner_tagged': ['B-LOCATION I-LOCATION O O O O O O O O O B-LOCATION O O O O O O O O O'], 'combine_tags': ['error']}, 'The Eucalyptus Bore Project is located 85km ENE of Kookynie and 45km southeast of the Murrin Murrin mine site.':

In [21]:
# Load the previously saved results so new answers are added to them.
with open('LLM_Results.json', 'r', encoding='utf-8') as file:
    text2kg = json.load(file)

for doc_id, sentences in all_tagged.items():
    # Create missing entries instead of raising KeyError part-way through a
    # long run when the saved file lacks a document or a sentence.
    doc_results = text2kg.setdefault(doc_id, {})
    for sentence in sentences:
        prompt = ontology_prompt(sentence, ontology_entities, ontology_relations)
        # The message content can be None; normalise it to "" so the
        # substring search below cannot raise TypeError.
        results = query(prompt).choices[0].message.content or ""
        print(results)
        # Keep the answer only when it mentions at least one ontology relation;
        # otherwise store the "None" sentinel that the display cells skip.
        has_relation = any(keyword in results for keyword in relations_list)
        doc_results.setdefault(sentence, {})["LLM_Results"] = results if has_relation else "None"

output is:  ChatCompletion(id='chatcmpl-A4Teke3HRza0ADckggISXsPGrKGM0', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='``` \n```', role='assistant', function_call=None, tool_calls=None, refusal=None))], created=1725630674, model='gpt-4o-mini-2024-07-18', object='chat.completion', system_fingerprint='fp_483d39d857', usage=CompletionUsage(completion_tokens=3, prompt_tokens=404, total_tokens=407))
``` 
```
output is:  ChatCompletion(id='chatcmpl-A4TekYdg2rP07BAXa0YZedju26XZP', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='foundIn(Pleasant WA (Location), 6153 (Location))\nfoundIn(Department of Industry and Resources (Location), Perth (Location))\ndiscoveredBy(GME Resources Limited (OreDeposit), Person)\nfoundAt(NiWest (OreDeposit), June 2007 (Date))', role='assistant', function_call=None, tool_calls=None, refusal=None))], created=1725630674, model='gpt-4o-mini-2024-07-18', object='c

KeyboardInterrupt: 

In [7]:
# Print every sentence of the first document that has extracted triples.
for doc_id in text2kg:
    for sentence, record in text2kg[doc_id].items():
        try:
            llm_result = record["LLM_Results"]
            no_triples = "None" in llm_result
        except (KeyError, TypeError):
            # Sentence not processed yet, or the stored value is not searchable.
            # Only these two errors are skipped, so KeyboardInterrupt and
            # genuine bugs are no longer silently swallowed.
            continue
        if no_triples:
            continue
        print(sentence)
        print(llm_result)
        print("___________________________________________________________________________________________________________")
    # Only the first document is shown.
    break

Murrin Murrin North area contains irregular outcrops of felsic volcanic rocks, gabbro and pyroxenite, surrounded by the olivine cumulates.
contains(Murrin Murrin North (Location), felsic volcanic rocks (Rock))
contains(Murrin Murrin North (Location), gabbro (Rock))
contains(Murrin Murrin North (Location), pyroxenite (Rock))
contains(Murrin Murrin North (Location), olivine cumulates (Rock))
___________________________________________________________________________________________________________


In [9]:
# Save json
# with open('LLM_Results.json', 'w') as file:
#     json.dump(text2kg, file)

# Reload the saved extraction results from disk.
# JSON is read as UTF-8 explicitly rather than with the platform default.
with open('LLM_Results.json', 'r', encoding='utf-8') as file:
    text2kg = json.load(file)

In [49]:
# Print every sentence, across all documents, that has extracted triples.
for doc_id in text2kg:
    for sentence, record in text2kg[doc_id].items():
        try:
            llm_result = record["LLM_Results"]
            no_triples = "None" in llm_result
        except (KeyError, TypeError):
            # Sentence not processed yet, or the stored value is not searchable.
            # Only these two errors are skipped, so KeyboardInterrupt and
            # genuine bugs are no longer silently swallowed.
            continue
        if no_triples:
            continue
        print(sentence)
        print(llm_result)
        print("___________________________________________________________________________________________________________")

The sequence also contains mafic volcanic and intrusive rocks and has been intruded by porphyritic granodiorite and felsic to mafic dykes.
contains(sequence (Rock), mafic volcanic (Rock))
contains(sequence (Rock), intrusive rocks (Rock))
contains(sequence (Rock), porphyritic granodiorite (Rock))
contains(sequence (Rock), felsic to mafic dykes (Rock))
___________________________________________________________________________________________________________
Murrin Murrin North area contains irregular outcrops of felsic volcanic rocks, gabbro and pyroxenite, surrounded by the olivine cumulates.
contains(Murrin Murrin North area (Location), felsic volcanic rocks (Rock))
contains(Murrin Murrin North area (Location), gabbro (Rock))
contains(Murrin Murrin North area (Location), pyroxenite (Rock))
contains(Murrin Murrin North area (Location), olivine cumulates (Rock))
___________________________________________________________________________________________________________
This granite is of

In [44]:
# Position (in all_tagged iteration order) of the report whose sentences are
# merged into one passage.
DOC_INDEX = 5

# Pick the report directly instead of scanning every sentence of every report.
doc_ids = list(all_tagged)
doc_sentences = all_tagged[doc_ids[DOC_INDEX]] if DOC_INDEX < len(doc_ids) else []

# Each sentence is followed by one space, so the passage keeps a trailing space.
text = "".join(f"{sentence} " for sentence in doc_sentences)
# if "GEO_TIME" in all_tagged[a][b]["combine_tags"][0]:
#     print(i)
print(text)
print(len(text))

prompt = ontology_prompt(text, ontology_entities, ontology_relations)
print(prompt)
# results = query(prompt).choices[0].message.content
# print(results)

An extensive review of all historical databases, publicly available and remotely sensed data generated exploration targets which were followed up with rock chip sampling and soil sampling programs. The geochemical anomalies identified have been prioritised resulting in the voluntary partial surrender of exploration tenements 08/1717, 08/1718 and 08/1725 of the Capricorn Project. Coodardoo Formation - The formation comprises a series of prominent sandstone dominated ridges that define the Wanna Syncline to the southwest of the Capricorn Project area. The formation ranges in thickness from 150 to 350 metres and is dominated by lithic quartz arenites and quartz arenites with angular to rounded quartz comprising 60-80% of the rock. It is the youngest stratigraphic unit in the Edmund Group. The lithologies present are thought to represent a turbidite fan deposit with a source most likely in the northwest. The formation stands out with distinct sub rounded boulders forming scree slopes and a