## Testing on Data
Opening the database files


~corpus folder

In [2]:
import os
import sys
import json

sys.path.append(os.getcwd())
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
from dataloc import directory_path
from pathlib import Path

data_path = Path(directory_path)

print(data_path)

# full_path = os.path.join(directory_path, r'corpus\document2anumber.json')
# full_path = directory_path + r"\corpus\document2anumber.json"
full_path = data_path / "corpus" / "document2anumber.json"

print(full_path)

with open(full_path, 'r') as file:
    data = json.load(file)

..\..\wamex\data
..\..\wamex\data\corpus\document2anumber.json


# Open Domain Dictionary Files

In [4]:
Domain_Dictionary_Path = {"geological_timescales": "Domain_Dictionary/geological_timescales.txt",
                          "locations": "Domain_Dictionary/locations.txt",
                          "minerals": "Domain_Dictionary/minerals.txt",
                          "ores_deposits": "Domain_Dictionary/ores_deposits.txt",
                          "rocks": "Domain_Dictionary/rocks.txt",
                          "stratigraphy": "Domain_Dictionary/stratigraphy.txt"}

Domain_Dictionary = {}

for domain, path in Domain_Dictionary_Path.items():
    with open(path, "r") as file:
        info = file.read().splitlines()
        if domain == "geological_timescales":
            info = [i.lstrip().split()[0] for i in info]
        info = [i for i in info if i.strip()] # Make sure there are no empty strings
        Domain_Dictionary[domain] = info

        print(info)
        print(len(info))

# print(Domain_Dictionary["minerals"])

['Aalenian', 'Abereiddian', 'Acadian', 'Actonian', 'Adelaidean', 'Aegean', 'Aeronian', 'Aksayan', 'Aktastinian', 'Alaunian', 'Albertan', 'Albian', 'Aldingian', 'Alexandrian', 'Alportian', 'Altonian', 'Amgan', 'Animikean', 'Anisian', 'Aphebian', 'Aptian', 'Aquitanian', 'Aratauran', 'Archean', 'Archeozoic', 'Arenig', 'Arenigian', 'Arikareean', 'Aritan', 'Arnsbergian', 'Arowhanan', 'Artinskian', 'Arundian', 'Asbian', 'Ashgill', 'Asselian', 'Astian', 'Atdabanian', 'Atokan/Derryan', 'Atokan', 'Aurelucian', 'Austinian', 'Auversian', 'Awamoan', 'Ayusokkanian', 'Azoic', 'Baigendzinian', 'Bairnsdalian', 'Baishaean', 'Bajocian', 'Bala', 'Balan', 'Balcombian', 'Bananian', 'Baotan', 'Barremian', 'Barstovian', 'Bartonian', 'Bashkirian', 'Basin', 'Batesfordian', 'Bathonian', 'Batyrbayan', 'Begudian', 'Bendigonian', 'Berriasian', 'Bithynian', 'Black', 'Blackriveran', 'Blackriverian', 'Blancan', 'Bolindian', 'Bolsovian', 'Boomerangian', 'Bortonian', 'Botomian', 'Braxtonian', 'Bridgerian', 'Brigantian'

In [5]:
entity_type = {"geological_timescales": "TIMESCALE",
               "locations": "LOCATION",
               "minerals": "MINERAL",
               "ores_deposits": "ORE_DEPOSIT",
               "rocks": "ROCK",
               "stratigraphy": "STRAT"}

domain_training_data = []
for domain in Domain_Dictionary:
    for key in Domain_Dictionary[domain]:
        input = key.split(" ")
        labels = [f"I-{entity_type[domain]}"] * len(input)
        labels[0] = f"B-{entity_type[domain]}"
        domain_training_data.append({"output":input, "labels":labels})

# with open("NER/Training_data/DomainDictionary.json", "w") as file:
#     json.dump(domain_training_data, file)

### geo ontology
extract info

In [6]:
# https://github.com/CGI-IUGS/timescale-data/tree/master/rdf ontology grabbed here

from rdflib import Graph

# Create a graph
g = Graph()

# Parse the data from the TTL file
g.parse("Domain_Dictionary/isc2020.ttl", format="ttl")

geo_subject_label_vocab = {}

query = """
PREFIX gts: <http://resource.geosciml.org/ontology/timescale/gts#>
PREFIX isc: <http://resource.geosciml.org/classifier/ics/ischart/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT ?subject ?label
WHERE {
  ?subject a ?type ;
            rdfs:label ?label ;
  FILTER (?type IN (
    gts:Super-Eon,
    gts:Eon, 
    gts:Era, 
    gts:Period, 
    gts:Sub-Period,
    gts:Epoch, 
    gts:Age
  ))
}
"""

# Execute the query
results = g.query(query)

# Process the results
for row in results:
    subject = row.subject
    label = row.label

    geo_subject_label_vocab[subject] = label
    

# Define the query to fetch all subjects and their labels based on [Eon, Era, Period, Epoch, Age]
geo_dict_scale = {}
time_periods = ["Eon", "Era", "Sub-Period", "Period", "Epoch", "Age"]

for time_period_type in time_periods:
    query = f"""
    PREFIX gts: <http://resource.geosciml.org/ontology/timescale/gts#>
    PREFIX isc: <http://resource.geosciml.org/classifier/ics/ischart/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT ?subject ?label ?broader ?narrower ?comment
    WHERE {{
      ?subject a gts:{time_period_type} ;
                rdfs:label ?label ;
                rdfs:comment ?comment .
        OPTIONAL {{ ?subject skos:broader ?broader }} .
        OPTIONAL {{ ?subject skos:narrower ?narrower }} .
    }}
    """
    # Execute the query
    results = g.query(query)

    # Dictionary to hold the combined comments
    combined_dict = {}
    # Process the results
    for row in results:
        subject = row.subject
        label = row.label
        broader = row.broader
        comment = row.comment
        narrower = row.narrower
  
        # Initialize the dictionary entry if it doesn't exist
        if (subject, label) not in combined_dict:
            combined_dict[(subject, label)] = {
                "broader": broader,
                "narrower": [],
                "comments": []
            }
        
        # Append the comment to the dictionary entry
        if comment not in combined_dict[(subject, label)]["comments"]:
            combined_dict[(subject, label)]["comments"].append(comment)

        # Append to narrower list if not already present
        if narrower and narrower not in combined_dict[(subject, label)]["narrower"]:
            combined_dict[(subject, label)]["narrower"].append(narrower)

    # Print the results
    for (subject, label), data in combined_dict.items():
        broader = data["broader"]
        comments = [str(c) for c in data["comments"]]
        
        # print(f"Subject: {subject}, Label: {label}, Broader: {broader}")
        # print(f"Comments: {comments}")
        # print(f"Narrower: {data['narrower']}")

    geo_dict_scale[time_period_type] = combined_dict


In [7]:
def count_numbers(list):
    count = 0
    for item in list:
        try:
            # Attempt to convert the item to a float
            float(item.replace('|', '').replace('+', ''))
            count += 1
        except ValueError:
            pass
    return count

def find_geo_time_bound(timeBounds):
    bound_count = count_numbers(timeBounds)
    if bound_count == 1:
        bound = timeBounds[-2]
        uncertainty = 0
    elif bound_count == 2:
        bound = timeBounds[-3]
        uncertainty = timeBounds[-2]
    else:
        bound = None
        uncertainty = None
    return bound, uncertainty

# Place the geological time scale dictionary into the Domain Dictionary json file format
time_periods = ["Age", "Epoch", "Period", "Sub-Period", "Era", "Eon"]

timescale_sorted = {"Eon": {}, "Era": {}, "Sub-Period": {}, "Period": {}, "Epoch": {}, "Age": {}}
prev_type = None
for time_period_type in time_periods:
    current_list = {}
    for (subject, label), data in geo_dict_scale[time_period_type].items():
        broader = data["broader"]
        comments = [str(c) for c in data["comments"]]

        label = str(label)
        # Get the broader and narrower labels
        broader_label = str(geo_subject_label_vocab[broader]) if broader else None
        narrower = [str(geo_subject_label_vocab[n]) for n in data["narrower"] if n]
        # Find the upper and lower bounds
        older_bound = comments[0].split(" ") if comments else None
        younger_bound = comments[-1].split(" ") if comments else None
        upperbound, upper_uncertainty = find_geo_time_bound(older_bound)
        lowerbound, lower_uncertainty = find_geo_time_bound(younger_bound)

        timescale_sorted[time_period_type][label] = {
            "broader": broader_label,
            "narrower": narrower,
            "upperbound" : upperbound,
            "upper_uncertainty" : upper_uncertainty,
            "lowerbound" : lowerbound,
            "lower_uncertainty" : lower_uncertainty
        }

print(timescale_sorted)

{'Eon': {'Archean Eon': {'broader': 'Precambrian Supereon', 'narrower': ['Eoarchean Era', 'Mesoarchean Era', 'Neoarchean Era', 'Paleoarchean Era'], 'upperbound': '-4000', 'upper_uncertainty': 0, 'lowerbound': '-2500', 'lower_uncertainty': 0}, 'Hadean Eon': {'broader': 'Precambrian Supereon', 'narrower': [], 'upperbound': '-4567', 'upper_uncertainty': '+|-1', 'lowerbound': '-4000', 'lower_uncertainty': 0}, 'Phanerozoic Eon': {'broader': None, 'narrower': ['Cenozoic Era', 'Mesozoic Era', 'Paleozoic Era'], 'upperbound': '-541.0', 'upper_uncertainty': '+|-1.0', 'lowerbound': '-0.0', 'lower_uncertainty': 0}, 'Proterozoic Eon': {'broader': 'Precambrian Supereon', 'narrower': ['Mesoproterozoic Era', 'Neoproterozoic Era', 'Paleoproterozoic Era'], 'upperbound': '-2500', 'upper_uncertainty': 0, 'lowerbound': '-541.0', 'lower_uncertainty': '+|-1.0'}}, 'Era': {'Cenozoic Era': {'broader': 'Phanerozoic Eon', 'narrower': ['Neogene Period', 'Paleogene Period', 'Quaternary Period'], 'upperbound': '-66.

In [8]:
def ontology_to_triples(ontology_dict):
    """
    Convert ontology dictionary to a list of triples.
    
    Parameters:
    - ontology_dict: The dictionary representing the ontology
    
    Returns:
    - A list of triples (subject, predicate, object)
    """
    triples = []
    for category, items in ontology_dict.items():
        for item, details in items.items():
            if details.get('broader'):
                triples.append((item, 'broader', details['broader']))
            if details.get('narrower'):
                for narrower in details['narrower']:
                    triples.append((item, 'narrower', narrower))
            if details.get('upperbound'):
                triples.append((item, 'upperbound', details['upperbound']))
            if details.get('lowerbound'):
                triples.append((item, 'lowerbound', details['lowerbound']))
    return triples


triples = ontology_to_triples(timescale_sorted)
print(triples)

[('Archean Eon', 'broader', 'Precambrian Supereon'), ('Archean Eon', 'narrower', 'Eoarchean Era'), ('Archean Eon', 'narrower', 'Mesoarchean Era'), ('Archean Eon', 'narrower', 'Neoarchean Era'), ('Archean Eon', 'narrower', 'Paleoarchean Era'), ('Archean Eon', 'upperbound', '-4000'), ('Archean Eon', 'lowerbound', '-2500'), ('Hadean Eon', 'broader', 'Precambrian Supereon'), ('Hadean Eon', 'upperbound', '-4567'), ('Hadean Eon', 'lowerbound', '-4000'), ('Phanerozoic Eon', 'narrower', 'Cenozoic Era'), ('Phanerozoic Eon', 'narrower', 'Mesozoic Era'), ('Phanerozoic Eon', 'narrower', 'Paleozoic Era'), ('Phanerozoic Eon', 'upperbound', '-541.0'), ('Phanerozoic Eon', 'lowerbound', '-0.0'), ('Proterozoic Eon', 'broader', 'Precambrian Supereon'), ('Proterozoic Eon', 'narrower', 'Mesoproterozoic Era'), ('Proterozoic Eon', 'narrower', 'Neoproterozoic Era'), ('Proterozoic Eon', 'narrower', 'Paleoproterozoic Era'), ('Proterozoic Eon', 'upperbound', '-2500'), ('Proterozoic Eon', 'lowerbound', '-541.0'),

In [9]:
def create_prompt_with_ontology(text, triples):
    """
    Create a prompt for the LLM to extract triples from geological text.
    
    Parameters:
    - text: The unstructured geological text
    - triples: List of ontology triples
    
    Returns:
    - A formatted prompt string
    """
    triples_info = "\n".join([f"{subj} --{pred}--> {obj}" for subj, pred, obj in triples])
    
    prompt = f"""
Here is a list of geological triples based on the ontology:

{triples_info}

And here is some unstructured geological text:

{text}

Please extract any relevant triples from the text based on the provided ontology.
"""
    return prompt

# Example usage
text = "The Cenozoic Era includes the Neogene and Paleogene periods. It follows the Mesozoic Era."
prompt = create_prompt_with_ontology(text, triples)
print(prompt)
print(len(prompt))


Here is a list of geological triples based on the ontology:

Archean Eon --broader--> Precambrian Supereon
Archean Eon --narrower--> Eoarchean Era
Archean Eon --narrower--> Mesoarchean Era
Archean Eon --narrower--> Neoarchean Era
Archean Eon --narrower--> Paleoarchean Era
Archean Eon --upperbound--> -4000
Archean Eon --lowerbound--> -2500
Hadean Eon --broader--> Precambrian Supereon
Hadean Eon --upperbound--> -4567
Hadean Eon --lowerbound--> -4000
Phanerozoic Eon --narrower--> Cenozoic Era
Phanerozoic Eon --narrower--> Mesozoic Era
Phanerozoic Eon --narrower--> Paleozoic Era
Phanerozoic Eon --upperbound--> -541.0
Phanerozoic Eon --lowerbound--> -0.0
Proterozoic Eon --broader--> Precambrian Supereon
Proterozoic Eon --narrower--> Mesoproterozoic Era
Proterozoic Eon --narrower--> Neoproterozoic Era
Proterozoic Eon --narrower--> Paleoproterozoic Era
Proterozoic Eon --upperbound--> -2500
Proterozoic Eon --lowerbound--> -541.0
Cenozoic Era --broader--> Phanerozoic Eon
Cenozoic Era --narrowe

In [10]:
# print(completed_prompt.choices[0].message.content)

In [11]:

text = """
Mapping and geochronology by the Geological Society of Australia (Arriens, 1971) reveal that the granitic rocks in the western part of the Yalgoo 1:250,000 map sheet are in the order of 2,800 to 3,000 Ma.
"""

prompt = f"""
I will provide a geological text. Your task is to extract information as triples in the format (subject, predicate, object). 
Use the provided geological ontology, which includes the following entities and relationships, to identify and relate entities in the text:

Ontology:
{timescale_sorted}

Given the following text, extract the triples:

"{text}"

List each triple on a new line, following the format (subject, predicate, object). Use the ontology to ensure accuracy in identifying and relating the entities.
"""

# print(prompt)

def query(prompt, model='gpt-4o-mini', temperature=0):
    client = OpenAI(api_key=api_key)

    res = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    print('output is: ', res)
    return res

# Execute the query
# completed_prompt = query(prompt)

In [12]:
# print(completed_prompt.choices[0].message.content)

In [13]:
ran = {"id": "ont_10_comicscharacter_test_1", "prompt": "Given the following ontology and sentences, please extract the triples from the sentence according \n    to the relations in the ontology. In the output, only include the triples in the given output format. \n\nCONTEXT:\n\nOntology Concepts: ComicsCharacter, Voice, Country, City, Place, Film, Organisation, Award, Person, Series,\nOntology Relations: voice(ComicsCharacter,Voice), city(ComicsCharacter,City), nationality(ComicsCharacter,Country), firstAired(Film,Date), distributor(Film,Organisation), award(Film,Award), lastAired(Film,Date), keyPerson(Organisation,Person), foundedBy(Organisation,Person), creator(ComicsCharacter,Person), starring(Film,ComicsCharacter), series(Film,Series), firstAppearanceInFilm(ComicsCharacter,Date), fullName(ComicsCharacter,string), child(ComicsCharacter,ComicsCharacter), alternativeName(ComicsCharacter,string), birthPlace(ComicsCharacter,Place), broadcastedBy(ComicsCharacter,Organisation)\n\nExample Sentence: The comic character Asterix, was created by Ren\u00e9 Goscinny and Albert Uderzo.\nExample Output:\ncreator(Asterix (comicsCharacter), Ren\u00e9 Goscinny)\nalternativeName(Asterix (comicsCharacter), \"Ast\u00e9rix\")\ncreator(Asterix (comicsCharacter), Albert Uderzo)\n\nTest Sentence: Jan Duursema and Paul Kupperberg created the comic book character of Arion who is also known as Ahri'ahn.\nTest Output: "}
print(ran["prompt"])

Given the following ontology and sentences, please extract the triples from the sentence according 
    to the relations in the ontology. In the output, only include the triples in the given output format. 

CONTEXT:

Ontology Concepts: ComicsCharacter, Voice, Country, City, Place, Film, Organisation, Award, Person, Series,
Ontology Relations: voice(ComicsCharacter,Voice), city(ComicsCharacter,City), nationality(ComicsCharacter,Country), firstAired(Film,Date), distributor(Film,Organisation), award(Film,Award), lastAired(Film,Date), keyPerson(Organisation,Person), foundedBy(Organisation,Person), creator(ComicsCharacter,Person), starring(Film,ComicsCharacter), series(Film,Series), firstAppearanceInFilm(ComicsCharacter,Date), fullName(ComicsCharacter,string), child(ComicsCharacter,ComicsCharacter), alternativeName(ComicsCharacter,string), birthPlace(ComicsCharacter,Place), broadcastedBy(ComicsCharacter,Organisation)

Example Sentence: The comic character Asterix, was created by René Gosci

In [14]:
# Not Finished, decided to do different method

# # Place the geological time scale dictionary into the Domain Dictionary json file format
# time_periods = ["Eon", "Era", "Sub-Period", "Period", "Epoch", "Age"]

# timescale_sorted = {}
# previous_list = {}
# for time_period_type in time_periods:
#     current_list = {}
#     for (subject, label), data in geo_dict_scale[time_period_type].items():
#         broader = data["broader"]
#         comments = [str(c) for c in data["comments"]]

#         # print(f"Subject: {subject}, Label: {label}, Broader: {broader}")
#         # print(f"Comments: {comments}")

#         label = str(label)
            
#         if time_period_type != "Eon":
#             broader_label = str(geo_subject_label_vocab[broader])
#             if broader_label not in current_list:
#                 current_list[broader_label] = {
#                     "type": time_period_type,
#                     label: {
#                         "upperbound" : None,
#                         "upper_uncertainty" : 0,
#                         "lowerbound" : None,
#                         "lower_uncertainty" : 0
#                     }
#                 }
#             else:
#                 current_list[broader_label][label] = {
#                     "upperbound" : None,
#                     "upper_uncertainty" : 0,
#                     "lowerbound" : None,
#                     "lower_uncertainty" : 0
#                 }

#             if label in previous_list:
#                 current_list[broader_label][label] = previous_list[label]
#                 print(previous_list[label])

#         else:
#             timescale_sorted

#     if (time_period_type == "Sub-Period"):
#         current_list.update(previous_list)

#     previous_list = current_list
#     print(current_list)

# print(current_list)



In [15]:
# import rdflib

# with open("Domain_Dictionary/geological_timescales.txt", "r") as file:
#     geological_timescale_text = file.read()

# print(geological_timescale_text)
# #  eons eras periods epochs ages 

# # Extract the geological timescales words


# # Extract the geological timescales
# geological_timescales = geological_timescale_text.split("\n")
# geo_temporal_scale = {}
# geo_temporal_scale["eons"] = []


In [16]:
from dataloc import directory_path

data_path = directory_path
print(Path(data_path) / "corpus" / "document2anumber.json")
print(os.path.join(directory_path, r'corpus\document2anumber.json'))

with open(Path(data_path) / "corpus" / "document2anumber.json", 'r') as file:
    # os.path.join(directory_path, r'corpus\document2anumber.json'), 'r') as file:
    data = json.load(file)

with open(Path(data_path) / "corpus" / "document2keywords.json", 'r') as file:
    # os.path.join(directory_path, r'corpus/document2keywords.json'), 'r') as file:
    datafd = json.load(file)

with open(Path(data_path) / "corpus" / "keyword2documents.json", 'r') as file:
    # os.path.join(directory_path, r'corpus/keyword2documents.json'), 'r') as file:
    datakeys = json.load(file)

for i, d in enumerate(data):
    if i > 0:
        break
    print("document2anumber.json:")
    print(d, data[d])

print()

for i, d in enumerate(datafd):
    if i > 0:
        break
    print("document2keywords.json:")
    print(d, datafd[d])

print()

for i, d in enumerate(datakeys):
    if i > 0:
        break
    print("keyword2documents.json:")
    print(d, datakeys[d])

print()

print("~~~~~~'a073533_nf_e09_1004_2006a_12145691.pdf' testing:")
print(data['a073533_nf_e09_1004_2006a_12145691.pdf'])
print(datafd['a073533_nf_e09_1004_2006a_12145691.pdf'])

print()


with open(Path(data_path) / "wamex_metadata" / "073533.json", 'r') as file:
    meta_data = json.load(file)

print("073533.json testing:")
print(meta_data)



..\..\wamex\data\corpus\document2anumber.json
..\..\wamex\data\corpus\document2anumber.json
document2anumber.json:
a087761_e28_1833_2010_16911280.pdf 087761

document2keywords.json:
a087761_e28_1833_2010_16911280.pdf {'laverton-karonie greenstone belt': {'textrank': 0, 'tf_idf': 0.1870187238}, 'felsic schists shales': {'textrank': 0.0322580645, 'tf_idf': 0.1745545671}, 'mt monger gold': {'textrank': 0.0418788908, 'tf_idf': 0.3740374477}, 'for gold': {'textrank': 0.0470761702, 'tf_idf': 0.2100083723}, 'dolerite': {'textrank': 0.0322580645, 'tf_idf': 0.0453549253}, 'nickel': {'textrank': 0.0322580645, 'tf_idf': 0.0429806772}, 'significant gold': {'textrank': 0.0248490117, 'tf_idf': 0.0708358369}, 'flexure within host greenstone sequences': {'textrank': 0.0182462575, 'tf_idf': 0.1798837496}, 'regolith': {'textrank': 0.0322580645, 'tf_idf': 0.0987627594}, 'ppb au': {'textrank': 0.0322580645, 'tf_idf': 0.0575461992}, 'cowarna rocks project': {'textrank': 0.0226372383, 'tf_idf': 0.4128977757

### Opening PDF 

In [17]:
# ! pip install pypdf
# https://www.geeksforgeeks.org/working-with-pdf-files-in-python/
from pypdf import PdfReader 

reader = PdfReader(Path(directory_path) / "wamex_pdf" / "a073533_nf_e09_1004_2006a_12145691.pdf")
    # os.path.join(directory_path, r'wamex_pdf/a073533_nf_e09_1004_2006a_12145691.pdf'))

print(len(reader.pages))

for i in range(len(reader.pages)):
        print(f"Page {i+1}:")
        print(reader.pages[i].extract_text())


23
Page 1:
  
 
 
 
 
 
32 Kings Park Rd West Perth 6005 
 
 
Midwest Corporation Limited 
 
New Forest Project 
Annual Report for E70 / 1004 
 
26 September 2005 – 25 September 2006 
 
 
 
 
 
 
 
 
 
Author:  David Broomfield  
B.Sc. (Hons), Dip. Ed. 
Validated: David Broomfield 
B.Sc. (Hons), Dip. Ed. 
Date:  20th October 2006 
Page 2:
Midwest Corporation Limited 
Annual Report E09/1004 September 2006  
 
 
 
 
 
 
TABLE OF CONTENTS 
                                   PAGE 
EXECUTIVE SUMMARY.....................................................................................................................1 
1.0 BIBLIOGRAPHIC DATA SHEET..........................................................................................2 
2.0 INTRODUCTION......................................................................................................................3 
2.1 NEW FOREST LOCATION AND ACCESS.........................................................................................

In [18]:
# ! python -m spacy download en_core_web_sm
# ! pip install spacy

In [19]:
import re
import spacy

# Spacy
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# # nlp.Defaults.stop_words = 
# print(nlp.Defaults.stop_words)
# for word in nlp.Defaults.stop_words:
#     print(word)


# Perform Stopwords Removal, Lemmatisation and Tokenisation
def spacy_processing(x):
    doc = nlp(x)
    x = [word.lemma_ for word in doc] # if not word.is_stop]
    return x

# Function to preprocess the data
def preprocess_data(sentence_list):
    output_list = []
    for sentence in sentence_list:
        sentence = sentence.lower()                     # Case folding
        sentence = re.sub(r'[^\w\s]', ' ', sentence)    # Remove punctuation
        sentence = re.sub(r"\s+", " ", sentence)        # Remove extra spaces
        tokens = spacy_processing(sentence)             # Remove stopwords / Lemmatisation / Tokenise
        output_list.append(tokens)
    return output_list

In [20]:
pages = []

for i in range(len(reader.pages)):
    pages.append(reader.pages[i].extract_text())

output = preprocess_data(pages)
for i in range(len(output)):
    print(f"Page {i+1}:")
    print(output[i])

Page 1:
[' ', '32', 'king', 'park', 'rd', 'west', 'perth', '6005', 'midwest', 'corporation', 'limit', 'new', 'forest', 'project', 'annual', 'report', 'for', 'e70', '1004', '26', 'september', '2005', '25', 'september', '2006', 'author', 'david', 'broomfield', 'b', 'sc', 'hon', 'dip', 'ed', 'validate', 'david', 'broomfield', 'b', 'sc', 'hon', 'dip', 'ed', 'date', '20th', 'october', '2006']
Page 2:
['midwest', 'corporation', 'limit', 'annual', 'report', 'e09', '1004', 'september', '2006', 'table', 'of', 'content', 'page', 'executive', 'summary', '1', '1', '0', 'bibliographic', 'datum', 'sheet', '2', '2', '0', 'introduction', '3', '2', '1', 'new', 'forest', 'location', 'and', 'access', '3', '2', '2', 'new', 'forest', 'tenure', '5', '3', '0', 'new', 'forest', 'project', 'history', '8', '4', '0', 'geology', '9', '4', '1', 'regional', 'geology', 'of', 'the', 'melia', 'creek', 'tenement', '9', '4', '2', 'structure', 'and', 'metamorphism', '9', '4', '3', 'iron', 'mineralisation', '10', '4', '4'

In [21]:
print(pages[2].split()[177])

banded


In [22]:
import re
import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Perform Stopwords Removal, Lemmatisation and Tokenisation
def spacy_processing(x):
    doc = nlp(x)
    x = [word.lemma_ for word in doc] # if not word.is_stop]
    return x

# Function to preprocess the data
def preprocess_data(sentence_list):
    output_list = []
    for sentence in sentence_list:
        sentence = sentence.lower()                     # Case folding
        sentence = re.sub(r'[^\w\s]', ' ', sentence)    # Remove punctuation
        sentence = re.sub(r"\s+", " ", sentence)        # Remove extra spaces
        tokens = spacy_processing(sentence)             # Lemmatisation / Tokenise
        output_list.append(tokens)
    return output_list

def preprocess_word(word):
    word = word.lower()                     # Case folding
    word = re.sub(r'[^\w\s]', ' ', word)    # Remove punctuation
    word = re.sub(r"\s+", " ", word)        # Remove extra spaces
    return word

def preprocess_text(text):
    text = " ".join([token.text for token in nlp(text)])
    return text

In [23]:
# ! python -m spacy download en_core_web_sm

In [24]:
import spacy
import re

# Load the spaCy model
sp_sm = spacy.load('en_core_web_sm')

def spacy_large_ner(document):
    # Extract named entities using spaCy
    entities = {(ent.text.strip(), ent.label_) for ent in sp_sm(document).ents}
    
    # Extract geological timescales
    pattern = r'\b\d+(?:[\.,]\d+)?\s*(?:Ma|ka|Ga|MYA|KYA)\b'
    matches = re.finditer(pattern, document, re.IGNORECASE)
    geological_timescales = {(match.group().strip(), 'GEOLOGICAL_TIME') for match in matches}
    
    # Combine geological timescales with named entities
    all_entities = entities.union(geological_timescales)
    
    return all_entities

# Sample text from geological surveys
text = """
This report has been prepared as an investigation of the Mt Aubrey tenement, as part of Midwest’s New
Forest project in the Murchison Region of Western Australia. The report is presented as an Annual Report
to be submitted to the Department of Industry and Resources as part of the conditions of the granting of
E09/1004 and covers the period from the 26 September 2005 to the 25 September 2006. We also have dates like
50 Ma and 2000 ka which are geological timescales.
"""

# Extract entities and geological timescales
print(spacy_large_ner(text))



{('50 Ma', 'GEOLOGICAL_TIME'), ('50', 'CARDINAL'), ('the Department of Industry and Resources', 'ORG'), ('the 26 September 2005', 'DATE'), ('2000', 'DATE'), ('2000 ka', 'GEOLOGICAL_TIME'), ('the 25 September 2006', 'DATE'), ('New\nForest', 'ORG'), ('the Murchison Region of Western Australia', 'LOC'), ('Midwest', 'LOC')}


In [25]:
import spacy
import re
from dateutil import parser

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

def format_ner_output(text):
    # Tokenize the text using spaCy
    doc = nlp(text)
    
    # Define the geological timescales pattern
    timescale_pattern = r'\b\d+(?:[\.,]\d+)?\s*(?:Ma|ka|Ga|MYA|KYA)\b'
    
    # Extract geological timescales and their positions
    geological_timescales = re.finditer(timescale_pattern, text, re.IGNORECASE)
    
    # Create a dictionary to store geological timescale spans
    geo_timescale_spans = set()

    for match in geological_timescales:
        start, end = match.span()
        geo_timescale_spans.add((start, end))

    # print(geo_timescale_spans)
    
    # Prepare to format the output
    tokens_with_labels = []

    # Process tokens and assign labels
    for token in doc:
        token_text = token.text
        token_start = token.idx
        token_end = token_start + len(token_text)
        
        # Check if token is part of a geological timescale
        token_label = 'O'

        # Check if token is a recognized entity
        if token.ent_type_:
            token_label = token.ent_type_
        

        for start, end in geo_timescale_spans:
            if token_start >= start and token_end <= end:
                if (token_start == start):
                    token_label = f'B-GEO_TIME'
                else:
                    token_label = f'I-GEO_TIME'
                break
        
        tokens_with_labels.append(f"{token_text} {token_label}")
    
    return tokens_with_labels


# def extract_relationship_entities(doc):
#     relationships = []
    
#     for token in doc:
#         if token.dep_ in ('attr', 'dobj', 'nsubj'):
#             head = token.head
#             if head.dep_ in ('ROOT', 'attr', 'dobj'):
#                 relationship = {
#                     'entity': token.text,
#                     'relationship': head.text
#                 }
#                 relationships.append(relationship)
    
#     return relationships

# Sample text from geological surveys
text = "Mapping and geochronology by the Geological Society of Australia ( Arriens , 1971 ) reveal that the granitic rocks in the western part of the Yalgoo 1:250,000 map sheet are in the order of 2,800 to 3,000 ma ."

# Format the NER output
ner = format_ner_output(text)
print(ner)

# # Extract temporal information
# doc = nlp(text)

# # Extract relationship entities
# relationships = extract_relationship_entities(doc)
# print("\nRelationship Entities:")
# for rel in relationships:
#     print(f"Entity: {rel['entity']}, Relationship: {rel['relationship']}")

['Mapping O', 'and O', 'geochronology O', 'by O', 'the ORG', 'Geological ORG', 'Society ORG', 'of ORG', 'Australia ORG', '( O', 'Arriens GPE', ', O', '1971 DATE', ') O', 'reveal O', 'that O', 'the O', 'granitic O', 'rocks O', 'in O', 'the O', 'western O', 'part O', 'of O', 'the O', 'Yalgoo O', '1:250,000 O', 'map O', 'sheet O', 'are O', 'in O', 'the O', 'order O', 'of O', '2,800 CARDINAL', 'to CARDINAL', '3,000 B-GEO_TIME', 'ma I-GEO_TIME', '. O']


### Sort Data

In [26]:
filepath = Path(directory_path) / "wamex_xml"
content = {}
for root, dirs, files in os.walk(filepath):
    for file in files:
        try:
            with open(filepath / file, 'r') as f:
                content[file] = json.load(f)
        except:
            pass

In [27]:
for a in content:
    for b in content[a]:
        entities = spacy_large_ner(b)
        contains_geological_time = any(label == 'GEOLOGICAL_TIME' for _, label in entities)
        if contains_geological_time:
            print('"',b.strip(),'"')
            # print(entities)
        


" Mapping and geochronology by the Geological Society of Australia (Arriens, 1971) reveal that the granitic rocks in the western part of the Yalgoo 1:250,000 map sheet are in the order of 2,800 to 3,000 ma. "
" After the Mangaroon Orogeny had deformed and metamorphosed the Gascoyne Complex around 1620Ma, the Bangemall Basin comprising the Edmund and Collier sub-basins, developed between the Pilbara and Yilgarn Cratons. "
" It is younger than 1620Ma and older than the ca. "
" 1465Ma suite of dolerite sills that intrude it. "
" Most of the Collier Basin developed to the north and east, outside the Project area at about 1400 to 1070Ma. "
" A second system of dolerite sills intrudes both the Edmund and Collier Group rocks and is dated at 1078 to 1070Ma. "
" The 1070-755Ma Edmundian Orogeny was an intracratonic event that deformed the Bangemall Supergroup, reactivating basement structures as reverse faults. "
" Around 570Ma the Mulka Orogeny further deformed the rocks and saw intrusion of d

In [28]:
def clean(text):
    if text[-3:] == "Mt.":
        text = text[:-3] + "Mt ."
    else:
        text = text.replace("Mt.", "Mt")
    return text

all_tagged = {}

for a in content:
    all_tagged[a] = {}
    for sentence in content[a]:
        all_tagged[a][sentence] = {"preprocess" : preprocess_text(clean(sentence))}

In [29]:
# for a in all_tagged:
#     for b in all_tagged[a]:
#         print(f"Text: {b}")
#         print(all_tagged[a][b]["preprocess"])

In [30]:
from datetime import datetime
import re

def format_date_string(text):
    # Regular expression to match:
    # 1. Month Day , Year (with extra space before the comma)
    # 2. Month Day,Year (without space after the comma)
    pattern = re.compile(r"(\b\w+\s\d{1,2})\s?,\s?(\d{4})")
    # Replace the pattern with "Month Day, Year" with the correct spacing
    formatted_text = pattern.sub(r"\1, \2", text)
    return formatted_text

def find_date_pattern(text):
    # Regular expression for matching the pattern dd/mm/yyyy
    date_pattern = re.compile(r"\b(0?[1-9]|[12][0-9]|3[01])/(0?[1-9]|1[0-2])/(?:[0-9]{2}|[0-9]{4})\b")
    
    # Search for the pattern in the text
    match = date_pattern.search(text)
    
    if match:
        return match.group(0)
    return None

def ordinal(n):
    return "%d%s" % (n, "th" if 11 <= n <= 13 else {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th"))

def parse_date(found_date):
    # Determine if the year is two or four digits
    if len(found_date.split('/')[-1]) == 2:
        # Assume that two-digit years belong to the 2000s
        date_obj = datetime.strptime(found_date, "%d/%m/%y")
    else:
        date_obj = datetime.strptime(found_date, "%d/%m/%Y")
    
    return date_obj

# ________________________________________________________________________  

def find_month_year_pattern(text):
    # Regular expression for matching the pattern MM/YYYY (ensures month is two digits)
    month_year_pattern = re.compile(r"\b(0[1-9]|1[0-2])/\d{4}\b")
    
    # Search for the pattern in the text
    match = month_year_pattern.search(text)
    
    if match:
        return match.group(0)
    return None

def format_month_year(month_year):
    # Parse the date string
    date_obj = datetime.strptime(month_year, "%m/%Y")
    # Format as "Month Year"
    return date_obj.strftime("%B %Y")

# ________________________________________________________________________  
# Example usage
text = "This report was created on 14/12/2001 for the project."
found_date = find_date_pattern(text)

if found_date:
    print(f"Found date: {found_date}")
else:
    print("No date pattern found.")

# ________________________________________________________________________

for a in all_tagged:
    for text in all_tagged[a]:
        curr = all_tagged[a][text]["preprocess"]
        # curr = format_date_string(curr).replace(" , ", ", ")

        found_date = find_date_pattern(curr)
        while found_date:
            # print(found_date)
            # Format the date as "14th December 2001"
            date_obj = parse_date(found_date)
            formatted_date = f"{ordinal(date_obj.day)} {date_obj.strftime('%B')} {date_obj.year}"
            all_tagged[a][text]["preprocess"] = curr.replace(found_date, formatted_date)

            curr = all_tagged[a][text]["preprocess"]
            found_date = find_date_pattern(curr)
            
        found_month_year = find_month_year_pattern(curr)
        while found_month_year:
            # Format the month and year as "December 2001"
            formatted_month_year = format_month_year(found_month_year)
            all_tagged[a][text]["preprocess"] = curr.replace(found_month_year, formatted_month_year)

            curr = all_tagged[a][text]["preprocess"]
            found_month_year = find_month_year_pattern(curr)

Found date: 14/12/2001


In [31]:
# for a in all_tagged:
#     for b in all_tagged[a]:
#         print(f"{b}")
#         print(all_tagged[a][b]["preprocess"])
#         print()

### LOAD NER AND RUN

In [112]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

save_directory = "./models/first_saved_model"
# Load the tokenizer and model from the saved directory
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForSeq2SeqLM.from_pretrained(save_directory).to(device)

def tokenize_data(texts, tokenizer, max_length=256):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

def predict_entities(texts, model, tokenizer):
    inputs = tokenize_data(texts, tokenizer)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move inputs to GPU
    with torch.no_grad():  # Disable gradient calculation
        outputs = model.generate(**inputs, max_new_tokens=256)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# # Predict geological entities
# predictions = predict_entities(geo_texts, model, tokenizer)

# # Print the results
# print(predictions)
# for text, prediction in zip(geo_texts, predictions):
#     print(f"Text: {text}")
#     print(f"Prediction: {prediction}")
#     print()

In [140]:
def NER_tagging(model, tokenizer):
    for a in all_tagged:
        for b in all_tagged[a]:
            text = all_tagged[a][b]["preprocess"]
            all_tagged[a][b]["ner_tagged"] = predict_entities(text, model, tokenizer)

    return all_tagged

# Perform NER tagging
# ner_tagged = NER_tagging(model, tokenizer)


{'a092458_e09_1213_2011_a_12624596.json': {'The Mango Bore Project (E09/1213) is situated approximately 105 km east-northeast of Gascoyne Junction (Figure 1) in the Gascoyne Complex and covers a total area of roughly 99 km2.': {'preprocess': 'The Mango Bore Project ( E09/1213 ) is situated approximately 105 km east - northeast of Gascoyne Junction ( Figure 1 ) in the Gascoyne Complex and covers a total area of roughly 99 km2 .', 'tagged': ['O B-LOCATION I-LOCATION O O O O O O O O O O O O O B-LOCATION I-LOCATION O O O O O O B-STRAT I-STRAT O O O O O O O O O O']}, 'The project is accessed from the Dairy Creek to the Cobra Station road and a station track approximately 2 km to the south of Yinnietharra Homestead.': {'preprocess': 'The project is accessed from the Dairy Creek to the Cobra Station road and a station track approximately 2 km to the south of Yinnietharra Homestead .', 'tagged': ['O O O O O O B-LOCATION I-LOCATION O O O O O O O O O O O O O O O O B-LOCATION B-LOCATION O']}, 'Th

In [34]:
# save json file
# with open("Results/all_tags.json", "w") as file:
#     json.dump(all_tagged, file)

# load json file
with open("Results/all_tags.json", "r") as file:
    all_tagged = json.load(file)

In [35]:
count = 0
error = 0
for a in all_tagged:
    for b in all_tagged[a]:
        count += 1
        if len(all_tagged[a][b]["ner_tagged"][0].split(" ")) != len(all_tagged[a][b]["preprocess"].split(" ")):
            error += 1
print(f"Total: {count}, Error: {error}")
print(f"Error rate: {error/count}")

Total: 1945, Error: 659
Error rate: 0.33881748071979434


In [36]:
# Write to JSON file
# with open("Results/NER_tagged.json", "w") as json_file:
#     json.dump(ner_tagged, json_file)

# Load the NER tagged data
with open("Results/NER_tagged.json", 'r') as file:
    ner_tagged = json.load(file)

### LOAD Temporal NER AND RUN

In [37]:
# from datetime import datetime
# import re

# def format_date_string(text):
#     # Regular expression to match:
#     # 1. Month Day , Year (with extra space before the comma)
#     # 2. Month Day,Year (without space after the comma)
#     pattern = re.compile(r"(\b\w+\s\d{1,2})\s?,\s?(\d{4})")
#     # Replace the pattern with "Month Day, Year" with the correct spacing
#     formatted_text = pattern.sub(r"\1, \2", text)
#     return formatted_text

# def find_date_pattern(text):
#     # Regular expression for matching the pattern dd/mm/yyyy
#     date_pattern = re.compile(r"\b(0?[1-9]|[12][0-9]|3[01])/(0?[1-9]|1[0-2])/(?:[0-9]{2}|[0-9]{4})\b")
    
#     # Search for the pattern in the text
#     match = date_pattern.search(text)
    
#     if match:
#         return match.group(0)
#     return None

# def ordinal(n):
#     return "%d%s" % (n, "th" if 11 <= n <= 13 else {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th"))

# def parse_date(found_date):
#     # Determine if the year is two or four digits
#     if len(found_date.split('/')[-1]) == 2:
#         # Assume that two-digit years belong to the 2000s
#         date_obj = datetime.strptime(found_date, "%d/%m/%y")
#     else:
#         date_obj = datetime.strptime(found_date, "%d/%m/%Y")
    
#     return date_obj

# # ________________________________________________________________________  

# def find_month_year_pattern(text):
#     # Regular expression for matching the pattern MM/YYYY (ensures month is two digits)
#     month_year_pattern = re.compile(r"\b(0[1-9]|1[0-2])/\d{4}\b")
    
#     # Search for the pattern in the text
#     match = month_year_pattern.search(text)
    
#     if match:
#         return match.group(0)
#     return None

# def format_month_year(month_year):
#     # Parse the date string
#     date_obj = datetime.strptime(month_year, "%m/%Y")
#     # Format as "Month Year"
#     return date_obj.strftime("%B %Y")

# # ________________________________________________________________________  
# # Example usage
# text = "This report was created on 14/12/2001 for the project."
# found_date = find_date_pattern(text)

# if found_date:
#     print(f"Found date: {found_date}")
# else:
#     print("No date pattern found.")

# time_tagged = ner_tagged
# for a in time_tagged:

#     for text in time_tagged[a]:
#         curr = format_date_string(text).replace(" , ", ", ")
#         time_tagged[a][text] = curr
#         found_date = find_date_pattern(text)
#         while found_date:
#             # print(found_date)
#             # Format the date as "14th December 2001"
#             date_obj = parse_date(found_date)
#             formatted_date = f"{ordinal(date_obj.day)} {date_obj.strftime('%B')} {date_obj.year}"
#             # print(text)
#             time_tagged[a][text] = curr.replace(found_date, formatted_date)
#             # print(time_tagged[a][text])
#             found_date = find_date_pattern(time_tagged[a][text])
#             curr = time_tagged[a][text]
            
#         found_month_year = find_month_year_pattern(text)
#         while found_month_year:
#             # Format the month and year as "December 2001"
#             formatted_month_year = format_month_year(found_month_year)
#             time_tagged[a][text] = curr.replace(found_month_year, formatted_month_year)
#             found_month_year = find_month_year_pattern(time_tagged[a][text])
#             curr = time_tagged[a][text]

In [38]:
# for a in time_tagged:
#     print(a)
#     for b in time_tagged[a]:
#         print(b)
#         print(time_tagged[a][b])

In [39]:
from transformers import AutoTokenizer, BertForTokenClassification
import torch

from temporal_taggers.evaluation import merge_tokens, insert_tags_in_raw_text


import pdb

def do_nothing():
    pass

pdb.set_trace = do_nothing

def clean_timex_tags(text):
    # Regular expression to find nested TIMEX3 tags
    # Regular expression patterns to match and clean up spaces
    patterns = {
        r'<\s+TIMEX3': r'<TIMEX3',             # Clean up leading spaces before <TIMEX3
        r'</TIMEX3\s+>': r'</TIMEX3>',          # Clean up trailing spaces after </TIMEX3
        r'(\w+)="([^"]*?)\s+"': r'\1="\2"'      # Clean up spaces inside attributes (from previous example)
    }

    # Apply each pattern replacement
    for pattern, replacement in patterns.items():
        text = re.sub(pattern, replacement, text)
    
    nested_timex_pattern = re.compile(r'<TIMEX3[^>]*>(<TIMEX3[^>]*>[^<]+</TIMEX3>)</TIMEX3>')
        
    # Replace the nested TIMEX3 tags with a single TIMEX3 tag
    while nested_timex_pattern.search(text):
        text = nested_timex_pattern.sub(r'\1', text)
    
    return text

# time_tagged_full = time_tagged

time_model = BertForTokenClassification.from_pretrained("satyaalmasian/temporal_tagger_BERT_tokenclassifier").to(device)
time_tokenizer = AutoTokenizer.from_pretrained("satyaalmasian/temporal_tagger_BERT_tokenclassifier", use_fast=False)

id2label = {v: k for k, v in time_model.config.label2id.items()}
for a in all_tagged:
    annotation_id = 1
    for b in all_tagged[a]:
        try:
            # text = nlp(time_tagged_full[a][b])
            # text = " ".join([token.text for token in text])
            text = all_tagged[a][b]["preprocess"]
            processed_text = time_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

            with torch.no_grad():
                result = time_model(**processed_text)

            classification = torch.argmax(result[0], dim=2)

            # Merge the tokens
            merged_tokens = merge_tokens(processed_text["input_ids"][0], classification[0], id2label, time_tokenizer)
            annotated_text, annotation_id = insert_tags_in_raw_text(text, merged_tokens, annotation_id)
            annotated_text = clean_timex_tags(annotated_text)
            all_tagged[a][b]["time_tagged"] = annotated_text
            print(annotated_text)
        except Exception as e:
            print(f"An error occurred while processing the text: {b}")
            all_tagged[a][b]["time_tagged"] = all_tagged[a][b]["preprocess"]
            continue

Managed By : GME Resources Ltd Level 2 907 Canning Highway Mt .
Pleasant WA 6153 Distribution : Department of Industry and Resources - Perth GME Resources Limited NiWest <TIMEX3 tid="t1" type="DATE" value="">June 2007</TIMEX3> 1
The Eucalyptus Bore Project is located 85 km ENE of Kookynie and 45 km southeast of the Murrin Murrin mine site .
Access is via the Kookynie - Mt .
Remarkable Road to Yundamindra Station and thence by station tracks and along fence lines .
The former Mount Burgess Gold Mining NL haul road bisects the project area and can also be used to access most areas ( Figure 1 ) .
Several relatively steep N - S striking ridges of weathered ultramafic rocks occasionally capped by a yellowish brown silcrete and deep red ferruginous laterite and small incised watercourses has made access and drilling in certain areas difficult .
See figure 14 Digital Elevation Model ( DEM ) .
Vegetation within the project area is generally sparse with thicker growth in the creek systems .
The

In [40]:
# time_model = BertForTokenClassification.from_pretrained("satyaalmasian/temporal_tagger_BERT_tokenclassifier").to(device)
# time_tokenizer = AutoTokenizer.from_pretrained("satyaalmasian/temporal_tagger_BERT_tokenclassifier", use_fast=False)

# id2label = {v: k for k, v in time_model.config.label2id.items()}

# for a in ner_tagged:
#     annotation_id = 1
#     for b in ner_tagged[a]:
#         try:
#             print(f"{b}")
#             processed_text = time_tokenizer(b, return_tensors="pt", truncation=True, padding=True).to(device)
#             with torch.no_grad():
#                 result = time_model(**processed_text)

#             classification = torch.argmax(result[0], dim=2)
#             merged_tokens = merge_tokens(processed_text["input_ids"][0], classification[0], id2label, time_tokenizer)
#             annotated_text, annotation_id = insert_tags_in_raw_text(b, merged_tokens, annotation_id)
#             annotated_text = clean_timex_tags(annotated_text)
#             print(annotated_text)
#         except Exception as e:
#             print(f"An error occurred while processing the text: {b}")
#             continue

# ______________________________________________________________________________________________________________________



In [41]:
# # Testings

# from transformers import AutoTokenizer, BertForTokenClassification
# import torch

# from temporal_taggers.evaluation import merge_tokens, insert_tags_in_raw_text

# text = "Sample rate 0.1 Seconds ( 10Hz ) "



# time_model = BertForTokenClassification.from_pretrained("satyaalmasian/temporal_tagger_BERT_tokenclassifier")

# time_tokenizer = AutoTokenizer.from_pretrained("satyaalmasian/temporal_tagger_BERT_tokenclassifier", use_fast=False)

# id2label = {v: k for k, v in time_model.config.label2id.items()}
# annotation_id = 1
# print(text)
# processed_text = time_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
# try:
#     with torch.no_grad():
#         result = time_model(**processed_text)

#     classification = torch.argmax(result[0], dim=2)

#     # Merge the tokens
#     merged_tokens = merge_tokens(processed_text["input_ids"][0], classification[0], id2label, time_tokenizer)
#     annotated_text, annotation_id = insert_tags_in_raw_text(text, merged_tokens, annotation_id)
#     print(annotated_text)
# except Exception as e:
#     print(f"An error occurred while processing the text: {text}")
#     print(e)


    

In [42]:
# save json file
# with open("Results/all_tags.json", "w") as file:
#     json.dump(all_tagged, file)

# load json file
with open("Results/all_tags.json", "r") as file:
    all_tagged = json.load(file)

In [43]:
import re

def extract_timex_and_spans(text):
    # Regular expression to find all TIMEX3 tags and extract their content
    timex_pattern = re.compile(r'<TIMEX3[^>]*>([^<]+)</TIMEX3>')
    
    timex_values = []
    spans = []
    types = []
    cleaned_text = text
    match = timex_pattern.search(cleaned_text)
    
    # Find all TIMEX3 tags in the text
    # for match in timex_pattern.finditer(cleaned_text):
    while match:
        timex_value = match.group(1)
        timex_values.append(timex_value)

        type_pattern = re.compile(r'type="([^"]*)"')
        type = type_pattern.search(match.group(0)).group(1)
        types.append(type)
        
        # Calculate the start and end span in the original text
        start_span = match.span()[0]
        end_span = start_span + len(timex_value)
        spans.append((start_span, end_span))
        
        # Update the cleaned text by removing the TIMEX3 tag
        cleaned_text = cleaned_text.replace(match.group(0), timex_value)
        match = timex_pattern.search(cleaned_text)
        
    return timex_values, cleaned_text, spans, types

for a in all_tagged:
    for sentence in all_tagged[a]:
        # print(time_tagged_full[a][b])
        time_tagged_sentence = all_tagged[a][sentence]["time_tagged"]
        if "TIMEX3" in time_tagged_sentence:
            print(time_tagged_sentence)
            timex_values, cleaned_text, spans, types = extract_timex_and_spans(time_tagged_sentence)
            print(timex_values)
            print(cleaned_text)
            print(spans)
            print(types)
            print()


Pleasant WA 6153 Distribution : Department of Industry and Resources - Perth GME Resources Limited NiWest <TIMEX3 tid="t1" type="DATE" value="">June 2007</TIMEX3> 1
['June 2007']
Pleasant WA 6153 Distribution : Department of Industry and Resources - Perth GME Resources Limited NiWest June 2007 1
[(106, 115)]
['DATE']

NiWest <TIMEX3 tid="t2" type="DATE" value="">June 2007 26</TIMEX3> Soil sampling was undertaken on a one hundred by one hundred metre grid sampling the -80 mesh material 10 centimetres below the surface .
['June 2007 26']
NiWest June 2007 26 Soil sampling was undertaken on a one hundred by one hundred metre grid sampling the -80 mesh material 10 centimetres below the surface .
[(7, 19)]
['DATE']

NiWest has undertaken an Aboriginal Heritage Survey ( Machin and Glendenning <TIMEX3 tid="t3" type="DATE" value="">2002</TIMEX3> ) of the <TIMEX3 tid="t4" type="DATE" value="">former</TIMEX3> tenements that make up the GME Resources Eucalyptus Bore Project area .
['2002', 'former

In [44]:
text = 'The anomalous PGE elements, Cu with high Cr YALGOO IRON ORE PROJECT Annual Geological Report, <TIMEX3 tid="t53" type="DATE" value="">April 2014 26</TIMEX3> and Ni within magnetised ultramafic and gossan indicates a possibility of Contact Type CuNi - PGE / Reef type PGE mineralisation or VMS type Golden Grove style mineralisation .'

timex_values, cleaned_text, spans, types = extract_timex_and_spans(text)
print(timex_values)
print(cleaned_text)
print(spans)
print(types)

['April 2014 26']
The anomalous PGE elements, Cu with high Cr YALGOO IRON ORE PROJECT Annual Geological Report, April 2014 26 and Ni within magnetised ultramafic and gossan indicates a possibility of Contact Type CuNi - PGE / Reef type PGE mineralisation or VMS type Golden Grove style mineralisation .
[(94, 107)]
['DATE']


In [163]:
with open("Results/NER_tagged.json", 'r') as file:
    ner_tagged = json.load(file)

with open("Results/all_tags.json", "r") as file:
    all_tagged = json.load(file)

count = 0
error = 0
numOfTemporals = 0

def fix_size():
    pass


for a in all_tagged:
    for b in all_tagged[a]:
        # print(len(b.split(" ")))
        # print(len(ner_tagged[a][b][0].split(" ")))

        # Check if the number of tokens match
        if len(all_tagged[a][b]["preprocess"].split(" ")) != len(all_tagged[a][b]["ner_tagged"][0].split(" ")):
            tags = all_tagged[a][b]["ner_tagged"][0].split(" ")
            size = len(all_tagged[a][b]["preprocess"].split(" "))
            tag_size = len(tags)
            if size > tag_size:
                tags.extend(["O"] * (size - tag_size))
            else:
                tags = tags[:size]

            all_tagged[a][b]["ner_tagged"] = [" ".join(tags)]

            # all_tagged[a][b]["combine_tags"] = ["error"]
            # error += 1
        # else:
        entities = spacy_large_ner(b)

        tag_labels = all_tagged[a][b]["ner_tagged"][0].split(" ")

        contains_geological_time = any(label == 'GEOLOGICAL_TIME' for _, label in entities)
        time_tagged_sentence = all_tagged[a][b]["time_tagged"]
        contains_real_time = "TIMEX3" in time_tagged_sentence
        if contains_geological_time or contains_real_time:
            numOfTemporals += 1

        #  Add the geological time tags
        if contains_geological_time:
            temporalLabels = format_ner_output(b)
            # print(b)
            # print(temporalLabels)
            
            for i, word in enumerate(temporalLabels):
                label = word.split(" ")[1]
                if "GEO_TIME" in label or "DATE" in label:
                    tag_labels[i] = label

        #  Add the real time tags
        if contains_real_time:
            timex_values, cleaned_text, spans, types = extract_timex_and_spans(time_tagged_sentence)
            doc = nlp(cleaned_text.strip())
            numOfTemporals = len(timex_values)
            temporalNum = 0
            span = spans[temporalNum]
            # print(doc)
            # print("____________________")
            # print(len(doc))
            # print(len(tag_labels))
            for i, word in enumerate(doc):

                idx = word.idx
                endIdx = idx + len(word.text)
                if idx >= span[0] and endIdx <= span[1]:
                    # print(span)
                    # print(word.text)
                    # print(idx)
                    # print(endIdx)
                    if idx == span[0]:
                        tag_labels[i] = "B-DATE"
                    else:
                        tag_labels[i] = "I-DATE"
                if idx > span[1]:
                    temporalNum += 1
                    if temporalNum < len(spans):
                        span = spans[temporalNum]

        all_tagged[a][b]["combine_tags"] = [" ".join(tag_labels)]
        # print(ner_tagged[a][b])
        count += 1
        
print(f"Total number of sentences: {count}")
print(f"Error rate: {error/count}")
print(f"Number of Temporals: {numOfTemporals/count}")


Total number of sentences: 1945
Error rate: 0.0
Number of Temporals: 0.002056555269922879


In [168]:
# save json file
# with open("Results/testing_tags.json", "w") as file:
#     json.dump(all_tagged, file)

# # load json file
# with open("Results/testing_tags.json", "r") as file:
#     all_tagged = json.load(file)

# # load json file
with open("Results/all_tags.json", "r") as file:
    all_tagged = json.load(file)

In [167]:
for a in all_tagged:
    for b in all_tagged[a]:
        print(f"Text: {b}")
        print(f"Preprocess: {all_tagged[a][b]['preprocess']}")
        print(f"NER Tagged: {all_tagged[a][b]['ner_tagged']}")
        print(f"Time Tagged: {all_tagged[a][b]['time_tagged']}")
        print(f"Combine Tags: {all_tagged[a][b]['combine_tags']}")
        print()

Text: Managed By: GME Resources Ltd Level 2 907 Canning Highway Mt.
Preprocess: Managed By : GME Resources Ltd Level 2 907 Canning Highway Mt .
NER Tagged: ['O O O O O O O O O B-LOCATION I-LOCATION I-LOCATION O']
Time Tagged: Managed By : GME Resources Ltd Level 2 907 Canning Highway Mt .
Combine Tags: ['O O O O O O O O O B-LOCATION I-LOCATION I-LOCATION O']

Text: Pleasant WA 6153 Distribution: Department of Industry and Resources - Perth GME Resources Limited NiWest June 2007 1
Preprocess: Pleasant WA 6153 Distribution : Department of Industry and Resources - Perth GME Resources Limited NiWest June 2007 1
NER Tagged: ['B-LOCATION I-LOCATION O O O O O O O O O B-LOCATION O O O O O O O']
Time Tagged: Pleasant WA 6153 Distribution : Department of Industry and Resources - Perth GME Resources Limited NiWest <TIMEX3 tid="t1" type="DATE" value="">June 2007</TIMEX3> 1
Combine Tags: ['B-LOCATION I-LOCATION O O O O O O O O O B-LOCATION O O O O B-DATE I-DATE O']

Text: The Eucalyptus Bore Projec

In [48]:
text = "Copies to : O Aurora Resources Pty Ltd , Perth O Department of Mines and Petroleum , WA Page | 2 BIBLIOGRAPHIC DATA MANAGER : Aurora Minerals Ltd REPORT TITLE : Partial Surrender Report : Combined Reporting Number C63/2010 Capricorn Southeast Project , Bangemall Basin , Western Australia PROJECT NAME : Capricorn Southeast Project TENEMENT NUMBERS : E52/2137 ; E52/2139 ; ESeptember 1605 ; and ESeptember 1427 ."
print(len(text.split(" ")))

68


In [49]:
# # # Write to JSON file
# with open("Results/Temporal_NER_tagged.json", "w") as json_file:
#     json.dump(ner_tagged, json_file)

# Load the NER tagged data
with open("Results/Temporal_NER_tagged.json", 'r') as file:
    temporal_ner_tagged = json.load(file)

In [50]:
# for a in temporal_ner_tagged:
#     for b in temporal_ner_tagged[a]:
#         doc = nlp(b.strip())
#         entity = temporal_ner_tagged[a][b][0].split(" ")
#         if len(doc) == len(entity):
#             for i in range(len(doc)):
#                 print(f"{doc[i].text} {entity[i]}")

In [51]:
# load json file
with open("Results/all_tags.json", "r") as file:
    all_tagged = json.load(file)

for a in all_tagged:
    for b in all_tagged[a]:
        doc = nlp(all_tagged[a][b]["preprocess"])
        entity = all_tagged[a][b]["combine_tags"][0].split(" ")
        if len(doc) == len(entity):
            for i in range(len(doc)):
                print(f"{doc[i].text} {entity[i]}")

The O
Eucalyptus B-LOCATION
Bore I-LOCATION
Project O
is O
located O
85 O
km O
ENE O
of O
Kookynie B-LOCATION
and O
45 O
km O
southeast O
of O
the O
Murrin B-LOCATION
Murrin I-LOCATION
mine I-LOCATION
site O
. O
Access O
is O
via O
the O
Kookynie B-LOCATION
- O
Mt B-LOCATION
. O
Remarkable O
Road O
to O
Yundamindra B-LOCATION
Station O
and O
thence O
by O
station O
tracks O
and O
along O
fence O
lines O
. O
Several O
relatively O
steep O
N O
- O
S O
striking O
ridges O
of O
weathered O
ultramafic B-ROCK
rocks I-ROCK
occasionally O
capped O
by O
a O
yellowish O
brown O
silcrete O
and B-ROCK
deep O
red O
ferruginous O
laterite O
and B-ROCK
small O
incised O
watercourses O
has O
made O
access O
and O
drilling O
in O
certain O
areas O
difficult O
. O
Vegetation O
within O
the O
project O
area O
is O
generally O
sparse O
with O
thicker O
growth O
in O
the O
creek O
systems O
. O
The O
vegetation O
is O
dominated O
by O
mulga O
and O
to O
a O
lesser O
extent O
by O
mallee O
, O
salt B-MINERA

Size of variables
- Content Shape - {filename: [sentence,...]}
- NER_tagging shape - {filename: {sentence: [tagged_sentence,...]}}

In [52]:
# Write to JSON file
# with open("Results/NER_tagged.json", "w") as json_file:
#     json.dump(ner_tagged, json_file)

# Load the NER tagged data
with open("Results/NER_tagged.json", 'r') as file:
    ner_tagged = json.load(file)

### RE

In [53]:
# https://huggingface.co/models?other=relation-extraction

### REBEL Model

In [54]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# https://www.superteams.ai/blog/knowledge-graph-from-unstructured-text
# https://github.com/gswycf/Joint-Extraction-of-Entities-and-Relations-Based-on-a-Novel-Tagging-Scheme
# https://academic.oup.com/jamia/article/20/5/828/727128?login=false
# https://dl.acm.org/doi/fullHtml/10.1145/3462475
# https://huggingface.co/Babelscape/mrebel-large

def extract_triplets_typed(text):
    triplets = []
    relation = ''
    text = text.strip()
    current = 'x'
    subject, relation, object_, object_type, subject_type = '','','','',''

    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").replace("tp_XX", "").replace("__en__", "").split():
        if token == "<triplet>" or token == "<relation>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'head_type': subject_type, 'type': relation.strip(),'tail': object_.strip(), 'tail_type': object_type})
                relation = ''
            subject = ''
        elif token.startswith("<") and token.endswith(">"):
            if current == 't' or current == 'o':
                current = 's'
                if relation != '':
                    triplets.append({'head': subject.strip(), 'head_type': subject_type, 'type': relation.strip(),'tail': object_.strip(), 'tail_type': object_type})
                object_ = ''
                subject_type = token[1:-1]
            else:
                current = 'o'
                object_type = token[1:-1]
                relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '' and object_type != '' and subject_type != '':
        triplets.append({'head': subject.strip(), 'head_type': subject_type, 'type': relation.strip(),'tail': object_.strip(), 'tail_type': object_type})
    return triplets

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Babelscape/mrebel-large", src_lang="en_XX", tgt_lang="tp_XX")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/mrebel-large").to(device)
gen_kwargs = {
    "max_length": 256,
    "length_penalty": 0,
    "num_beams": 2,
    "num_return_sequences": 2,
    "forced_bos_token_id": None,
}

# Text to extract triplets from
text = """
This report has been prepared as an investigation of the Mt Aubrey tenement, as part of Midwest’s New Forest project in the Murchison Region of Western Australia. The report is presented as an Annual Report to be submitted to the Department of Industry and Resources as part of the conditions of the granting of E09/1004 and covers the period from the 26 September 2005 to the 25 September 2006.
"""
# Tokenizer text
model_inputs = tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors = 'pt')

# Generate
generated_tokens = model.generate(
    model_inputs["input_ids"].to(model.device),
    attention_mask=model_inputs["attention_mask"].to(model.device),
    decoder_start_token_id = tokenizer.convert_tokens_to_ids("tp_XX"),
    **gen_kwargs,
)

# Extract text
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

# Extract triplets
for idx, sentence in enumerate(decoded_preds):
    print(f'Prediction triplets sentence {idx}')
    print(extract_triplets_typed(sentence))


Prediction triplets sentence 0
[{'head': 'Mt Aubrey', 'head_type': 'loc', 'type': 'located in the administrative territorial entity', 'tail': 'Western Australia', 'tail_type': 'loc'}, {'head': 'New Forest', 'head_type': 'loc', 'type': 'located in the administrative territorial entity', 'tail': 'Western Australia', 'tail_type': 'loc'}, {'head': 'Murchison Region', 'head_type': 'loc', 'type': 'located in the administrative territorial entity', 'tail': 'Western Australia', 'tail_type': 'loc'}]
Prediction triplets sentence 1
[{'head': 'Mt Aubrey', 'head_type': 'loc', 'type': 'located in the administrative territorial entity', 'tail': 'Western Australia', 'tail_type': 'loc'}, {'head': 'Murchison Region', 'head_type': 'loc', 'type': 'located in the administrative territorial entity', 'tail': 'Western Australia', 'tail_type': 'loc'}]


In [55]:
def extract_relationships(content, model, tokenizer):
    relation_extraction = {}
    for c in content:
        relation_extraction[c] = {}
        for content_text in content[c]:
            model_inputs = tokenizer(content_text, max_length=256, padding=True, truncation=True, return_tensors = 'pt')
            generated_tokens = model.generate(
                model_inputs["input_ids"].to(model.device),
                attention_mask=model_inputs["attention_mask"].to(model.device),
                decoder_start_token_id = tokenizer.convert_tokens_to_ids("tp_XX"),
                **gen_kwargs,
            )
            decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

            relation_extraction[c][content_text] = []
            for extraction in decoded_preds:
                relation_extraction[c][content_text].append(extract_triplets_typed(extraction))
    return relation_extraction

# Perform relationship extraction using REBEL
# relation_extraction = extract_relationships(all_tagged, model, tokenizer)

In [56]:
# # Write to JSON file
# with open("Results/REBEL_results.json", "w") as json_file:
#     json.dump(relation_extraction, json_file)

# Load the NER tagged data
with open("Results/REBEL_results.json", 'r') as file:
    relation_extraction = json.load(file)

In [57]:
for c in relation_extraction:
    print(c)
    for r in relation_extraction[c]:
        print("sentence:" , r)  
        for content_text in relation_extraction[c][r]:
            print(content_text)

a075237_ep_a_all_2007_10582280.json
sentence: Managed By: GME Resources Ltd Level 2 907 Canning Highway Mt.
[{'head': 'Canning Highway Mt.', 'head_type': 'loc', 'type': 'length', 'tail': '2 907', 'tail_type': 'num'}]
[{'head': 'Canning Highway Mt.', 'head_type': 'loc', 'type': 'maintained by', 'tail': 'GME Resources Ltd', 'tail_type': 'org'}]
sentence: Pleasant WA 6153 Distribution: Department of Industry and Resources - Perth GME Resources Limited NiWest June 2007 1
[{'head': 'NiWest', 'head_type': 'org', 'type': 'headquarters location', 'tail': 'Perth', 'tail_type': 'loc'}]
[{'head': 'Pleasant WA 6153', 'head_type': 'misc', 'type': 'inception', 'tail': 'June 2007', 'tail_type': 'date'}]
sentence: The Eucalyptus Bore Project is located 85km ENE of Kookynie and 45km southeast of the Murrin Murrin mine site.
[{'head': 'Eucalyptus Bore Project', 'head_type': 'loc', 'type': 'located in the administrative territorial entity', 'tail': 'Kookynie', 'tail_type': 'loc'}]
[{'head': 'Eucalyptus B

In [58]:
from transformers import pipeline


# Function to parse the generated text and extract the triplets
triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large', device=device)

def extract_triplets(text_raw, triplet_extractor=triplet_extractor):
    text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(text_raw, return_tensors=True, return_text=False)[0]["generated_token_ids"]])
    print(text)
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text[0].strip()
    current = 'x'
    for token in text.replace("", "").replace("", "").replace("", "").split():
        if token == "":
            current = 't'
            if relation != '':
                triplets.append((subject.strip(), relation.strip(), object_.strip()))
                relation = ''
            subject = ''
        elif token == "":
            current = 's'
            if relation != '':
                triplets.append((subject.strip(), relation.strip(), object_.strip()))
            object_ = ''
        elif token == "":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append((subject.strip(), relation.strip(), object_.strip()))
    return set(triplets)




In [59]:
# for a in all_tagged:
#     for b in all_tagged[a]:
#         print(b)
#         text_raw = b
#         text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(text_raw, return_tensors=True, return_text=False)[0]["generated_token_ids"]])
#         print(text)

### RE with LLM

In [60]:
from dotenv import load_dotenv
import os

load_dotenv()

api_key = os.getenv("API_KEY")

In [61]:
from openai import OpenAI

# models:
#   - gpt-3.5-turbo
#   - gpt-4
#   - gpt-4-turbo
#   - gpt-4o-mini
def query(prompt, model='gpt-4o-mini', temperature=0):
    client = OpenAI(api_key=api_key)

    res = client.chat.completions.create(
            model=model,
            temperature=temperature,
            messages=[
            {"role": "user", "content": prompt}
            ])

    # print(prompt)
    print('output is: ', res)

    return res


In [62]:
from openai import OpenAI

def query(prompt, model='gpt-4o-mini', temperature=0):
    client = OpenAI(api_key=api_key)

    res = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    # print(prompt)
    print('output is: ', res)

    return res

def extract_geotime_relations(text, combined_tags):
    # Format the prompt to include the text and tags
    prompt = f"""Extract the geological time periods or date entities and relate them to other entities in the following text:
    
    Text: {text}
    
    Tags: {combined_tags}
    
    Please provide the output in the following format, based on the context of the text:
    
    Entity: [Entity Name] [Entity Type]
    Relation: [Relation]
    Geo_Time/Date Entity: [Extracted Geo_Time or Date Entity] [Entity Type]

    Some examples are provided below:
    Example 1:
    Entity: [Mount Morrissey Metamorphics] [STRAT]
    Relation: [underlies]
    Geo_Time/Date Entity: [~1,700 Ma] [GEO_TIME]

    Example 2:
    Entity: [NiWest] [Organization]
    Relation: [conducted]
    Geo_Time/Date Entity: [June 2007] [DATE]
    
    Ensure that the relations reflect the actual context of the entities in the text.
    """

    # Query OpenAI with the formatted prompt
    response = query(prompt).choices[0].message.content.strip()

    return response

# Example usage
text = "The latter comprises primarily low - grades ~1,700 Ma meta - conglomerates and coarse metasandstones , overlying the Mount Morrissey Metamorphics ."
combined_tags = ["O O O O O O O B-GEO_TIME I-GEO_TIME O O B-ROCK O O B-ROCK O O O B-STRAT I-STRAT I-STRAT O"]

# result = extract_geotime_relations(text, combined_tags)
# print(result)

In [63]:
# print(result)

In [64]:
import re

def clean_relations(entries):
    cleaned_entries = []

     # Define the geological timescales pattern
    timescale_pattern = re.compile(r'\b\d+(?:[\.,]\d+)?\s*(?:Ma|ka|Ga|MYA|KYA)\b')

    for entry in entries:
        tail_type = entry.get('tail', {}).get('Type', '')
        tail_name = entry.get('tail', {}).get('Name', '')

         # Extract geological timescales and their positions
        geological_timescales = timescale_pattern.search(tail_name)
       
        # Check if Geo_Time/Date Entity is valid
        if  tail_type =="DATE" or (tail_type == "GEO_TIME" and geological_timescales):
            cleaned_entries.append(entry)

    return cleaned_entries


def extract_temporal_relations(text):
    structured_data = []

    # Patterns for entity, relation, and geo_time/date entity
    # Updated to handle cases with or without brackets
    entity_pattern = re.compile(r"Entity:\s*(?:\[(.*?)\]\s*\[(.*?)\]|\s*(\w+)\s*\[(.*?)\])")
    relation_pattern = re.compile(r"Relation:\s*(?:\[(.*?)\]|\s*(\w+))")
    geo_time_pattern = re.compile(r"Geo_Time/Date Entity:\s*(?:\[(.*?)\]\s*\[(.*?)\]|\s*(\w+)\s*\[(.*?)\])")

    while True:
        # Find the first occurrence of each pattern
        entity_match = entity_pattern.search(text)
        relation_match = relation_pattern.search(text)
        geo_time_match = geo_time_pattern.search(text)

        # Break the loop if no matches are found
        if not (entity_match and relation_match and geo_time_match):
            break

        # Extract the values from the matches
        entity_name = entity_match.group(1) or entity_match.group(3)
        entity_type = entity_match.group(2) or entity_match.group(4)
        relation = relation_match.group(1) or relation_match.group(2)
        geo_time_name = geo_time_match.group(1) or geo_time_match.group(3)
        geo_time_type = geo_time_match.group(2) or geo_time_match.group(4)

        # Create a dictionary with the extracted values
        entry_dict = {
            "head": {"Name": entity_name, "Type": entity_type},
            "relation": relation,
            "tail": {"Name": geo_time_name, "Type": geo_time_type}
        }

        # Add the dictionary to the list
        structured_data.append(entry_dict)

        # Remove the first set of matched lines (Entity, Relation, Geo_Time/Date Entity) from the text
        text = text.replace(entity_match.group(0), "", 1)
        text = text.replace(relation_match.group(0), "", 1)
        text = text.replace(geo_time_match.group(0), "", 1)

        # Clean up any leading/trailing whitespace
        text = text.strip()
    structured_data = clean_relations(structured_data)

    return structured_data


# Example usage
text = """
Entity: [meta-conglomerates] [ROCK]  
Relation: [overlies]  
Geo_Time/Date Entity: [~1,700 Ma] [GEO_TIME]  

Entity: [coarse metasandstones] [ROCK]  
Relation: [overlies]  
Geo_Time/Date Entity: [~1,700 Ma] [GEO_TIME]  

Entity: [Mount Morrissey Metamorphics] [STRAT]  
Relation: [underlies]  
Geo_Time/Date Entity: [~1,700 Ma] [GEO_TIME]
"""

# Extract and remove entries
structured_data = extract_temporal_relations(text)

# Print the result
for entry in structured_data:
    print(entry)


print()

# Example usage
data = [
{"head": {"Name": "minimal clay", "Type": "ROCK"}, "relation": "is present in", "tail": {"Name": "N/A", "Type": "N/A"}},
{"head": {"Name": "uranium content", "Type": "ORE_DEPOSIT"}, "relation": "increases", "tail": {"Name": "0 - 60 cm, 60 - 100 cm, 1.5 m", "Type": "GEO_TIME"}},
{"head": {"Name": "granite", "Type": "ROCK"}, "relation": "formed during", "tail": {"Name": "Permian", "Type": "GEO_TIME"}},
{'head': {'Name': 'Geological Society of Australia', 'Type': 'Organization'}, 'relation': 'conducted', 'tail': {'Name': '2,800 to ~3,000 Ma', 'Type': 'GEO_TIME'}}
]

# Clean the data
cleaned_data = clean_relations(data)

# Print the result
for entry in cleaned_data:
    print(entry)


{'head': {'Name': 'meta-conglomerates', 'Type': 'ROCK'}, 'relation': 'overlies', 'tail': {'Name': '~1,700 Ma', 'Type': 'GEO_TIME'}}
{'head': {'Name': 'coarse metasandstones', 'Type': 'ROCK'}, 'relation': 'overlies', 'tail': {'Name': '~1,700 Ma', 'Type': 'GEO_TIME'}}
{'head': {'Name': 'Mount Morrissey Metamorphics', 'Type': 'STRAT'}, 'relation': 'underlies', 'tail': {'Name': '~1,700 Ma', 'Type': 'GEO_TIME'}}

{'head': {'Name': 'Geological Society of Australia', 'Type': 'Organization'}, 'relation': 'conducted', 'tail': {'Name': '2,800 to ~3,000 Ma', 'Type': 'GEO_TIME'}}


Extract geo-Temporal relations

In [65]:
def query_geotime_relations():
    temporal_relation = {}
    for a in all_tagged:
        temporal_relation[a] = {}
        for b in all_tagged[a]:
            tags = all_tagged[a][b]["combine_tags"][0]
            if "GEO_TIME" in tags or "DATE" in tags:
                text = all_tagged[a][b]["preprocess"]
                result = extract_geotime_relations(text, tags)
                temporal_relation[a][b] = result
            else:
                temporal_relation[a][b] = None

    return temporal_relation

# Perform relationship extraction
# temporal_relation = query_geotime_relations()

In [66]:
# Write to JSON file
# with open("Results/temporal_RE.json", "w") as json_file:
#     json.dump(temporal_relation, json_file)

# Load the RE
with open("Results/temporal_RE.json", 'r') as file:
    temporal_relation = json.load(file)

In [67]:
for a in temporal_relation:
    for b in temporal_relation[a]:
        if temporal_relation[a][b]:
            # print("___________________")
            # print(b)
            # print(all_tagged[a][b]["combine_tags"])
            # print(temporal_relation[a][b])
            structured_data = extract_temporal_relations(temporal_relation[a][b])
            print(structured_data)

            # for entry in temporal_relation[a][b]:
        #     #     print(entry)
        # else:
        #     print("No temporal relations found.")
        # print()

[{'head': {'Name': 'NiWest', 'Type': 'Organization'}, 'relation': 'conducted', 'tail': {'Name': 'June 2007', 'Type': 'DATE'}}, {'head': {'Name': 'Soil sampling', 'Type': 'Activity'}, 'relation': 'was undertaken', 'tail': {'Name': 'June 2007', 'Type': 'DATE'}}]
[{'head': {'Name': 'NiWest', 'Type': 'Organization'}, 'relation': 'undertook', 'tail': {'Name': '2002', 'Type': 'DATE'}}, {'head': {'Name': 'GME Resources Eucalyptus Bore Project area', 'Type': 'LOCATION'}, 'relation': 'is located in', 'tail': {'Name': '2002', 'Type': 'DATE'}}]
[]
[{'head': {'Name': 'Detail infill and replacement drilling', 'Type': 'Activity'}, 'relation': 'undertaken in', 'tail': {'Name': 'April 2007', 'Type': 'DATE'}}]
[{'head': {'Name': 'NiWest', 'Type': 'Organization'}, 'relation': 'recorded', 'tail': {'Name': 'June 2007', 'Type': 'DATE'}}]
[{'head': {'Name': 'GME', 'Type': 'Organization'}, 'relation': 'received information from', 'tail': {'Name': '2004', 'Type': 'DATE'}}, {'head': {'Name': 'CSA', 'Type': 'Or

### Construct Triples

In [68]:
from collections import defaultdict

# Initialize the knowledge graph with sets to avoid duplicates
knowledge_graph = defaultdict(lambda: defaultdict(set))

# Function to check if combined tags contain any entity types
def has_entity_types(tags):
    return any(tag != 'O' for tag in tags.split())

# Function to map each word in preprocess text to its tag
def map_tags_to_text(preprocess_text, combined_tags):
    return dict(zip(preprocess_text.split(), combined_tags.split()))

# Iterate over the relation_extraction dictionary
for doc, sentences in relation_extraction.items():
    for sentence, relations in sentences.items():
        # Find the corresponding entry in all_tagged
        if doc in all_tagged:
            for sent_text in all_tagged[doc]:
                combined_tags = all_tagged[doc][sent_text]['combine_tags'][0]
                preprocess_text = all_tagged[doc][sent_text]['preprocess']
                
                # Check if combined_tags contain entity types
                if combined_tags != 'error' and has_entity_types(combined_tags):
                    # Map words in preprocess text to their corresponding tags
                    word_tag_map = map_tags_to_text(preprocess_text, combined_tags)
                    
                    # Iterate over each relation in the list
                    for relation in relations:
                        if relation:
                            relation = relation[0]
                            head = relation['head']
                            tail = relation['tail']
                            rel_type = relation['type']

                            head_tag = []
                            tail_tag = []
                            for word in head.split():
                                head_tag.append(word_tag_map.get(word, 'O'))
                            for word in tail.split():
                                tail_tag.append(word_tag_map.get(word, 'O'))

                            head_count = head_tag.count('O')
                            tail_count = tail_tag.count('O')

                            if not any('DATE' in tag for tag in head_tag):

                                if head_count < len(head.split()): # and tail_count < len(tail.split()):
                                    knowledge_graph[head][rel_type].add(tail)

# Process the temporal_relation dictionary
for a in temporal_relation:
    for b in temporal_relation[a]:
        if temporal_relation[a][b]:
        
            combined_tags = all_tagged[a][b]['combine_tags'][0]
            preprocess_text = all_tagged[a][b]['preprocess']

            # Check if combined_tags contain entity types
            if combined_tags != 'error' and has_entity_types(combined_tags):
                # Map words in preprocess text to their corresponding tags
                word_tag_map = map_tags_to_text(preprocess_text, combined_tags)

                # print(temporal_relation[a][b])
                structured_data = extract_temporal_relations(temporal_relation[a][b])
                for data in structured_data:
                    head = data['head']["Name"]
                    tail = data['tail']["Name"]
                    rel_type = data['relation']

                    # # Check if head and tail are entities with specific tags
                    # head_tag = word_tag_map.get(head, 'O')
                    # tail_tag = word_tag_map.get(tail, 'O')

                    head_tag = []
                    tail_tag = []
                    for word in head.split():
                        head_tag.append(word_tag_map.get(word, 'O'))
                    for word in tail.split():
                        tail_tag.append(word_tag_map.get(word, 'O'))

                    head_count = head_tag.count('O')
                    tail_count = tail_tag.count('O')
                    if not any('DATE' in tag for tag in head_tag):
                        # Add the temporal relation to the knowledge graph
                        if head_count < len(head.split()): # and tail_count < len(tail.split()):
                            knowledge_graph[head][rel_type].add(tail)

# Output the knowledge graph with duplicates removed
for entity, relations in knowledge_graph.items():
    print(f"Entity: {entity}") 
    for rel_type, tails in relations.items():
        for tail in tails:
            print(f"  - {rel_type} -> {tail}")
    print()


Entity: NiWest
  - headquarters location -> Perth
  - field of work -> Earlier exploration
  - inception -> June 2007
  - instance of -> brown
  - instance of -> public company
  - industry -> gold working
  - owned by -> Western Metals
  - owned by -> GME Resources Ltd
  - published -> 2003
  - published -> June 2007

Entity: Eucalyptus Bore Project
  - located in the administrative territorial entity -> Acacia
  - located in the administrative territorial entity -> Acacia Option area
  - located in the administrative territorial entity -> Kookynie
  - location -> Mt.
  - location -> Murrin Murrin mine
  - inception -> 2002
  - sponsor -> GME Resources
  - sponsor -> Department of Indigenous Affairs
  - instance of -> Aboriginal Heritage Site
  - located on terrain feature -> Mt.
  - is located within -> 2007

Entity: Kookynie - Mt.
  - named after -> Mt.
  - located on terrain feature -> Mt.

Entity: Yundamindra Station
  - instance of -> station
  - instance of -> station tracks

En

In [69]:
for doc, sentences in relation_extraction.items():
    print(doc)
    print(sentences)
    for a, b in sentences.items():
        print(a)
        print(b)
        print()
        break
    break

a075237_ep_a_all_2007_10582280.json
{'Managed By: GME Resources Ltd Level 2 907 Canning Highway Mt.': [[{'head': 'Canning Highway Mt.', 'head_type': 'loc', 'type': 'length', 'tail': '2 907', 'tail_type': 'num'}], [{'head': 'Canning Highway Mt.', 'head_type': 'loc', 'type': 'maintained by', 'tail': 'GME Resources Ltd', 'tail_type': 'org'}]], 'Pleasant WA 6153 Distribution: Department of Industry and Resources - Perth GME Resources Limited NiWest June 2007 1': [[{'head': 'NiWest', 'head_type': 'org', 'type': 'headquarters location', 'tail': 'Perth', 'tail_type': 'loc'}], [{'head': 'Pleasant WA 6153', 'head_type': 'misc', 'type': 'inception', 'tail': 'June 2007', 'tail_type': 'date'}]], 'The Eucalyptus Bore Project is located 85km ENE of Kookynie and 45km southeast of the Murrin Murrin mine site.': [[{'head': 'Eucalyptus Bore Project', 'head_type': 'loc', 'type': 'located in the administrative territorial entity', 'tail': 'Kookynie', 'tail_type': 'loc'}], [{'head': 'Eucalyptus Bore Projec

In [70]:
# from rdflib import Graph, URIRef, Literal, Namespace
# from rdflib.namespace import RDF, RDFS

# # Define namespaces
# EX = Namespace("http://example.org/")
# GEO = Namespace("http://example.org/geo/")
# REL = Namespace("http://example.org/rel/")

# # Initialize a graph
# g = Graph()

# # Add namespace bindings
# g.bind("ex", EX)
# g.bind("geo", GEO)
# g.bind("rel", REL)

# # Populate the graph with triples from the knowledge graph
# for entity, relations in knowledge_graph.items():
#     entity_uri = URIRef(EX[entity.replace(" ", "_")])
#     g.add((entity_uri, RDF.type, GEO.Entity))
    
#     for rel_type, tails in relations.items():
#         rel_uri = URIRef(REL[rel_type.replace(" ", "_")])
        
#         for tail in tails:
#             tail_uri = URIRef(EX[tail.replace(" ", "_")])
#             g.add((entity_uri, rel_uri, tail_uri))

# # Serialize the graph into a TTL file
# ttl_data = g.serialize(format='turtle')

# # Save the TTL file
# with open("Results/knowledge_graph.ttl", "w") as f:
#     f.write(ttl_data)

In [71]:
# for a in knowledge_graph.items():
#     for b in a[1]:
#         print(b)
#     break

# if relation != []:

In [72]:
# # Clean up the relation_extraction dictionary
# cleaned_relation_extraction = {}

# for doc, sentences in relation_extraction.items():
#     cleaned_sentences = {}
#     for sentence, relations in sentences.items():
#         # Filter out sentences with empty relations
#         print(relations)
#         filtered_relations = [rel for rel in relations if rel]
#         if filtered_relations:
#             cleaned_sentences[sentence] = filtered_relations
    
#     if cleaned_sentences:
#         cleaned_relation_extraction[doc] = cleaned_sentences

# # Output the cleaned relation_extraction
# for doc, sentences in cleaned_relation_extraction.items():
#     print(f"Document: {doc}")
#     for sentence, relations in sentences.items():
#         print(f"Sentence: {sentence}")
#         for relation in relations:
#             print(relation)
#         print()


In [73]:
# tag = ["O O O O O O"]

# print(has_entity_types(tag[0]))

# print(tag[0].split(" "))

In [74]:
# # for c in relation_extraction:
# #     print(c)
# #     for sentence, relations in zip(ner_tagged[c], relation_extraction[c]):
# #         entities = ner_tagged[c][sentence]
# #         print(sentence)
# #         # print(ner)
# #         print(relation_extraction[c])
# #         doc = nlp(sentence.strip())
# #         doc = [token.text for token in doc]
# #         print(entities)
# #         # print(doc)
# #         print(len(doc))
# #         print(len(entities[0].split(" ")))
# #         # for content_text in r:
# #         #     print(content_text)

# # print(ner_tagged)

# for c in relation_extraction:
#     print(c)
#     for idx, sentence in enumerate(ner_tagged[c]):

#         entities = ner_tagged[c][sentence]
#         r1 = relation_extraction[c][idx*2]
#         r2 = relation_extraction[c][idx*2+1]
#         print("Sentence: ", sentence)
#         print(r1)
#         print(r2)
#         # print(ner)
#         # print(relation_extraction[c])
#         doc = nlp(sentence.strip())
#         doc = [token.text for token in doc]
#         print(entities)
#         # print(doc)
#         print(len(doc))
#         print(len(entities[0].split(" ")))
#         # for content_text in r:
#         #     print(content_text)
#     break

# print(ner_tagged)

In [187]:
def manual_prompt_with_entities(sentence, entities):

    prompt2= f"""
    Task: Extract information from the following geological survey sentence and represent it as subject-predicate-object triples using the format predicate(subject, object). 
    The sentence may contain information about geological timescales, locations, minerals, ore deposits, rocks, stratigraphy, and temporal entities such as real-time and geological timescales (e.g., 1000 Ma).

    Instructions:
    
    - You will be provided with key entities extracted from the text, such as rocks, minerals, geological timescales, and locations.
    - For each pair of entities, describe the relationship between them using the format predicate(subject, object).
    - If a geological timescale or temporal entity is mentioned, relate it to the corresponding rock or stratigraphy.
    - Ensure that each triple accurately reflects the relationship between the entities in the sentence.
    
    Example sentence: "Mapping and geochronology by the Geological Society of Australia reveal that the granitic rocks in the western part of the Yalgoo 1:250,000 map sheet are in the order of 2,800 to 3,000 Ma."
    
    Example extracted entities:
    - Geological Society of Australia (LOCATION)
    - granitic rocks (ROCK)
    - 2,800 to 3,000 Ma (GEO_TIME)

    Example extracted triples:
    foundIn(Granitic rocks, western part of Yalgoo map sheet)
    hasTimescale(Granitic rocks, 2,800 to 3,000 Ma)
    hasConducted(Geological Society of Australia, mapping and geochronology)

    Now, using the following provided entities, extract triples for the sentence:

    Provided entities:
    {entities}

    Your task: Extract triples from the following sentence:
    Sentence: {sentence}
    """

    prompt2 = "\n".join([line.lstrip() for line in prompt2.splitlines()])
    
    return prompt2


# Example usage:
sentence = "Mapping and geochronology by the Geological Society of Australia (Arriens, 1971) reveal that the granitic rocks in the western part of the Yalgoo 1:250,000 map sheet are in the order of 2,800 to 3,000 Ma."
entities = [
    ("Geological Society of Australia", "LOCATION"), 
    ("Arriens, 1971", "DATE"), 
    ("granitic rocks", "ROCK"), 
    ("western part of Yalgoo map sheet", "LOCATION"), 
    ("2,800 to 3,000 Ma", "GEO_TIME")
]

prompt = manual_prompt_with_entities(sentence, entities)
print(prompt)



Task: Extract information from the following geological survey sentence and represent it as subject-predicate-object triples using the format predicate(subject, object). 
The sentence may contain information about geological timescales, locations, minerals, ore deposits, rocks, stratigraphy, and temporal entities such as real-time and geological timescales (e.g., 1000 Ma).

Instructions:

- You will be provided with key entities extracted from the text, such as rocks, minerals, geological timescales, and locations.
- For each pair of entities, describe the relationship between them using the format predicate(subject, object).
- If a geological timescale or temporal entity is mentioned, relate it to the corresponding rock or stratigraphy.
- Ensure that each triple accurately reflects the relationship between the entities in the sentence.

Example sentence: "Mapping and geochronology by the Geological Society of Australia reveal that the granitic rocks in the western part of the Yalgoo 

In [169]:
def get_entities(doc, entities, iob_tags):
    entities = []
    current_entity = []
    current_tag = None
    for token, tag in zip(doc, iob_tags):
        if tag.startswith('B-'):
            if current_entity:
                # Save the previous entity before starting a new one
                entities.append((" ".join(current_entity), current_tag))
            # Start a new entity
            current_entity = [token.text]
            current_tag = tag.split('-')[1]  # Get the entity type (e.g., DATE, LOCATION)
        elif tag.startswith('I-') and current_tag == tag.split('-')[1]:
            # Continue the current entity
            current_entity.append(token.text)
        else:
            if current_entity:
                # If we reach an 'O' or unrelated tag, save the current entity
                entities.append((" ".join(current_entity), current_tag))
                current_entity = []
                current_tag = None

    # Catch any remaining entity at the end of the loop
    if current_entity:
        entities.append((" ".join(current_entity), current_tag))

    return entities

In [170]:
# load json file
with open("Results/testing_tags.json", "r") as file:
    all_tagged = json.load(file)

In [188]:
def extract_temporal_relations(all_tagged):
    for a in all_tagged:
        if a != "a092458_e09_1213_2011_a_12624596.json":
            continue
        for b in all_tagged[a]:
            # print(b)
            # print(all_tagged[a][b]["preprocess"])
            entities = None
            doc = nlp(all_tagged[a][b]["preprocess"])
            iob_tags = all_tagged[a][b]["combine_tags"][0].split(" ")
            if len(set(iob_tags)) > 1:
                if all_tagged[a][b]["combine_tags"][0] != 'error':
                    entities = get_entities(doc, entities, iob_tags)
                    
                # print(all_tagged[a][b]["combine_tags"])
                # print(entities)
                all_tagged[a][b]["llm_results"] = None
                if entities:
                    entity_text = "\n".join([f"- {entity[0]} ({entity[1]})" for entity in entities])
                    prompt = manual_prompt_with_entities(b, entity_text)
                    # print(prompt)
                    results = query(prompt).choices[0].message.content.strip()
                    print(results)
                    all_tagged[a][b]["llm_results"] = results
                print()
            else:
                # Text does not contain any entities
                all_tagged[a][b]["llm_results"] = None
    return all_tagged

# Perform relationship extraction
all_tagged = extract_temporal_relations(all_tagged)

output is:  ChatCompletion(id='chatcmpl-A6KaDK3f0JoMnkvIFFPDdXsZQnlPG', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are the extracted triples based on the provided sentence and entities:\n\n1. situatedIn(Mango Bore Project, Gascoyne Complex)\n2. locatedApproximately(Mango Bore Project, 105 km east-northeast of Gascoyne Junction)\n3. coversArea(Mango Bore Project, 99 km²)', role='assistant', function_call=None, tool_calls=None, refusal=None))], created=1726072453, model='gpt-4o-mini-2024-07-18', object='chat.completion', system_fingerprint='fp_483d39d857', usage=CompletionUsage(completion_tokens=65, prompt_tokens=421, total_tokens=486))
Here are the extracted triples based on the provided sentence and entities:

1. situatedIn(Mango Bore Project, Gascoyne Complex)
2. locatedApproximately(Mango Bore Project, 105 km east-northeast of Gascoyne Junction)
3. coversArea(Mango Bore Project, 99 km²)

output is:  ChatCompletion(id='cha

In [189]:
# # #save json file
# with open(n", "w") as json_file:
#     json.dump(all_tagged, json_file)"Results/testing_tags.jso

In [99]:
# #save json file
# with open("Results/LLM_results.json", "w") as json_file:
#     json.dump(all_tagged, json_file)

# Load the NER tagged data
with open("Results/LLM_results.json", 'r') as file:
    llm_results = json.load(file)


In [106]:
import re

# Function to extract entities and relation from the input string and convert to knowledge graph triple
def parse_and_convert_to_kg(input_string):
    # Regular expression to extract the relation, head entity, head type, tail entity, and tail type
    pattern = r'(?:\d+\.\s*)?(\w+)\(([^()]+),\s([^()]+)\)'


    match = re.match(pattern, input_string)
    
    if match:
        relation = match.group(1)
        head_entity = match.group(2).strip()
        tail_entity = match.group(3).strip()
        
        # Return the extracted data as a triple
        return (head_entity, relation, tail_entity)
    else:
        return None

In [178]:
for a in llm_results:
    for b in llm_results[a]:
        # print(b)
        iob_tags = all_tagged[a][b]["combine_tags"][0].split(" ")
        doc = nlp(all_tagged[a][b]["preprocess"])
        if all_tagged[a][b]["combine_tags"][0] != 'error':
            entities = get_entities(doc, entities, iob_tags)
            # print(entities)
            if llm_results[a][b]["llm_results"]:
                triples = llm_results[a][b]["llm_results"].split("\n")
                for triple in triples:
                    kg_triple = parse_and_convert_to_kg(triple)
                    if kg_triple:
                        # print(kg_triple)
                        left_entity_type = None
                        right_entity_type = None
                        for entity in entities:
                            if kg_triple[0] in entity[0]:
                                left_entity_type = entity[1]
                            if kg_triple[2] in entity[0]:
                                right_entity_type = entity[1]
                        if left_entity_type or right_entity_type:
                            print(kg_triple)
        # print(llm_results[a][b]["entities"])
        # print()

('Eucalyptus Bore Project', 'locatedNear', 'Kookynie')
('Eucalyptus Bore Project', 'locatedNear', 'Murrin Murrin mine')
('Kookynie', 'accessVia', 'Mt')
('ultramafic rocks', 'hasCharacteristic', 'weathered')
('ultramafic rocks', 'cappedBy', 'yellowish brown silcrete')
('ultramafic rocks', 'cappedBy', 'deep red ferruginous laterite')
('ultramafic rocks', 'hasFeature', 'small incised watercourses')
('ultramafic rocks', 'affectsAccess', 'access and drilling in certain areas')
('kurrajong trees', 'foundIn', 'kurrajong')
('soil sampling', 'hasDate', 'June 2007 26')
('Aboriginal Heritage Survey', 'hasDate', '2002')
('tenements', 'isFormer', 'former')
('GME Resources Eucalyptus Bore Project area', 'isLocatedIn', 'Eucalyptus Bore')
('gold', 'hasConducted', 'soil geochemistry and RC drilling programs')
('Hepi', 'hasPotentialType', 'leachable saprock ore types')
('Mt Kilkenny', 'hasPotentialType', 'leachable saprock ore types')
('Hepi', 'hasMinimalType', 'clay')
('Mt Kilkenny', 'hasMinimalType', 

In [190]:
"a092458_e09_1213_2011_a_12624596.json"

# Load the NER tagged data
with open("Results/testing_tags.json", 'r') as file:
    llm_results = json.load(file)

model_kg = []

for a in llm_results:
    if a != "a092458_e09_1213_2011_a_12624596.json":
        continue
    for b in llm_results[a]:
        # print(b)
        iob_tags = all_tagged[a][b]["combine_tags"][0].split(" ")
        doc = nlp(all_tagged[a][b]["preprocess"])
        if all_tagged[a][b]["combine_tags"][0] != 'error':
            entities = get_entities(doc, entities, iob_tags)
            # print(entities)
            if llm_results[a][b]["llm_results"]:
                triples = llm_results[a][b]["llm_results"].split("\n")
                for triple in triples:
                    kg_triple = parse_and_convert_to_kg(triple)
                    if kg_triple:
                        # print(kg_triple)
                        left_entity_type = []
                        right_entity_type = []
                        for entity in entities:
                            if kg_triple[0] in entity[0] or entity[0] in kg_triple[0]:
                                left_entity_type.append(entity[1])
                            if kg_triple[2] in entity[0] or entity[0] in kg_triple[2]:
                                right_entity_type.append(entity[1])
                        # If the entity types for the head entity are not consistent, set to None
                        if len(set(left_entity_type)) != 1:
                            left_entity_type = None
                        if left_entity_type and right_entity_type:
                            # print(left_entity_type)
                            print(kg_triple)
                            model_kg.append(kg_triple)
        # print(llm_results[a][b]["entities"])
        # print()

('Mango Bore Project', 'situatedIn', 'Gascoyne Complex')
('Mango Bore Project', 'locatedApproximately', '105 km east-northeast of Gascoyne Junction')
('Wabli Creek', 'hasBoundary', "U3O8 Limited's Wabli Creek project E09/1178")
("U3O8 Limited's Wabli Creek project E09/1178", 'isPartOf', 'Wabli Creek')
('Mango Bore Project', 'situatedIn', 'Wajarri-Yamatji Claim area')
('Wajarri-Yamatji Claim area', 'hasRegistrationDate', '1st December 2005')
('Wajarri-Yamatji Claim area', 'passedRegistrationTest', '1st December 2005')
('Mango Bore', 'liesNear', 'south-western margin of Gascoyne Complex')
('Mango Bore', 'isPartOf', 'Gascoyne Complex')
('Gascoyne Complex, Sheppard et al.', 'hasCited', '2007')
('high grade core of the Capricorn Orogen', 'locatedIn', 'Capricorn')
('Gascoyne Complex', 'limitedTo', 'west by Phanerozoic successions of Carnarvon basin')
('Phanerozoic', 'hasSuccessions', 'Carnarvon basin')
('Glenburgh Orogeny', 'hasTimescale', '2005-1960 Ma')
('Glenburgh Orogeny', 'foundIn', 'so

In [82]:
from fuzzywuzzy import fuzz

# Define a function to compare relations with fuzzy matching
def fuzzy_compare(triple1, triple2, threshold=80):
    sub1, pred1, obj1 = triple1
    sub2, pred2, obj2 = triple2
    # print(sub1, pred1, obj1)
    
    # Compare the subject, predicate, and object using fuzzy matching
    sub_match = fuzz.ratio(sub1, sub2) >= threshold
    pred_match = fuzz.ratio(pred1, pred2) >= 0
    obj_match = fuzz.ratio(obj1, obj2) >= threshold
    # print(fuzz.ratio(pred1, pred2))
    # print(sub_match, pred_match, obj_match)
    
    # If all three parts match with similarity above threshold, consider them as equivalent
    return sub_match and pred_match and obj_match

# Example triples
gold_triple = ('Mango Bore Proje432ct', 'locatedIn', 'Gascoyne Complex')
model_triple = ('Mango Bore Project', 'discoveredIn', 'Gascoyne Complex')

# Perform fuzzy comparison
if fuzzy_compare(gold_triple, model_triple):
    print("The triples are similar based on fuzzy matching.")
else:
    print("The triples are not similar.")


The triples are similar based on fuzzy matching.


In [83]:
# ! pip install fuzzywuzzy[speedup]

In [84]:
manual_kg = """
locatedIn(Mango Bore Project, Gascoyne Complex)  
locatedApproximately(Mango Bore Project, 105 km east-northeast of Gascoyne Junction)  
coversArea(Mango Bore Project, 99 kmÂ²)  
accessedFrom(Mango Bore Project, Dairy Creek to Cobra Station road)
accessedFrom(Mango Bore Project, station track)
locatedApproximately(station track, 2 km to the south of Yinnietharra Homestead)
liesOn(Mango Bore Project area, Yinnietharra and Mooloo Downs Pastoral leases)
ownedBy(Mango Bore Project, Skytone Pty Ltd)  
operatedBy(Mango Bore Project, U3O8 Limited)  
hasCommonBoundaries(Mango Bore Project, E09/1178)
isPartOf(E09/1178, Wabli Creek project)
locatedIn(Mango Bore Project, Wajarri-Yamatji Claim area)
registeredOn(Wajarri-Yamatji Claim area, 1st December 2005)
liesNear(Mango Bore project, south-western margin of Gascoyne Complex)  
comprises(Gascoyne Complex, Palaeoproterozoic granitic units)
comprises(Gascoyne Complex, medium to high-grade meta-sedimentary units)
forms(medium to high-grade meta-sedimentary units, high grade core of Capricorn Orogen)
limitedTo(Gascoyne Complex, west by Phanerozoic successions of Carnarvon basin)  
isPartOf(Phanerozoic successions, Carnarvon basin)  
isKnownFrom(Glenburgh Orogeny, southern end of Gascoyne Province)  
hasTimescale(Glenburgh Orogeny, 2005-1960 Ma)  
followedBy(Capricorn Orogeny, deposition of protoliths)
hasTimescale(Capricorn Orogeny, 1830-1780 Ma)
derivedFrom(Maximum depositional age, detrital zircons)  
hasAge(Maximum depositional age, ~1840 Ma)  
associatedWith(Morrissey Metamorphics, Varvell 2001)  
hasAge(Morrissey Metamorphics, ~1840 Ma)  
markedBy(Orogeny, deformation and intrusion of Morrissey Metamorphics)
intrudes(Granites of Moorarie Supersuite, Morrissey Metamorphics)
comprises(Moorarie Supersuite, monzogranite)
comprises(Moorarie Supersuite, granodiorite)
includes(Moorarie Supersuite, syenogranite)
includes(Moorarie Supersuite, tonalite)
includes(Moorarie Supersuite, quartz diorite)
bestDeveloped(Mangaroon Orogeny, northern part of Gascoyne Province)
hasTimescale(Mangaroon Orogeny, 1680-1620 Ma)
relatedTo(Mangaroon Orogeny, related structures)
poorlyDeveloped(related structures, Morrissey Metamorphics)
correlateWith(low to medium-grade metasedimentary rocks, rocks of the fluviatile Mt James Formation)  
marking(low to medium-grade metasedimentary rocks, series of fault-bounded basins)  
depositedOn(series of fault-bounded basins, Gascoyne Complex)  
associatedWith(Hunter 1990, Pooranoo Metamorphics)  
comprises(The latter, meta-conglomerates)
comprises(The latter, coarse metasandstones)
overlies(meta-conglomerates, Mount Morrissey Metamorphics)
overlies(coarse metasandstones, Mount Morrissey Metamorphics)
hasTimescale(meta-conglomerates, ~1,700 Ma)
hasTimescale(coarse metasandstones, ~1,700 Ma)
reactivated(Edmundian Orogeny, shear and fault zones)  
hasTimescale(Edmundian Orogeny, 1030-950 Ma)  
basedOn(dates, syn-metamorphic monazite and xenotime)
associatedWith(event, peak regional metamorphism)
hasFacies(peak regional metamorphism, greenschist to amphibolite)
followedBy(peak regional metamorphism, pegmatite intrusion)
showAssociation(Pegmatites, beryllium occurrences)
showAssociation(Pegmatites, tantalumniobium occurrences)
locatedCloseTo(Mango Bore project, boundary between northern domain of Glenburgh Terrane and southern boundary of Mutherbukin zone)
extendsTo(Mango Bore project, Chalba shear zone)
partOf(Mango Bore project, Gascoyne Complex)
comprises(Mango Bore, lowermost units of Morrissey Metamorphics)  
inContactWith(lowermost units of Morrissey Metamorphics, reworked Archaean granitoids)  
inContactWith(lowermost units of Morrissey Metamorphics, Proterozoic pegmatites)  
dominatedBy(Regolith environment, erosional regime)
preservedIn(Regolith environment, minor relict domains)
belongsTo(Regolith environment, zone of limited U-Ce-La-Pb-Th enrichment)
liesUpstream(part of the tenement, Minindi Creek prospect)  
isKnownFor(Minindi Creek prospect, calcrete hosted uranium mineralisation)  
planned(Reverse circulation drilling programme, follow up calcrete hosted uranium mineralisation)
locatedSouthOf(Calcrete hosted uranium mineralisation, Minindi prospect)
delayed(Reverse circulation drilling programme, next reporting period)
potentialFor(Calcrete hosted uranium resource, area)
hasDepth(Calcrete hosted uranium resource, ten metre)
hasNumberOfHoles(Programme, 50)
availability(Rig, difficult)
"""

manual_kg = manual_kg.split("\n")
manual_kg = [parse_and_convert_to_kg(line.strip()) for line in manual_kg if line]
print(manual_kg)


[('Mango Bore Project', 'locatedIn', 'Gascoyne Complex'), ('Mango Bore Project', 'locatedApproximately', '105 km east-northeast of Gascoyne Junction'), ('Mango Bore Project', 'coversArea', '99 kmÂ²'), ('Mango Bore Project', 'accessedFrom', 'Dairy Creek to Cobra Station road'), ('Mango Bore Project', 'accessedFrom', 'station track'), ('station track', 'locatedApproximately', '2 km to the south of Yinnietharra Homestead'), ('Mango Bore Project area', 'liesOn', 'Yinnietharra and Mooloo Downs Pastoral leases'), ('Mango Bore Project', 'ownedBy', 'Skytone Pty Ltd'), ('Mango Bore Project', 'operatedBy', 'U3O8 Limited'), ('Mango Bore Project', 'hasCommonBoundaries', 'E09/1178'), ('E09/1178', 'isPartOf', 'Wabli Creek project'), ('Mango Bore Project', 'locatedIn', 'Wajarri-Yamatji Claim area'), ('Wajarri-Yamatji Claim area', 'registeredOn', '1st December 2005'), ('Mango Bore project', 'liesNear', 'south-western margin of Gascoyne Complex'), ('Gascoyne Complex', 'comprises', 'Palaeoproterozoic gr

In [85]:
print(model_kg)

[('Mango Bore Project', 'situatedApproximately', 'Gascoyne Junction'), ('Mango Bore Project', 'locatedIn', 'Gascoyne Complex'), ('Mango Bore Project', 'distanceFrom', '105 km east-northeast of Gascoyne Junction'), ('Yinnietharra Homestead', 'locatedNear', '2 km to the south of Yinnietharra'), ('Wabli Creek', 'hasBoundary', "U3O8 Limited's Wabli Creek project E09/1178"), ("U3O8 Limited's Wabli Creek project E09/1178", 'isPartOf', 'Wabli Creek'), ('Mango Bore', 'liesNear', 'south-western margin of Gascoyne Complex'), ('Mango Bore', 'isPartOf', 'Gascoyne Complex'), ('Gascoyne Complex', 'limitedTo', 'west by Phanerozoic successions of Carnarvon basin'), ('Phanerozoic', 'hasSuccessions', 'Carnarvon basin'), ('Capricorn Orogeny', 'hasTimescale', '1830-1780 Ma'), ('Capricorn Orogeny', 'followedBy', 'deposition of protoliths of metasedimentary rocks'), ('metasedimentary rocks', 'consistsOf', 'Morrissey Metamorphics'), ('Morrissey Metamorphics, Varvell', 'citedIn', '2001'), ('Morrissey Metamorp

In [191]:
def evaluate_kg(gold_kg, model_kg, match_fn, threshold=70):
    correct_triples = 0
    
    for model_triple in model_kg:
        print(model_triple)
        for gold_triple in gold_kg:
            if match_fn(gold_triple, model_triple, threshold):
                print(gold_triple)
                print(model_triple)
                head, relation, tail = model_triple
                head1, relation1, tail1 = gold_triple
                print(fuzz.ratio(head, head1))
                print(fuzz.ratio(relation, relation1))
                print(fuzz.ratio(tail, tail1))
                correct_triples += 1

                break  # Move to the next model triple after finding a match
    
    precision = correct_triples / len(model_kg) if len(model_kg) > 0 else 0
    recall = correct_triples / len(gold_kg) if len(gold_kg) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1_score


# Fuzzy comparison using custom match function (e.g., fuzzy_compare)
precision, recall, f1 = evaluate_kg(manual_kg, model_kg, fuzzy_compare)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


('Mango Bore Project', 'situatedIn', 'Gascoyne Complex')
('Mango Bore Project', 'locatedIn', 'Gascoyne Complex')
('Mango Bore Project', 'situatedIn', 'Gascoyne Complex')
100
63
100
('Mango Bore Project', 'locatedApproximately', '105 km east-northeast of Gascoyne Junction')
('Mango Bore Project', 'locatedApproximately', '105 km east-northeast of Gascoyne Junction')
('Mango Bore Project', 'locatedApproximately', '105 km east-northeast of Gascoyne Junction')
100
100
100
('Wabli Creek', 'hasBoundary', "U3O8 Limited's Wabli Creek project E09/1178")
("U3O8 Limited's Wabli Creek project E09/1178", 'isPartOf', 'Wabli Creek')
('Mango Bore Project', 'situatedIn', 'Wajarri-Yamatji Claim area')
('Mango Bore Project', 'locatedIn', 'Wajarri-Yamatji Claim area')
('Mango Bore Project', 'situatedIn', 'Wajarri-Yamatji Claim area')
100
63
100
('Wajarri-Yamatji Claim area', 'hasRegistrationDate', '1st December 2005')
('Wajarri-Yamatji Claim area', 'registeredOn', '1st December 2005')
('Wajarri-Yamatji Cla

 old 0.38, Recall: 0.38, F1 Score: 0.38
 
 old 2 Precision: 0.45, Recall: 0.34, F1 Score: 0.38

 Precision: 0.42, Recall: 0.34, F1 Score: 0.37

In [192]:
# Load json
with open('LLM_Ontologies/LLM_Results.json', 'r') as file:
    text2kg = json.load(file)

In [88]:
import re

# Function to extract entities and relation from the input string and convert to knowledge graph triple
def parse_and_convert_to_kg_types(input_string):
    # Regular expression to extract the relation, head entity, head type, tail entity, and tail type
    pattern = r'(\w+)\(([^()]+)\s\((\w+)\),\s([^()]+)\s\((\w+)\)\)'
    match = re.match(pattern, input_string)
    
    if match:
        relation = match.group(1)
        head_entity = match.group(2).strip()
        head_type = match.group(3)
        tail_entity = match.group(4).strip()
        tail_type = match.group(5)
        
        # Return the extracted data as a triple
        return (head_entity, head_type, relation, tail_entity, tail_type)
    else:
        return None
    
# Function to insert the triple into the knowledge graph
def add_to_knowledge_graph(kg, triple):
    head_entity, head_type, relation, tail_entity, tail_type = triple
    # Add the relation and tail entity to the knowledge graph
    kg[(head_entity, head_type)][relation].add((tail_entity, tail_type))


# Input string
input_string = "foundIn(Rainy RocksE2 (Rock), 18 April 2011 (Location))"

# Convert to knowledge graph triple
kg_triple = parse_and_convert_to_kg_types(input_string)

# Print the result
if kg_triple:
    print(f"({kg_triple[0]} ({kg_triple[1]}), {kg_triple[2]}, {kg_triple[3]} ({kg_triple[4]}))")
else:
    print("Invalid format")

(Rainy RocksE2 (Rock), foundIn, 18 April 2011 (Location))


In [93]:
benchmark_kg = []

for a in text2kg:
    if a != "a092458_e09_1213_2011_a_12624596.json":
        continue
    for b in text2kg[a]:
        print(b)
        if "None" in text2kg[a][b]["LLM_Results"]:
            print("No results found.")
            continue
        else:
            # print(b)
            # print(text2kg[a][b]["LLM_Results"])
            for i in text2kg[a][b]["LLM_Results"].split("\n"):
                kg = parse_and_convert_to_kg_types(i)
                if kg:
                    head_entity, head_type, relation, tail_entity, tail_type = kg
                    kg = (head_entity, relation, tail_entity)
                    print(kg)
                    benchmark_kg.append(kg)
                    # print(head_entity, head_type, relation, tail_entity, tail_type)


The Mango Bore Project (E09/1213) is situated approximately 105 km east-northeast of Gascoyne Junction (Figure 1) in the Gascoyne Complex and covers a total area of roughly 99 km2.
('Mango Bore Project', 'foundIn', 'Gascoyne Junction')
The project is accessed from the Dairy Creek to the Cobra Station road and a station track approximately 2 km to the south of Yinnietharra Homestead.
No results found.
The project area lies on the Yinnietharra and Mooloo Downs Pastoral leases (Figure 2).
2 TENURE 2.1 Tenement Details The Mango Bore Project is owned by Skytone Pty Ltd and operated by U3O8 Limited.
No results found.
It has common boundaries with U3O8 Limiteds Wabli Creek project E09/1178 (Figure 4).
No results found.
Figure 1 shows the boundaries of the project.
No results found.
2.2 Native Title The Mango Bore Project is situated within the Wajarri-Yamatji Claim (WC04-010) area, which was registered and passed the registration test on the 1st December 2005.
No results found.
2 3 4 3 GEOLO

Benchmark evaluation

In [94]:
# Fuzzy comparison using custom match function (e.g., fuzzy_compare)
precision, recall, f1 = evaluate_kg(manual_kg, benchmark_kg, fuzzy_compare)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


('Mango Bore Project', 'foundIn', 'Gascoyne Junction')
('Mango Bore', 'foundIn', 'south-western margin')
('Gascoyne Complex', 'foundIn', 'south-western margin')
('Gascoyne Complex', 'contains', 'Palaeoproterozoic granitic units')
('Gascoyne Complex', 'comprises', 'Palaeoproterozoic granitic units')
('Gascoyne Complex', 'contains', 'Palaeoproterozoic granitic units')
100
47
100
('Gascoyne Complex', 'contains', 'medium to high-grade meta-sedimentary units')
('Gascoyne Complex', 'comprises', 'medium to high-grade meta-sedimentary units')
('Gascoyne Complex', 'contains', 'medium to high-grade meta-sedimentary units')
100
47
100
('Palaeoproterozoic granitic units', 'formedDuring', 'Capricorn Orogen')
('medium to high-grade meta-sedimentary units', 'formedDuring', 'Capricorn Orogen')
('Gascoyne Complex', 'limitedTo', 'Phanerozoic successions')
('Phanerozoic successions', 'foundIn', 'Carnarvon basin')
('Phanerozoic successions', 'isPartOf', 'Carnarvon basin')
('Phanerozoic successions', 'foun

situatedIn(Mango Bore Project, Gascoyne Complex)  
locatedApproximately(Mango Bore Project, 105 km east-northeast of Gascoyne Junction)  
coversArea(Mango Bore Project, 99 km²)  

accessedFrom(Mango Bore Project, Dairy Creek to Cobra Station road)
accessedFrom(Mango Bore Project, station track)
locatedApproximately(station track, 2 km to the south of Yinnietharra Homestead)

liesOn(Mango Bore Project area, Yinnietharra and Mooloo Downs Pastoral leases)

ownedBy(Mango Bore Project, Skytone Pty Ltd)  
operatedBy(Mango Bore Project, U3O8 Limited)  

hasCommonBoundaries(Mango Bore Project, E09/1178)
isPartOf(E09/1178, Wabli Creek project)

foundIn(Gascoyne Complex, south-western margin)

situatedIn(Mango Bore Project, Wajarri-Yamatji Claim area)
registeredOn(Wajarri-Yamatji Claim area, 1st December 2005)

liesNear(Mango Bore project, south-western margin of Gascoyne Complex)  

comprises(Gascoyne Complex, Palaeoproterozoic granitic units)
comprises(Gascoyne Complex, medium to high-grade meta-sedimentary units)
forms(medium to high-grade meta-sedimentary units, high grade core of Capricorn Orogen)

limitedTo(Gascoyne Complex, west by Phanerozoic successions of Carnarvon basin)  
isPartOf(Phanerozoic successions, Carnarvon basin)  

isKnownFrom(Glenburgh Orogeny, southern end of Gascoyne Province)  
hasTimescale(Glenburgh Orogeny, 2005-1960 Ma)  

followedBy(Capricorn Orogeny, deposition of protoliths)
hasTimescale(Capricorn Orogeny, 1830-1780 Ma)

derivedFrom(Maximum depositional age, detrital zircons)  
hasAge(Maximum depositional age, ~1840 Ma)  
foundAt(Morrissey Metamorphics, 2001)  
hasAge(Morrissey Metamorphics, ~1840 Ma)  

markedBy(Orogeny, deformation and intrusion of Morrissey Metamorphics)
intrudes(Granites of Moorarie Supersuite, Morrissey Metamorphics)
comprises(Moorarie Supersuite, monzogranite)
comprises(Moorarie Supersuite, granodiorite)
includes(Moorarie Supersuite, syenogranite)
includes(Moorarie Supersuite, tonalite)
includes(Moorarie Supersuite, quartz diorite)

bestDeveloped(Mangaroon Orogeny, northern part of Gascoyne Province)
hasTimescale(Mangaroon Orogeny, 1680-1620 Ma)
relatedTo(Mangaroon Orogeny, related structures)
poorlyDeveloped(related structures, Morrissey Metamorphics)

correlateWith(low to medium-grade metasedimentary rocks, rocks of the fluviatile Mt James Formation)  
marking(low to medium-grade metasedimentary rocks, series of fault-bounded basins)  
depositedOn(series of fault-bounded basins, Gascoyne Complex)  
associatedWith(Hunter 1990, Pooranoo Metamorphics)  

comprises(The latter, meta-conglomerates)
comprises(The latter, coarse metasandstones)
overlies(meta-conglomerates, Mount Morrissey Metamorphics)
overlies(coarse metasandstones, Mount Morrissey Metamorphics)
hasTimescale(meta-conglomerates, ~1,700 Ma)
hasTimescale(coarse metasandstones, ~1,700 Ma)

reactivated(Edmundian Orogeny, shear zones)
reactivated(Edmundian Orogeny, fault zones)
hasTimescale(Edmundian Orogeny, 1030-950 Ma)  

basedOn(dates, syn-metamorphic monazite and xenotime)
associatedWith(event, peak regional metamorphism)
hasFacies(peak regional metamorphism, greenschist to amphibolite)
followedBy(peak regional metamorphism, pegmatite intrusion)

contains(Pegmatites, beryllium)
contains(Pegmatites, tantalumniobium)
locatedCloseTo(Mango Bore project, boundary between northern domain of Glenburgh Terrane and southern boundary of Mutherbukin zone)
extendsTo(Mango Bore project, Chalba shear zone)
partOf(Mango Bore project, Gascoyne Complex)

comprises(Mango Bore, lowermost units of Morrissey Metamorphics)  
inContactWith(lowermost units of Morrissey Metamorphics, reworked Archaean granitoids)  
inContactWith(lowermost units of Morrissey Metamorphics, Proterozoic pegmatites)  

dominatedBy(Regolith environment, erosional regime)
preservedIn(Regolith environment, minor relict domains)
belongsTo(Regolith environment, zone of limited U-Ce-La-Pb-Th enrichment)

liesUpstream(part of the tenement, Minindi Creek prospect)  
isKnownFor(Minindi Creek prospect, calcrete hosted uranium mineralisation)  

planned(Reverse circulation drilling programme, follow up calcrete hosted uranium mineralisation)
locatedSouthOf(Calcrete hosted uranium mineralisation, Minindi prospect)
delayed(Reverse circulation drilling programme, next reporting period)
potentialFor(Calcrete hosted uranium resource, area)
hasDepth(Calcrete hosted uranium resource, ten metre)
