### Experiment 1: NER Flair Model

In [None]:
import json

# Pre-bind `data` so the name exists even when reading the file fails.
data = None
with open("NER/SyntheticData/data.json") as json_file:
    data = json.load(json_file)

In [None]:
def create_data(df, filepath):
    """Write annotated sentences to ``filepath`` as ``token label`` lines.

    Args:
        df: Iterable of ``(text, annotations)`` pairs, where ``annotations``
            is an iterable of ``(token, label)`` pairs. Only the annotations
            are written; ``text`` is not used.
        filepath: Destination path; the file is opened in write mode.
    """
    with open(filepath, "w") as out:
        for _text, pairs in df:
            for raw_token, label in pairs:
                cleaned = preprocess_word(raw_token)
                if cleaned == " ":
                    # A token that comes back as a single space is skipped.
                    continue
                out.write(f"{cleaned} {label}\n")
            # A blank line terminates each sentence.
            out.write("\n")


create_data(data, "NER/SyntheticData/train.txt")

In [None]:
from typing import List
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer


def train_flair_ner(data_version):
    """Train a GloVe + CRF sequence tagger for the ``ner`` label type.

    The training file is always ``NER/Training_Data/AutoLabelled.txt``;
    ``data_version`` only selects the ``flair_models/<data_version>`` path
    that is passed as the first argument of ``ModelTrainer.train``.

    Args:
        data_version: Name appended to ``flair_models/`` for this run.
    """
    # Column 0 holds the token text, column 1 its NER label.
    column_format = {0: "text", 1: "ner"}
    corpus = ColumnCorpus(
        ".",
        column_format,
        train_file="NER/Training_Data/AutoLabelled.txt",
    )
    ner_labels = corpus.make_label_dictionary(label_type='ner')

    # Single-member embedding stack; further embeddings can be added to the list.
    stacked = StackedEmbeddings(embeddings=[WordEmbeddings('glove')])

    model = SequenceTagger(
        hidden_size=256,
        embeddings=stacked,
        tag_dictionary=ner_labels,
        tag_type='ner',
        use_crf=True,
    )

    ModelTrainer(model, corpus).train(
        f'flair_models/{data_version}',
        learning_rate=0.1,
        mini_batch_size=64,
        max_epochs=5,
    )
    
# train_flair_ner('joined')

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger

def apply_flair_ner(model_path, document: str):
    """Tag ``document`` with a saved flair model and print the results.

    Prints the sentence, its tagged string, one line per predicted ``ner``
    span (text, tag, confidence) and finally every token together with the
    tag of the span that contains it, or ``O`` when no span contains it.

    Args:
        model_path: Location of the trained model given to ``SequenceTagger.load``.
        document: Input handed unchanged to ``Sentence``.
    """
    model = SequenceTagger.load(model_path)
    sentence = Sentence(document)
    model.predict(sentence)

    print(sentence)
    print("Tagged sentence:", sentence.to_tagged_string())

    # One line per predicted span.
    for span in sentence.get_spans('ner'):
        print(f'Entity: {span.text}, Type: {span.tag}, Confidence: {span.score}')

    # Every position starts as 'O'; positions inside a span are overwritten.
    tags_by_position = dict.fromkeys(range(len(sentence)), 'O')
    for span in sentence.get_spans('ner'):
        for member in span:
            # Token idx is shifted by one to index the zero-based mapping.
            tags_by_position[member.idx - 1] = span.tag

    print("\nDetailed word information:")
    for position, word in enumerate(sentence):
        print(f"{word.text} {tags_by_position[position]}")
    
# Example usage: a page given as a list of lower-cased tokens, tagged with the
# model stored at flair_models/joined/final-model.pt.
# NOTE(review): apply_flair_ner annotates `document` as `str`, but a list of
# tokens is passed here — confirm that Sentence accepts this form.
document = ['midwest', 'corporation', 'limit', 'annual', 'report', 'e09', '1004', 'september', '2006', 'page', '2', 'of', '21', 'september', '2006', '1', 'bibliographic', 'datum', 'sheet', 'report', 'title', 'midwest', 'corporation', 'limit', 'new', 'forest', 'project', 'annual', 'report', 'for', 'e09', '1004', '26', 'september', '2005', '25', 'september', '2006', 'david', 'broomfield', '20th', 'october', '2006', 'prospect', 'name', 'new', 'forest', 'project', 'tenement', 'number', 'mt', 'aubrey', 'e09', '1004', 'tenement', 'holder', 'midwest', 'corporation', 'limit', 'commodity', 'iron', 'ore', 'goldfield', 'mineral', 'field', 'south', 'west', 'mineral', 'field', '70', 'gascoyne', 'gold', 'field', '09', 'tectonic', 'unit', 'yilgarn', 'craton', 'moyagee', 'formation', 'lithologic', 'unit', 'band', 'iron', 'formation', 'massive', 'haematite', 'chert', 'quartzite', 'meta', 'sedimentary', 'rock', 'mafic', 'and', 'ultramafic', 'rock', 'laterite', '1', '250', '000', 'sheet', 'murgoo', 'sg50', '14', 'keyword', 'iron', 'ore', 'haematite', 'band', 'iron', 'formation', 'geological', 'mapping', 'geochemical', 'sample', 'airborne', 'geophysical', 'survey']
apply_flair_ner('flair_models/joined/final-model.pt', document)

### Experiment 2: Rule-based Labelling
Label all tokens
- with Domain Dictionary vocab

In [None]:
# pip install pyahocorasick

In [None]:
import ahocorasick

# Running total of keyword matches; passed into and returned from each
# extraction call.
num = 0


def create_automaton(keywords):
    """Build an Aho-Corasick automaton over ``keywords``.

    Each keyword is indexed under a normalised key: lower-cased, with every
    character that is neither a word character nor whitespace replaced by a
    space. The value stored for a key is ``(position, original_keyword)``.
    """
    automaton = ahocorasick.Automaton()
    for position, original in enumerate(keywords):
        normalised_key = re.sub(r'[^\w\s]', ' ', original.lower())
        automaton.add_word(normalised_key, (position, original))
    automaton.make_automaton()
    return automaton

# Function to extract domain specific keywords
def extract_and_label_domain_keywords(text, automaton, num, *, whole_tokens=True):
    """Collect the dictionary keywords that occur in a tokenised page.

    Args:
        text: Sequence of token strings; they are joined with single spaces
            to form the string that is searched.
        automaton: Object whose ``iter(string)`` yields
            ``(end_index, (idx, keyword))`` per match, ``end_index`` being the
            position of the last matched character.
        num: Running match count carried in from previous calls.
        whole_tokens: When true (the default) a match is kept only if it
            starts at the beginning of a token and ends at the end of a
            token, so a keyword such as "ore" is not reported inside
            "forest". Pass ``False`` to keep every substring match.

    Returns:
        ``(keywords, num)``: the kept keywords in the order the automaton
        reported them, and ``num`` increased by how many were kept.
    """
    joined = " ".join(text)
    last_index = len(joined) - 1
    keywords = []
    for end_index, (_idx, keyword) in automaton.iter(joined):
        if whole_tokens:
            # Assumes the matched key is as long as the stored keyword
            # (the normalisation replaces characters one-for-one).
            start_index = end_index - len(keyword) + 1
            starts_token = start_index <= 0 or joined[start_index - 1].isspace()
            ends_token = end_index >= last_index or joined[end_index + 1].isspace()
            if not (starts_token and ends_token):
                continue
        keywords.append(keyword)
    return keywords, num + len(keywords)

# Build one automaton per domain, then scan every page of `output` with each.
domains = [
    "minerals",
    "rocks",
    "stratigraphy",
    "ores_deposits",
    "locations",
    "geological_timescales",
]
automatons_dictionary = dict(
    (name, create_automaton(Domain_Dictionary[name])) for name in domains
)

for d in domains:
    domain_automaton = automatons_dictionary[d]
    for i in range(len(output)):
        print(f"Page {i+1}:")
        keywords, num = extract_and_label_domain_keywords(output[i], domain_automaton, num)
        print(keywords)

# `num` now holds the match count accumulated over all domains and pages.
print("num = ", num)


### Experiment 3: Synthetic Data
(a)

In [None]:
import random
import spacy

# spaCy pipeline used to tokenize the generated sentences; the listed
# components are switched off at load time.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner", "tagger", "lemmatizer"],
)

# Two sentence templates per domain; "{entity}" marks where a domain term is
# substituted.
templates = dict(
    minerals=[
        "{entity} is a common mineral.",
        "The composition of {entity} is well-studied.",
    ],
    rocks=[
        "{entity} is a type of rock.",
        "{entity} is often found in mountain regions.",
    ],
    stratigraphy=[
        "{entity} provides important geological information.",
        "The {entity} is used to identify rock layers.",
    ],
    ores_deposits=[
        "{entity} is mined for its value.",
        "The {entity} is an important resource.",
    ],
    locations=[
        "The {entity} is located at a high altitude.",
        "{entity} is a significant geographic feature.",
    ],
    geological_timescales=[
        "The {entity} is a major time period.",
        "{entity} marks a significant era in history.",
    ],
)

# Generate synthetic training data
def _entity_spans(template, entity, offset, domain):
    """Return ``(start, end, domain)`` for every ``{entity}`` slot of ``template``.

    Positions are character offsets in the formatted sentence, shifted by
    ``offset``. Assumes the template holds no other replacement fields or
    escaped braces, so formatting equals plain substitution.
    """
    spans = []
    cursor = offset
    for piece in template.split("{entity}")[:-1]:
        cursor += len(piece)
        spans.append((cursor, cursor + len(entity), domain))
        cursor += len(entity)
    return spans


def _bio_labels(doc, spans):
    """Pair each token of ``doc`` with a BIO label derived from ``spans``."""
    labelled = []
    previous_span = None
    for token in doc:
        covering = next(
            (span for span in spans if span[0] <= token.idx < span[1]), None
        )
        if covering is None:
            label = "O"
            previous_span = None
        elif token.text.isspace():
            # Whitespace inside an entity is not labelled and does not end it.
            label = "O"
        else:
            prefix = "I-" if covering is previous_span else "B-"
            label = prefix + covering[2].upper()
            previous_span = covering
        labelled.append((token.text, label))
    return labelled


def generate_synthetic_data(domain_dict, templates, num_sentences=10000):
    """Generate two-sentence synthetic NER examples with BIO token labels.

    In each of ``num_sentences`` rounds, and for every domain in
    ``domain_dict``, a random term of that domain is put into one of the
    domain's templates. A second sentence is built the same way from a
    different, randomly chosen domain and the two are joined with
    ``" And, "``. The result therefore holds
    ``num_sentences * len(domain_dict)`` examples.

    Tokens are labelled by character offset: a token that starts inside an
    inserted term gets ``B-<DOMAIN>`` (first token of the term) or
    ``I-<DOMAIN>``; everything else gets ``O``. Matching on token text
    instead would tag template words that happen to equal a word of the
    term and would miss terms that the tokenizer splits differently from
    ``str.split``.

    Args:
        domain_dict: Mapping of domain name to a list of terms. At least two
            domains are needed, because the second sentence always uses a
            different domain.
        templates: Mapping of domain name to template strings containing an
            ``{entity}`` placeholder.
        num_sentences: Number of rounds over all domains.

    Returns:
        List of ``(combined_sentence, [(token_text, label), ...])`` tuples.
    """
    joiner = " And, "
    synthetic_data = []
    for _ in range(num_sentences):
        for domain, words in domain_dict.items():
            # The random draws are made in a fixed order so that a seeded
            # run is reproducible.
            word = random.choice(words)
            template = random.choice(templates[domain])
            sentence = template.format(entity=word)

            # Second sentence from another domain, for multiple entity types.
            domain2 = random.choice([d for d in domain_dict.keys() if d != domain])
            word2 = random.choice(domain_dict[domain2])
            template2 = random.choice(templates[domain2])
            sentence2 = template2.format(entity=word2)

            combined_sentence = f"{sentence}{joiner}{sentence2}"

            spans = _entity_spans(template, word, 0, domain)
            spans += _entity_spans(
                template2, word2, len(sentence) + len(joiner), domain2
            )
            token_labels = _bio_labels(nlp(combined_sentence), spans)

            synthetic_data.append((combined_sentence, token_labels))
    return synthetic_data

# synthetic_data = generate_synthetic_data(Domain_Dictionary, templates)

(b)

In [None]:
import random
import spacy
from pathlib import Path
import re

# spaCy pipeline for tokenization, loaded with the parser and NER components
# disabled.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

# Domain names used as keys throughout the notebook.
domains = [
    "minerals",
    "rocks",
    "stratigraphy",
    "ores_deposits",
    "locations",
    "geological_timescales",
]

# Generate synthetic training data
def generate_synthetic_data_with_openai(domain_dict, in_file, out_file, num_sentences=10000):
    """Fill sentence templates with random domain terms and write BIO-labelled data.

    Templates are read from ``in_file`` (one per line, blank lines ignored)
    and used in order, wrapping around, until ``num_sentences`` sentences
    have been produced. A template is split into ``{domain}`` placeholders,
    words and the punctuation marks ``.`` and ``,``. Each placeholder is
    replaced by a random term from ``domain_dict[domain]``, whose first word
    is labelled ``B-<DOMAIN>`` and remaining words ``I-<DOMAIN>``; all other
    pieces are labelled ``O``.

    Args:
        domain_dict: Mapping of domain name to a list of terms.
        in_file: Path of the template file.
        out_file: Path that receives one ``token label`` line per token and a
            blank line after every sentence.
        num_sentences: Number of sentences to generate.

    Returns:
        List of ``(text, annotations)`` tuples, one per sentence, where
        ``text`` is the tokens joined by single spaces and ``annotations`` is
        the list of ``(token, label)`` pairs that was written.

    Raises:
        ValueError: If sentences are requested but ``in_file`` holds no
            template.
        KeyError: If a placeholder names a domain missing from ``domain_dict``.
    """
    with open(in_file, 'r', encoding='utf-8') as file:
        templates = [line.strip() for line in file if line.strip()]
    # Fail before the output file is truncated.
    if num_sentences > 0 and not templates:
        raise ValueError(f"No sentence templates found in {in_file}")

    piece_pattern = re.compile(r'\{.*?\}|\w+|[.,]')
    placeholder_pattern = re.compile(r'\{(.*?)\}')

    synthetic_data = []
    with open(out_file, 'w', encoding='utf-8') as file:
        for sentence_index in range(num_sentences):
            template = templates[sentence_index % len(templates)]

            annotations = []
            for piece in piece_pattern.findall(template):
                placeholder = placeholder_pattern.search(piece)
                if placeholder:
                    domain = placeholder.group(1)
                    entity_tokens = random.choice(domain_dict[domain]).split()
                    # Decide B-/I- by position, so a term that repeats its
                    # first word does not receive a second B- label.
                    for position, token in enumerate(entity_tokens):
                        prefix = "B-" if position == 0 else "I-"
                        annotations.append((token, prefix + domain.upper()))
                else:
                    annotations.append((piece, "O"))

            file.writelines(f"{token} {label}\n" for token, label in annotations)
            file.write("\n")
            text = " ".join(token for token, _label in annotations)
            synthetic_data.append((text, annotations))
    return synthetic_data
# Template source and training-data destination, both inside the NER directory.
in_file = Path("NER", "DataExamples.txt")
out_file = Path("NER", "TrainingData.txt")
synthetic_data = generate_synthetic_data_with_openai(
    Domain_Dictionary,
    in_file,
    out_file,
)

In [None]:
# Preview the first five generated examples.
for example in synthetic_data[:5]:
    text, annotations = example
    print(text, annotations)

In [None]:
# # Write to JSON file
# with open("NER/data.json", "w") as json_file:
#     json.dump(synthetic_data, json_file)

In [None]:
# Reload the saved examples and preview the first five.
with open("NER/data.json") as json_file:
    data = json.load(json_file)
    for record in data[:5]:
        text, annotations = record
        print(text, annotations)

### Experiment 4: Temporal Expression Extraction

In [None]:
import re
import datefinder


# Sample paragraph taken from a geological survey report.
text = """
This report has been prepared as an investigation of the Mt Aubrey tenement, as part of Midwest’s New
Forest project in the Murchison Region of Western Australia. The report is presented as an Annual Report
to be submitted to the Department of Industry and Resources as part of the conditions of the granting of
E09/1004 and covers the period from the 26 September 2005 to the 25 September 2006.
"""

# Known limitation: this does not work well when the text contains strings
# such as "E09/1004", which the library mistakes for a date.
matches = datefinder.find_dates(text)

for found in matches:
    print(found)
