In [25]:
import re
import json
import torch
import random
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [26]:
torch.cuda.empty_cache()

In [27]:
# Domain_Dictionary_Path = {"geological_timescales": "../Domain_Dictionary/geological_timescales.txt",
#                           "locations": "../Domain_Dictionary/locations.txt",
#                           "minerals": "../Domain_Dictionary/minerals.txt",
#                           "ores_deposits": "../Domain_Dictionary/ores_deposits.txt",
#                           "rocks": "../Domain_Dictionary/rocks.txt",
#                           "stratigraphy": "../Domain_Dictionary/stratigraphy.txt"}

# Domain_Dictionary = {}

# for domain, path in Domain_Dictionary_Path.items():
#     with open(path, "r") as file:
#         info = file.read().splitlines()
#         if domain == "geological_timescales":
#             info = [i.lstrip().split()[0] for i in info]
#         info = [i for i in info if i.strip()] # Make sure there are no empty strings
#         Domain_Dictionary[domain] = info

#         print(info)
#         print(len(info))

In [28]:
def preprocess_and_tokenize(file_path):
    """
    Reads the file, preprocesses it, and tokenizes the data.
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    input_texts = []
    target_labels = []
    
    current_tokens = []
    current_labels = []
    
    for line in lines:
        line = line.strip()
        if line:  # Non-empty line
            token, label = line.split()
            current_tokens.append(token)
            current_labels.append(label)
        else:  # Empty line signifies end of a sentence
            if current_tokens:
                input_texts.append(' '.join(current_tokens))
                target_labels.append(' '.join(current_labels))
                # Reset for next sentence
                current_tokens = []
                current_labels = []

    # Add the last sentence if the data doesn't end with an empty line
    if current_tokens:
        input_texts.append(' '.join(current_tokens))
        target_labels.append(' '.join(current_labels))
    
    return input_texts, target_labels

In [29]:
# Example usage
file_path = 'Training_Data/AutoLabelledSet.txt'  # Replace with your file path
input_texts, target_labels = preprocess_and_tokenize(file_path)
print(input_texts[:5])
print(target_labels[:5])
print(len(input_texts), len(target_labels))

['the jubilee domain contains a complex sequence of ultramafic mafic rocks and interleaved sedimentary rocks , overlain by polymictic conglomerate .', 'extensive weathered banded quartz magnetite rocks occur throughout the project area .', 'the most common and significant metaliferrous rock types in the area are metamorphosed banded iron formations , ( bif ) and granular iron formations , ( gif ) .', 'these can be either completely oxidised medium to coarse grain hematite or more stable magnetite .', 'a major volcanic centre , defined by abundant felsic volcanics and quartz aluminosilicatechloritoid rocks ( considered to represent metamorphosed alteration assemblages ) , occurs in the central portion of this domain .']
['O B-LOCATION O O O O O O B-ROCK I-ROCK I-ROCK O O B-ROCK I-ROCK O O O B-ROCK I-ROCK O', 'O O O B-MINERAL B-MINERAL O O O O O O O', 'O O O O O B-ROCK I-ROCK O O O O O O B-ROCK I-ROCK I-ROCK O O B-ROCK O O B-ROCK I-ROCK I-ROCK O O B-ROCK O O', 'O O O O O O O O O O B-MINE

In [30]:
test_file = "Training_Data/EvaluationSet.txt"
test_inputs, test_labels = preprocess_and_tokenize(test_file)
print(test_inputs[:5])
print(test_labels[:5])
print(len(test_inputs), len(test_labels))

['The geology of the hole was dominated by felsic schists and granites .', 'The mineralisation was characterised by traces of disseminated pyrite with zones of trace pyrrhotite and chalcopyrite in felsic schist .', 'The best mineralisation was intersected in felsic schists below the interpreted position of the VTEM plate model .', 'The geology of the hole was ultramafic schists overlying amphibolite and amphibolitic schists with two 1 m wide weakly sulphidic quartz veins .', 'These quartz veins were characterised by green colouration with traces of magnetite and disseminated pyrite , however contained no anomalous chemistry .']
['O O O O O O O O B-ROCK I-ROCK O B-ROCK O', 'O O O O O O O O B-MINERAL O O O O B-MINERAL O B-MINERAL O B-ROCK I-ROCK O', 'O O O O O O B-ROCK I-ROCK O O O O O O O O O O', 'O O O O O O B-ROCK I-ROCK O B-ROCK O B-ROCK I-ROCK O O O O O O O B-ROCK I-ROCK O', 'O B-ROCK I-ROCK O O O O O O O O B-MINERAL O O B-MINERAL O O O O O O O']
2001 2001


In [31]:
def preprocess_and_tokenize_json(file_name):
    with open(file_name, 'r') as json_file:
        data = json.load(json_file)

    # Initialize lists for sentences and labels
    sentences = []
    labels = []

    # Process each entry in the JSON data
    for entry in data:
        output = entry['output']
        label = entry['labels']
        
        # Join the output list into a string sentence
        sentence = ' '.join(output).lower()
        
        # Join the labels list into a string
        label_str = ' '.join(label)
        
        # Append to lists
        sentences.append(sentence)
        labels.append(label_str)

    return sentences, labels

In [32]:
# Example usage
file_name = "Training_Data/DomainDictionary.json"  # Replace with your file path
dictionary_words, dictionary_labels = preprocess_and_tokenize_json(file_name)

random.seed(42)

print(dictionary_words[:5])
print(dictionary_labels[:5])
print(len(dictionary_words), len(dictionary_labels))

combined = list(zip(dictionary_words * 4, dictionary_labels * 4))

random.shuffle(combined)

dictionary_words , dictionary_labels = zip(*combined)
dictionary_words = list(dictionary_words)
dictionary_labels = list(dictionary_labels)

print(dictionary_words[:5])
print(dictionary_labels[:5])
print(len(dictionary_words), len(dictionary_labels))

['aalenian', 'abereiddian', 'acadian', 'actonian', 'adelaidean']
['B-TIMESCALE', 'B-TIMESCALE', 'B-TIMESCALE', 'B-TIMESCALE', 'B-TIMESCALE']
16920 16920
['cookernup', 'rayite', 'cobb formation', 'mapingian', 'grave dam grit']
['B-LOCATION', 'B-MINERAL', 'B-STRAT I-STRAT', 'B-TIMESCALE', 'B-STRAT I-STRAT I-STRAT']
67680 67680


In [33]:
dd_words = []
dd_labels = []

sentence = ""
labels = ""
num = 60
random.seed(42)
for i in range(len(dictionary_words)):
    word = dictionary_words[i]
    label = dictionary_labels[i]
    if len(sentence.split(" ")) + len(word.split(" ")) > num:
        dd_words.append(sentence)
        dd_labels.append(labels)
        sentence = ""
        labels = ""
        sentence += word
        labels += label
    else:
        if sentence == "":
            sentence += word
            labels += label
        else:
            sentence += " , " + word
            labels += " O " + label
        num = random.randint(40, 60)

print(len(dd_words))

3998


In [34]:
print(dd_words[:5])
print(dd_labels[:5])
print(len(dd_words), len(dd_labels))

print(len(dd_words[0].split(" ")))

['cookernup , rayite , cobb formation , mapingian , grave dam grit , inkamulla granodiorite , fowlerite , davidsmithite , green head , spaltiite , timber creek formation , freshwater limestone , munyi member , volcaniclastic conglomerate', 'mount caernarvon greywacke member , wadjemup formation , twin bonanza porphyry , colimaite , vitimite , bb19 , georgeite , sinter , mudnawatana tonalite , natrolite , parmelia formation , yarragadee formation , kalinjala mylonite , clinoptilolite k , mount andrew migmatite , tertiary , horn valley siltstone', 'childrenite , vincent member , tashelgite , meta ultramafic intrusive rock , monohydrocalcite , sterryite , crossroads granodiorite , bafertisite , magnesiocarbonatite , daly river group , yilgarn star , quartzofeldspathic schist , yangzhumingite , wirraway formation , pelsart limestone , mougooderra formation , paulingite k', 'pottsite , polylithionite , hechtsbergite , dougalls tonalite , riebeckite , mesoproterozoic granites 76633 , ankerit

In [35]:
# from sklearn.model_selection import train_test_split

# # Split data into training and testing sets
# train_texts, test_texts, train_labels, test_labels = train_test_split(
#     input_texts, target_labels, test_size=0.2, random_state=42
# )

# print(f"Number of training samples: {len(train_texts)}")
# print(f"Number of testing samples: {len(test_texts)}")

# print(train_texts[:5])

In [36]:
def tokenize_data(texts, labels, tokenizer):
    """
    Tokenizes the input texts and labels.
    """
    inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    targets = tokenizer(labels, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    
    # Ensure labels are the same length as inputs
    inputs['labels'] = targets['input_ids']
    
    return inputs


In [37]:
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx].clone().detach()
        return item

    def __len__(self):
        return len(self.labels)


In [38]:
# Training Data
train_input1 = [words for words in input_texts] 
# train_input.extend([words for words in geotime_text])
# train_input.extend([words for words in dd_words])

train_labels1 = [labels for labels in target_labels]
# train_labels.extend([labels for labels in geotime_labels])
# train_labels.extend([labels for labels in dd_labels])

train_input1 = [words for words in dd_words]
train_labels1 = [labels for labels in dd_labels]

# print(train_input[-1])


print(train_input1[-1])

print(train_labels1[-1])

print(len(train_input1))
print(len(train_labels1))
# print(len(train_input))
# print(len(train_labels))


sandstoneother siliciclastic rock , buckleboo granite , dollaseite ce , bokite , areyonga formation , indium , fülöppite , bottle creek gold mine , bottinoite , wattevilleite , dalgaranga dolerite , grischunite , gravelly mud , dravertite , rouse creek arenite , liddle formation , condenser dolerite , ellis formation , fluorbritholite y
B-ROCK I-ROCK I-ROCK O B-STRAT I-STRAT O B-MINERAL I-MINERAL O B-MINERAL O B-STRAT I-STRAT O B-MINERAL O B-MINERAL O B-LOCATION I-LOCATION I-LOCATION I-LOCATION O B-MINERAL O B-MINERAL O B-STRAT I-STRAT O B-MINERAL O B-ROCK I-ROCK O B-MINERAL O B-STRAT I-STRAT I-STRAT O B-STRAT I-STRAT O B-STRAT I-STRAT O B-STRAT I-STRAT O B-MINERAL I-MINERAL
3998
3998


## Training with AutoLabelledSet

In [39]:
# Initialize tokenizer
# tokenizer = AutoTokenizer.from_pretrained("t5-small")
tokenizer = AutoTokenizer.from_pretrained("dbmdz/t5-base-conll03-english", clean_up_tokenization_spaces=False)
# tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")

# Tokenize training and testing data
train_data1 = tokenize_data(train_input1, train_labels1, tokenizer)
test_data = tokenize_data(test_inputs, test_labels, tokenizer)


# Prepare datasets
train_dataset1 = NERDataset(train_data1, train_data1['labels'])
test_dataset = NERDataset(test_data, test_data['labels'])


train_loader1 = DataLoader(train_dataset1, batch_size=32, num_workers=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, num_workers=8)

# Print or return the shape of the inputs and labels
input_shape1 = train_data1['input_ids'].shape
label_shape1 = train_data1['input_ids'].shape

print(f"Input Shape: {input_shape1}")
print(f"Label Shape: {label_shape1}")

# Print or return the shape of the inputs and labels
test_input_shape = test_data['input_ids'].shape
test_label_shape = test_data['input_ids'].shape

print(f"Input Shape: {test_input_shape}")
print(f"Label Shape: {test_label_shape}")

Input Shape: torch.Size([3998, 256])
Label Shape: torch.Size([3998, 256])
Input Shape: torch.Size([2001, 256])
Label Shape: torch.Size([2001, 256])


In [40]:
# Initialize model
model = AutoModelForSeq2SeqLM.from_pretrained("dbmdz/t5-base-conll03-english").to(device)
# model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base").to(device)
# model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    gradient_accumulation_steps=8,
    save_steps=15000,
    eval_steps=15000,
    fp16=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Flatten the lists
    labels = labels.flatten()
    predictions = predictions.flatten()
    return {
        'f1': f1_score(labels, predictions, average='weighted')
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset1,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [41]:
# # Train the model
# trainer.train()

### Predict Entities

In [42]:
def tokenize_input(texts, tokenizer, max_length=256):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

def predict_entities(texts, model, tokenizer):
    inputs = tokenize_input(texts, tokenizer)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move inputs to GPU
    with torch.no_grad():  # Disable gradient calculation
        outputs = model.generate(**inputs, max_new_tokens=256)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


# Example geological text
geo_texts = [
    "the jubilee domain contains a complex sequence of ultramafic mafic rocks and more evolved rocks, due to magma mixing and fractional crystallization .",
]

geo_texts = ["yulleroo formation , yulleroo sub-basin , western australia .",]

# Predict geological entities
predictions = predict_entities(geo_texts, model, tokenizer)

# Print the results
print(predictions)
for text, prediction in zip(geo_texts, predictions):
    print(f"Text: {text}")
    print(f"Prediction: {prediction}")
    print()

['yulleroo formation , yulleroo sub-basin , western australia .']
Text: yulleroo formation , yulleroo sub-basin , western australia .
Prediction: yulleroo formation , yulleroo sub-basin , western australia .



In [43]:
# model.save_pretrained('./Models/6-GeoEntityNER')  # Custom directory for saving
# tokenizer.save_pretrained('./Models/6-GeoEntityNER')  # Save tokenizer as well

In [44]:
def get_model():
    save_directory = './Models/6-GeoEntityNER'
    # Load the tokenizer and model from the saved directory
    tokenizer = AutoTokenizer.from_pretrained(save_directory)
    model = AutoModelForSeq2SeqLM.from_pretrained(save_directory).to(device)

    return model, tokenizer

model, tokenizer = get_model()



In [45]:
# Predict geological entities
predictions = predict_entities(geo_texts, model, tokenizer)

# Print the results
print(predictions)
for text, prediction in zip(geo_texts, predictions):
    print(f"Text: {text}")
    print(f"Prediction: {prediction}")
    print()

['B-STRAT I-STRAT O B-STRAT I-STRAT O B-LOCATION I-LOCATION O']
Text: yulleroo formation , yulleroo sub-basin , western australia .
Prediction: B-STRAT I-STRAT O B-STRAT I-STRAT O B-LOCATION I-LOCATION O



In [46]:
texts = "The formation of the Rocky Mountains occurred around 70 Ma."

predictions = predict_entities(texts, model, tokenizer)

print(texts)
print(predictions)

print(len(predictions))


The formation of the Rocky Mountains occurred around 70 Ma.
['O O O O B-LOCATION I-LOCATION O O O B-GEO_TIME I-GEO_TIME O']
1


In [47]:
# # Load the tokenizer and model from the saved directory
# save_directory = "../Models/saved_model"
# tokenizer = AutoTokenizer.from_pretrained(save_directory)
# model = AutoModelForSeq2SeqLM.from_pretrained(save_directory).to(device)

# # Load the tokenizer and model from the saved directory
# save_directory = "./Models/double_trained_model"
# new_tokenizer = AutoTokenizer.from_pretrained(save_directory)
# new_model = AutoModelForSeq2SeqLM.from_pretrained(save_directory).to(device)

# Function to map 'B-' and 'I-' labels to their corresponding entity type
def get_entity_type(label):
    if label.startswith('B-') or label.startswith('I-'):
        return label[2:]  # Remove 'B-' or 'I-' prefix
    else:
        return label  # Keep the label as is

def tokenize_input(texts, tokenizer, max_length=256):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

def predict_entities(texts, model, tokenizer):
    inputs = tokenize_input(texts, tokenizer)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move inputs to GPU

    with torch.no_grad():  # Disable gradient calculation
        outputs = model.generate(**inputs, max_new_tokens=256)  # Generate output sequences

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

def evaluate_model(texts, true_labels, model, tokenizer):
    # Get predicted texts from the model
    texts = texts
    predicted_texts = []
    for text in texts:
        predicted_texts.append(predict_entities(text, model, tokenizer)[0])
    
    true_labels_flat = []
    pred_labels_flat = []

    for i in range(len(texts)):
        pred_tokens = predicted_texts[i].split()
        true_tokens = true_labels[i].split()
        
        for j in range(len(true_tokens)):
            # Skip 'O' labels and avoid out-of-bound errors in predictions
            if j < len(pred_tokens) and (true_tokens[j] != 'O' or pred_tokens[j] != 'O'):
                true_label = get_entity_type(true_tokens[j])
                pred_label = get_entity_type(pred_tokens[j])

                true_labels_flat.append(true_label)
                pred_labels_flat.append(pred_label)
                    
    # Calculate metrics
    accuracy = accuracy_score(true_labels_flat, pred_labels_flat)
    f1_micro = f1_score(true_labels_flat, pred_labels_flat, average='micro')
    f1_macro = f1_score(true_labels_flat, pred_labels_flat, average='macro')
    
    report = classification_report(
        true_labels_flat, pred_labels_flat, labels=[
            'LOCATION', 'MINERAL', 'ORE_DEPOSIT', 'ROCK', 'STRAT', 'TIMESCALE'
        ]
    )

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-score (Micro): {f1_micro:.4f}")
    print(f"F1-score (Macro): {f1_macro:.4f}")
    print("Classification Report:")
    print(report)

# Evaluate the model on the parsed test data
evaluate_model(test_inputs, test_labels, model, tokenizer)
# evaluate_model(test_inputs, test_labels, old_model, old_tokenizer)
# evaluate_model(test_inputs, test_labels, new_model, new_tokenizer)

Accuracy: 0.6507
F1-score (Micro): 0.6507
F1-score (Macro): 0.6750
Classification Report:
              precision    recall  f1-score   support

    LOCATION       0.58      0.85      0.69      1692
     MINERAL       0.78      0.86      0.82      1403
 ORE_DEPOSIT       0.88      0.78      0.83       682
        ROCK       0.82      0.74      0.78      2631
       STRAT       0.77      0.76      0.77      1417
   TIMESCALE       0.84      0.86      0.85       213

   micro avg       0.74      0.80      0.77      8038
   macro avg       0.78      0.81      0.79      8038
weighted avg       0.76      0.80      0.77      8038



```
Accuracy: 0.6507
F1-score (Micro): 0.6507
F1-score (Macro): 0.6750
Classification Report:
              precision    recall  f1-score   support

    LOCATION       0.58      0.85      0.69      1692
     MINERAL       0.78      0.86      0.82      1403
 ORE_DEPOSIT       0.88      0.78      0.83       682
        ROCK       0.82      0.74      0.78      2631
       STRAT       0.77      0.76      0.77      1417
   TIMESCALE       0.84      0.86      0.85       213

   micro avg       0.74      0.80      0.77      8038
   macro avg       0.78      0.81      0.79      8038
weighted avg       0.76      0.80      0.77      8038
```