# Training 6-GeoEntity Model

In [1]:
import os
import time
import json
import torch
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, f1_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments


# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
# Ask PyTorch's caching allocator to release unused GPU memory, then
# (re)select the compute device and show which one is in use.
torch.cuda.empty_cache()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


### Prepare Training Data

In [3]:
def preprocess_and_tokenize(file_path):
    """
    Read a two-column token/label file and group it into sentences.

    Every non-blank line must contain exactly two whitespace-separated
    fields: a token followed by its label. Blank (or whitespace-only) lines
    mark the end of a sentence. No model tokenization happens here; the
    tokens of each sentence are simply joined with single spaces, and so are
    the labels.

    Args:
        file_path: Path of the annotation file. It is decoded as UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        A tuple ``(input_texts, target_labels)`` of two parallel lists of
        strings, one entry per sentence.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            fields. The message names the file and the 1-based line number.
    """
    input_texts = []
    target_labels = []

    current_tokens = []
    current_labels = []

    def flush_sentence():
        # Emit the sentence collected so far (if any) and reset the buffers.
        if current_tokens:
            input_texts.append(' '.join(current_tokens))
            target_labels.append(' '.join(current_labels))
            current_tokens.clear()
            current_labels.clear()

    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate lazily instead of materialising the whole file in memory.
        for line_number, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:  # Blank line signifies end of a sentence
                flush_sentence()
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}: line {line_number}: expected 'token label', "
                    f"got {len(fields)} field(s): {line.strip()!r}"
                )
            token, label = fields
            current_tokens.append(token)
            current_labels.append(label)

    # Add the last sentence if the data doesn't end with an empty line
    flush_sentence()

    return input_texts, target_labels

In [4]:
# Load the auto-labelled training file and preview the first five
# sentence/label pairs together with the total counts.
file_path = 'Training_Data/AutoLabelledSet.txt'  # Replace with your file path
input_texts, target_labels = preprocess_and_tokenize(file_path)
print(input_texts[:5], target_labels[:5], sep='\n')
print(len(input_texts), len(target_labels))

['the jubilee domain contains a complex sequence of ultramafic mafic rocks and interleaved sedimentary rocks , overlain by polymictic conglomerate .', 'extensive weathered banded quartz magnetite rocks occur throughout the project area .', 'the most common and significant metaliferrous rock types in the area are metamorphosed banded iron formations , ( bif ) and granular iron formations , ( gif ) .', 'these can be either completely oxidised medium to coarse grain hematite or more stable magnetite .', 'a major volcanic centre , defined by abundant felsic volcanics and quartz aluminosilicatechloritoid rocks ( considered to represent metamorphosed alteration assemblages ) , occurs in the central portion of this domain .']
['O B-LOCATION O O O O O O B-ROCK I-ROCK I-ROCK O O B-ROCK I-ROCK O O O B-ROCK I-ROCK O', 'O O O B-MINERAL B-MINERAL O O O O O O O', 'O O O O O B-ROCK I-ROCK O O O O O O B-ROCK I-ROCK I-ROCK O O B-ROCK O O B-ROCK I-ROCK I-ROCK O O B-ROCK O O', 'O O O O O O O O O O B-MINE

In [5]:
# Load the evaluation file and preview it exactly as read from disk.
test_file = "Training_Data/EvaluationSet.txt"
test_inputs, test_labels = preprocess_and_tokenize(test_file)
print(test_inputs[:5], test_labels[:5], sep='\n')
print(len(test_inputs), len(test_labels))

# Lower-case every evaluation sentence (the labels are left untouched).
# NOTE(review): presumably done to match lower-cased training text - confirm.
test_inputs = [sentence.lower() for sentence in test_inputs]

print(test_inputs[:5])

['The geology of the hole was dominated by felsic schists and granites .', 'The mineralisation was characterised by traces of disseminated pyrite with zones of trace pyrrhotite and chalcopyrite in felsic schist .', 'The best mineralisation was intersected in felsic schists below the interpreted position of the VTEM plate model .', 'The geology of the hole was ultramafic schists overlying amphibolite and amphibolitic schists with two 1 m wide weakly sulphidic quartz veins .', 'These quartz veins were characterised by green colouration with traces of magnetite and disseminated pyrite , however contained no anomalous chemistry .']
['O O O O O O O O B-ROCK I-ROCK O B-ROCK O', 'O O O O O O O O B-MINERAL O O O O B-MINERAL O B-MINERAL O B-ROCK I-ROCK O', 'O O O O O O B-ROCK I-ROCK O O O O O O O O O O', 'O O O O O O B-ROCK I-ROCK O B-ROCK O B-ROCK I-ROCK O O O O O O O B-ROCK I-ROCK O', 'O B-ROCK I-ROCK O O O O O O O O B-MINERAL O O B-MINERAL O O O O O O O']
2001 2001
['the geology of the hole 

In [6]:
def preprocess_and_tokenize_json(file_name):
    """
    Load sentences and label strings from a JSON annotation file.

    The file is expected to hold an iterable of entries, each providing an
    ``'output'`` list of token strings and a ``'labels'`` list of label
    strings. Tokens are joined with single spaces and lower-cased; labels
    are joined with single spaces and kept as they are.

    Args:
        file_name: Path of the JSON file. It is decoded as UTF-8 so the
            result does not depend on the platform's default encoding.

    Returns:
        A tuple ``(sentences, labels)`` of two parallel lists of strings,
        one element per entry in the file.

    Raises:
        KeyError: If an entry lacks the ``'output'`` or ``'labels'`` key.
    """
    with open(file_name, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    sentences = []
    labels = []

    for entry in data:
        # Read both fields first so a malformed entry never leaves the two
        # result lists with different lengths.
        output = entry['output']
        label = entry['labels']

        sentences.append(' '.join(output).lower())
        labels.append(' '.join(label))

    return sentences, labels

In [7]:
def tokenize_data(texts, labels, tokenizer, max_length=256):
    """
    Tokenize source sentences and their label strings for seq2seq training.

    Both sides are padded/truncated to ``max_length`` and returned as
    PyTorch tensors. In the label ids, every padding position is replaced by
    -100, the index that Hugging Face seq2seq losses ignore, so padding does
    not contribute to the training loss.

    Args:
        texts: List of input sentences.
        labels: List of target label strings, parallel to ``texts``.
        tokenizer: Callable tokenizer returning a mapping with an
            ``'input_ids'`` tensor. Its ``pad_token_id`` attribute, when
            present and not None, identifies the padding positions.
        max_length: Fixed sequence length for inputs and targets
            (defaults to 256, the value previously hard-coded).

    Returns:
        The tokenizer output for ``texts`` with an added ``'labels'`` entry
        holding the masked target ids.
    """
    inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    targets = tokenizer(labels, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

    # Work on a copy so the tokenizer's own output is not modified.
    label_ids = targets['input_ids'].clone()

    # Mask padding so the loss only covers real target tokens.
    pad_token_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_token_id is not None:
        label_ids[label_ids == pad_token_id] = -100

    inputs['labels'] = label_ids

    return inputs


In [8]:
class NERDataset(torch.utils.data.Dataset):
    """
    Map-style dataset over pre-tokenized encodings and their label ids.

    ``encodings`` is a mapping from field name to an indexable tensor and
    ``labels`` is an indexable tensor; both are indexed by example position.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the label container.
        return len(self.labels)

    def __getitem__(self, idx):
        # Hand out detached copies so callers cannot modify the stored tensors.
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx].clone().detach()
        # The explicit labels always win over any 'labels' entry in encodings.
        item['labels'] = self.labels[idx].clone().detach()
        return item


In [9]:
# Training Data: shallow copies of the parsed sentences and label strings.
train_input = list(input_texts)
train_labels = list(target_labels)

# Show the final example and confirm both lists have the same length.
print(train_input[-1], train_labels[-1], len(train_input), len(train_labels), sep='\n')

the southern portion of the forrestania greenstone belt lies 17 km to the north .
O O O O O B-LOCATION B-STRAT I-STRAT O O O O O O O
31943
31943


## Training with AutoLabelledSet

In [10]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/t5-base-conll03-english", clean_up_tokenization_spaces=False)

# Tokenize training and testing data
train_data = tokenize_data(train_input, train_labels, tokenizer)
test_data = tokenize_data(test_inputs, test_labels, tokenizer)


# Prepare datasets
train_dataset = NERDataset(train_data, train_data['labels'])
test_dataset = NERDataset(test_data, test_data['labels'])


# NOTE(review): these loaders are not referenced by the other visible cells;
# the Trainer below is handed the datasets directly.
train_loader1 = DataLoader(train_dataset, batch_size=32, num_workers=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, num_workers=8)

# Report the tensor shapes. The label shape is read from the 'labels' entry
# (it used to be read from 'input_ids', so it could never reveal a mismatch).
input_shape1 = train_data['input_ids'].shape
label_shape1 = train_data['labels'].shape

print(f"Input Shape: {input_shape1}")
print(f"Label Shape: {label_shape1}")

# Same report for the evaluation split.
test_input_shape = test_data['input_ids'].shape
test_label_shape = test_data['labels'].shape

print(f"Input Shape: {test_input_shape}")
print(f"Label Shape: {test_label_shape}")

Input Shape: torch.Size([31943, 256])
Label Shape: torch.Size([31943, 256])
Input Shape: torch.Size([2001, 256])
Label Shape: torch.Size([2001, 256])


In [11]:
# Initialize model: load the pretrained seq2seq checkpoint, then move it to the device.
model = AutoModelForSeq2SeqLM.from_pretrained("dbmdz/t5-base-conll03-english")
model = model.to(device)

# Define training arguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # Which phases are enabled
    do_train=True,
    do_eval=True,
    do_predict=True,
    # Optimisation schedule: 4 sequences per step, accumulated over 8 steps
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    fp16=True,
    # Logging, evaluation and checkpoint cadence (in steps)
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=15000,
    save_steps=15000,
)


def compute_metrics(eval_pred):
    """
    Compute a token-level weighted F1 score for the Trainer's evaluation loop.

    Args:
        eval_pred: A pair ``(predictions, label_ids)``. ``predictions`` is
            either the logits array or a tuple whose first element is the
            logits array (a model that returns further outputs next to the
            logits yields such a tuple).

    Returns:
        A dict with a single key ``'f1'``.
    """
    logits, labels = eval_pred
    # Keep only the logits when the model returned extra outputs with them;
    # np.argmax cannot be applied to the tuple as a whole.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    # Flatten the arrays
    labels = np.asarray(labels).flatten()
    predictions = predictions.flatten()
    # Positions labelled -100 are ignored by the loss, so they are excluded
    # from the score as well. This is a no-op when no such label is present.
    keep = labels != -100
    return {
        'f1': f1_score(labels[keep], predictions[keep], average='weighted')
    }

# Initialize Trainer: combine the model, its training arguments, both dataset
# splits and the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [12]:
# # Train the model
# trainer.train()

In [13]:
save_directory = './Models/6-GeoEntityNER'

if os.path.exists(save_directory):
    # A saved copy already exists: load tokenizer and model from it.
    tokenizer = AutoTokenizer.from_pretrained(save_directory)
    model = AutoModelForSeq2SeqLM.from_pretrained(save_directory).to(device)
    print(f'Model and tokenizer loaded from {save_directory}')
else:
    # Nothing saved yet: write the in-memory model and tokenizer to disk.
    # NOTE(review): this saves whatever `model` currently holds. With the
    # training call commented out, that is the checkpoint as loaded, not a
    # fine-tuned model - confirm this is intended before relying on it.
    model.save_pretrained(save_directory)
    tokenizer.save_pretrained(save_directory)
    print(f'Model and tokenizer saved to {save_directory}')

Model and tokenizer loaded from ./Models/6-GeoEntityNER


### Predict Entities

In [14]:
def tokenize_input(texts, tokenizer, max_length=256):
    """
    Encode ``texts`` with ``tokenizer`` as fixed-length PyTorch tensors.

    Sequences are truncated and padded to exactly ``max_length`` tokens.
    Returns whatever the tokenizer call returns.
    """
    encoded = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    return encoded

def predict_entities(texts, model, tokenizer):
    """
    Generate label sequences for ``texts`` and decode them to strings.

    The inputs are tokenized with ``tokenize_input``, moved to the
    module-level ``device``, passed to ``model.generate`` (at most 256 new
    tokens) with gradients disabled, and decoded without special tokens.

    Returns:
        The list of decoded strings produced by ``tokenizer.batch_decode``.
    """
    encoded = tokenize_input(texts, tokenizer)

    # Move every tensor of the encoding to the target device.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated = model.generate(**on_device, max_new_tokens=256)

    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded


# Example geological text (a one-element batch)
geo_texts = [
    "the jubilee domain contains a complex sequence of ultramafic mafic rocks and more evolved rocks, due to magma mixing and fractional crystallization .",
]

# Predict geological entities
predictions = predict_entities(geo_texts, model, tokenizer)

# Show the raw list first, then each sentence next to its predicted labels,
# with a blank line after every pair.
print(predictions)
for text, prediction in zip(geo_texts, predictions):
    print(f"Text: {text}", f"Prediction: {prediction}", "", sep="\n")

['O B-LOCATION O O O O O O B-ROCK I-ROCK I-ROCK O O O O O O O O O O O O O']
Text: the jubilee domain contains a complex sequence of ultramafic mafic rocks and more evolved rocks, due to magma mixing and fractional crystallization .
Prediction: O B-LOCATION O O O O O O B-ROCK I-ROCK I-ROCK O O O O O O O O O O O O O



In [15]:
# Predict on a single sentence passed as a plain string rather than a list.
texts = "The formation of the Rocky Mountains occurred around 70 Ma."

predictions = predict_entities(texts, model, tokenizer)

print(texts, predictions, sep='\n')


The formation of the Rocky Mountains occurred around 70 Ma.
['O O O O B-LOCATION I-LOCATION O O O B-GEO_TIME I-GEO_TIME O']


In [16]:
# Reload the saved model (moved to the active device) and its tokenizer
# from the save directory.
save_directory = './Models/6-GeoEntityNER'
model = AutoModelForSeq2SeqLM.from_pretrained(save_directory).to(device)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Function to map 'B-' and 'I-' labels to their corresponding entity type
def get_entity_type(label):
    """
    Strip a leading ``'B-'`` or ``'I-'`` prefix from ``label``.

    Labels without either prefix (for example ``'O'``) are returned unchanged.
    """
    return label[2:] if label.startswith(('B-', 'I-')) else label

def tokenize_input(texts, tokenizer, max_length=256):
    """
    Encode ``texts`` as fixed-length PyTorch tensors on the active device.

    Sequences are truncated and padded to exactly ``max_length`` tokens; the
    tokenizer output is then moved to the module-level ``device``.
    """
    encoded = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    return encoded.to(device)

def predict_entities(texts, model, tokenizer):
    """
    Generate label sequences for ``texts`` and decode them to strings.

    The inputs are tokenized with ``tokenize_input``, every tensor is moved
    to the module-level ``device``, and ``model.generate`` is run (at most
    256 new tokens) with gradients disabled.

    Returns:
        The list of strings from ``tokenizer.batch_decode`` with special
        tokens skipped.
    """
    encoded = tokenize_input(texts, tokenizer)

    # Move every tensor of the encoding to the target device.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated = model.generate(**on_device, max_new_tokens=256)  # Generate output sequences

    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded

def evaluate_model(texts, true_labels, model, tokenizer, batch_size=1):
    """
    Run the model over ``texts`` and collect one predicted string per text.

    Progress is printed each time another 100 texts have been processed,
    and the total wall-clock time is printed at the end.

    Args:
        texts: Sequence of input sentences (must support ``len`` and slicing).
        true_labels: Not used by this function; kept so existing call sites
            remain valid.
        model: Model forwarded to ``predict_entities``.
        tokenizer: Tokenizer forwarded to ``predict_entities``.
        batch_size: Number of texts sent to ``predict_entities`` per call.
            The default of 1 reproduces the previous one-text-at-a-time
            behavior; larger values reduce the number of generate calls.

    Returns:
        List of predicted strings, in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    size = len(texts)
    predicted_texts = []
    start_time = time.time()
    for i in range(0, size, batch_size):
        # Report whenever this batch start has crossed a multiple of 100;
        # with batch_size == 1 this is exactly "i % 100 == 0 and i > 0".
        if i > 0 and i // 100 > (i - batch_size) // 100:
            elapsed_time = time.time() - start_time
            avg_time_per_text = elapsed_time / i
            remaining_time = avg_time_per_text * (size - i)
            print(f"{i} / {size} - Elapsed time: {elapsed_time:.2f}s, Estimated remaining time: {remaining_time:.2f}s")

        batch = list(texts[i:i + batch_size])
        # predict_entities returns one decoded string per input text.
        predicted_texts.extend(predict_entities(batch, model, tokenizer))

    total_time = time.time() - start_time
    print(f"Total evaluation time: {total_time:.2f}s")

    return predicted_texts
    
# Evaluate the model on the evaluation sentences, keeping the raw predicted strings.
predicted_texts = evaluate_model(
    texts=test_inputs,
    true_labels=test_labels,
    model=model,
    tokenizer=tokenizer,
)

100 / 2001 - Elapsed time: 72.75s, Estimated remaining time: 1382.90s
200 / 2001 - Elapsed time: 178.10s, Estimated remaining time: 1603.75s
300 / 2001 - Elapsed time: 263.39s, Estimated remaining time: 1493.43s
400 / 2001 - Elapsed time: 345.28s, Estimated remaining time: 1381.99s
500 / 2001 - Elapsed time: 435.71s, Estimated remaining time: 1307.99s
600 / 2001 - Elapsed time: 532.57s, Estimated remaining time: 1243.56s
700 / 2001 - Elapsed time: 634.76s, Estimated remaining time: 1179.75s
800 / 2001 - Elapsed time: 722.16s, Estimated remaining time: 1084.15s
900 / 2001 - Elapsed time: 804.73s, Estimated remaining time: 984.46s
1000 / 2001 - Elapsed time: 892.17s, Estimated remaining time: 893.06s
1100 / 2001 - Elapsed time: 974.85s, Estimated remaining time: 798.49s
1200 / 2001 - Elapsed time: 1068.25s, Estimated remaining time: 713.06s
1300 / 2001 - Elapsed time: 1149.82s, Estimated remaining time: 620.02s
1400 / 2001 - Elapsed time: 1239.87s, Estimated remaining time: 532.26s
1500 

In [17]:
true_labels = test_labels
print(len(true_labels), len(predicted_texts))

# Parallel, flattened lists of entity types (B-/I- prefixes removed) for every
# position where the gold label or the predicted label is not 'O'.
true_labels_flat = []
pred_labels_flat = []

for i in range(len(test_inputs)):
    pred_tokens = predicted_texts[i].split()
    true_tokens = true_labels[i].split()

    # A prediction shorter than the gold sequence means the model produced no
    # label for the trailing tokens. Treat those positions as 'O' so that gold
    # entities there count as misses instead of being dropped from the
    # report, which would overstate recall.
    if len(pred_tokens) < len(true_tokens):
        pred_tokens += ['O'] * (len(true_tokens) - len(pred_tokens))

    # zip stops at the gold length, so surplus predicted labels are ignored.
    for true_token, pred_token in zip(true_tokens, pred_tokens):
        # Skip positions where both sides agree on 'O'
        if true_token != 'O' or pred_token != 'O':
            true_labels_flat.append(get_entity_type(true_token))
            pred_labels_flat.append(get_entity_type(pred_token))

# Calculate metrics (restricted to the six entity types listed below)
report = classification_report(
    true_labels_flat, pred_labels_flat, labels=[
        'LOCATION', 'MINERAL', 'ORE_DEPOSIT', 'ROCK', 'STRAT', 'TIMESCALE'
    ]
)

print("Classification Report:")
print(report)

2001 2001
Classification Report:
              precision    recall  f1-score   support

    LOCATION       0.63      0.61      0.62      1692
     MINERAL       0.74      0.81      0.77      1403
 ORE_DEPOSIT       0.81      0.74      0.78       682
        ROCK       0.75      0.72      0.74      2634
       STRAT       0.83      0.72      0.77      1420
   TIMESCALE       0.79      0.77      0.78       213

   micro avg       0.74      0.72      0.73      8044
   macro avg       0.76      0.73      0.74      8044
weighted avg       0.74      0.72      0.73      8044

