In [225]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [226]:
torch.cuda.empty_cache()

In [227]:
from transformers import T5Tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

def preprocess_and_tokenize(file_path):
    """
    Reads the file, preprocesses it, and tokenizes the data.
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    input_texts = []
    target_labels = []
    
    current_tokens = []
    current_labels = []
    
    for line in lines:
        line = line.strip()
        if line:  # Non-empty line
            token, label = line.split()
            current_tokens.append(token)
            current_labels.append(label)
        else:  # Empty line signifies end of a sentence
            if current_tokens:
                input_texts.append(' '.join(current_tokens))
                target_labels.append(' '.join(current_labels))
                # Reset for next sentence
                current_tokens = []
                current_labels = []

    # Add the last sentence if the data doesn't end with an empty line
    if current_tokens:
        input_texts.append(' '.join(current_tokens))
        target_labels.append(' '.join(current_labels))
    
    return input_texts, target_labels


# Example usage
file_path = 'NER/Training_Data/AutoLabelledSet.txt'  # Replace with your file path
input_texts, target_labels = preprocess_and_tokenize(file_path)
print(input_texts[:5])
print(target_labels[:5])

['the jubilee domain contains a complex sequence of ultramafic mafic rocks and interleaved sedimentary rocks , overlain by polymictic conglomerate .', 'extensive weathered banded quartz magnetite rocks occur throughout the project area .', 'the most common and significant metaliferrous rock types in the area are metamorphosed banded iron formations , ( bif ) and granular iron formations , ( gif ) .', 'these can be either completely oxidised medium to coarse grain hematite or more stable magnetite .', 'a major volcanic centre , defined by abundant felsic volcanics and quartz aluminosilicatechloritoid rocks ( considered to represent metamorphosed alteration assemblages ) , occurs in the central portion of this domain .']
['O B-LOCATION O O O O O O B-ROCK I-ROCK I-ROCK O O B-ROCK I-ROCK O O O B-ROCK I-ROCK O', 'O O O B-MINERAL B-MINERAL O O O O O O O', 'O O O O O B-ROCK I-ROCK O O O O O O B-ROCK I-ROCK I-ROCK O O B-ROCK O O B-ROCK I-ROCK I-ROCK O O B-ROCK O O', 'O O O O O O O O O O B-MINE

In [228]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    input_texts, target_labels, test_size=0.2, random_state=42
)

print(f"Number of training samples: {len(train_texts)}")
print(f"Number of testing samples: {len(test_texts)}")

print(train_texts[:5])

Number of training samples: 25554
Number of testing samples: 6389
['4.2 local geology the project is underlain entirely by rocks of the coolgardie domain , a sub domain of the kalgoorlie terrane .', 'the abundant crystals or phenocrysts are totally altered feldspars and biotite plus the quartz .', 'recent work by blina resources nl has announced the discovery of numerous diamond bearing palaeochannels on their tenements held in joint venture with the kimberley diamond company immediately to the east of nwds kimberley downs prospects .', 'exploration comprised prospecting , rock sampling and stream sediment geochemistry .', 'historical drilling and surface sampling gold deposit legend historical drilling historical surface sampling cheroona well project 2006 annual report page 26 of 35 5.0 previous work gleneagle gold limited during the previous reporting period from 20th july 2005 to 31st march 2006 , exploration conducted by gleneagle gold limited included the processing of geophysica

In [229]:
# Initialize tokenizer
# tokenizer = T5Tokenizer.from_pretrained("t5-small")
tokenizer = AutoTokenizer.from_pretrained("t5-base")

def tokenize_data(texts, labels):
    """
    Tokenizes the input texts and labels.
    """
    inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    targets = tokenizer(labels, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    
    # Ensure labels are the same length as inputs
    inputs['labels'] = targets['input_ids']
    
    return inputs

# Tokenize training and testing data
train_data = tokenize_data(train_texts, train_labels)
test_data = tokenize_data(test_texts, test_labels)

# Print tokenized data for verification
print("Training Input IDs:", train_data['input_ids'])
print("Training Labels:", train_data['labels'])
print("Testing Input IDs:", test_data['input_ids'])
print("Testing Labels:", test_data['labels'])


config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Training Input IDs: tensor([[    3, 19765,   415,  ...,     0,     0,     0],
        [    8, 16346,  6884,  ...,     0,     0,     0],
        [ 1100,   161,    57,  ...,     0,     0,     0],
        ...,
        [   48,  3303,    65,  ...,     0,     0,     0],
        [    8, 11508,     9,  ...,     0,     0,     0],
        [ 2724,  4012,  2155,  ...,     0,     0,     0]])
Training Labels: tensor([[411, 411, 411,  ...,   0,   0,   0],
        [411, 411, 272,  ...,   0,   0,   0],
        [411, 411, 411,  ...,   0,   0,   0],
        ...,
        [411, 411, 411,  ...,   0,   0,   0],
        [411, 272,  18,  ...,   0,   0,   0],
        [411, 411, 411,  ...,   0,   0,   0]])
Testing Input IDs: tensor([[   12,     8,  3457,  ...,     0,     0,     0],
        [    3,     9,  4727,  ...,     0,     0,     0],
        [ 8282,  1467,    13,  ...,     0,     0,     0],
        ...,
        [    8,  3731,   159,  ...,     0,     0,     0],
        [ 1877,  2122,  2252,  ...,     0,     

In [230]:
import torch

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx].clone().detach()
        return item

    def __len__(self):
        return len(self.labels)


In [231]:
# Prepare datasets
train_dataset = NERDataset(train_data, train_data['labels'])
test_dataset = NERDataset(test_data, test_data['labels'])

from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=32, num_workers=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, num_workers=16)



In [232]:
from transformers import Trainer, TrainingArguments, T5ForConditionalGeneration

# Initialize model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base").to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    logging_dir='./logs',
    logging_steps=20,
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    gradient_accumulation_steps=8,
    fp16=True,
)


# Define metrics computation
from sklearn.metrics import f1_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Flatten the lists
    labels = labels.flatten()
    predictions = predictions.flatten()
    return {
        'f1': f1_score(labels, predictions, average='weighted')
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)



model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [233]:
# Train the model
trainer.train()

  0%|          | 0/198 [00:00<?, ?it/s]

In [191]:
def tokenize_data(texts, tokenizer, max_length=256):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

def predict_entities(texts, model, tokenizer):
    inputs = tokenize_data(texts, tokenizer)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move inputs to GPU
    with torch.no_grad():  # Disable gradient calculation
        outputs = model.generate(**inputs)
    print("Raw outputs:", outputs)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


# Example geological text
geo_texts = [
    "the jubilee domain contains a complex sequence of ultramafic mafic rocks and more evolved rocks",
]

# Predict geological entities
predictions = predict_entities(geo_texts, model, tokenizer)

# Print the results
for text, prediction in zip(geo_texts, predictions):
    print(f"Text: {text}")
    print(f"Prediction: {prediction}")
    print()

Raw outputs: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       device='cuda:0')
Text: the jubilee domain contains a complex sequence of ultramafic mafic rocks and more evolved rocks
Prediction: 



In [224]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch


def preprocess_text(text):
    """
    Preprocess text to ensure punctuation is separated from words.
    """
    import re
    text = re.sub(r'([,.!?])', r' \1 ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize_data(texts, tokenizer, max_length=256):
    """
    Tokenizes the input texts and labels.
    """
    processed_texts = [preprocess_text(text) for text in texts]
    print(processed_texts)
    return tokenizer(processed_texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

def decode_predictions(predictions, tokenizer):
    """
    Decode token IDs into labels.
    """
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    return decoded_preds

def predict_entities(texts, model, tokenizer):
    """
    Predicts entities using the model and tokenizer.
    """
    inputs = tokenize_data(texts, tokenizer)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    
    # Generate predictions
    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
        print(outputs)
        print(len(outputs))
    
    # Decode predictions
    predictions = decode_predictions(outputs, tokenizer)
    print(predictions)
    return predictions

# Example geological text
geo_texts = [
    "the jubilee domain contains a complex sequence of ultramafic mafic rocks and more evolved rocks.",
]

# Predict geological entities
predictions = predict_entities(geo_texts, model, tokenizer)

# Print the results
for text, prediction in zip(geo_texts, predictions):
    print(f"Text: {text}")
    print(f"Prediction: {prediction}")
    print()


['the jubilee domain contains a complex sequence of ultramafic mafic rocks and more evolved rocks .']
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       device='cuda:0')
1
['']
Text: the jubilee domain contains a complex sequence of ultramafic mafic rocks and more evolved rocks.
Prediction: 



In [None]:
print(train_dataset[0])

{'input_ids': tensor([    3, 19765,   415,   873,  1863,     8,   516,    19,   365,   521,
           77,  4585,    57, 12288,    13,     8,  1633,  1478,  2498,  3303,
            3,     6,     3,     9,   769,  3303,    13,     8,     3,  4766,
          839,   127,  1896,     3, 12829,    29,    15,     3,     5,     1,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [200]:
model.save_pretrained('./saved_model')  # Custom directory for saving
tokenizer.save_pretrained('./saved_model')  # Save tokenizer as well

('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\spiece.model',
 './saved_model\\added_tokens.json')

In [None]:

# Get the vocabulary as a dictionary
vocab = tokenizer.get_vocab()


# Print the first 100 items of the vocabulary for brevity
print({k: vocab[k] for k in list(vocab.keys())[:100]})


print(list(vocab.keys())[272])

{'<pad>': 0, '</s>': 1, '<unk>': 2, '▁': 3, 'X': 4, '.': 5, ',': 6, 's': 7, '▁the': 8, 'a': 9, ':': 10, '▁and': 11, '▁to': 12, '▁of': 13, '▁fill': 14, 'e': 15, '▁in': 16, 't': 17, '-': 18, '▁is': 19, '▁de': 20, '▁for': 21, '’': 22, 'i': 23, '▁that': 24, '▁you': 25, 'd': 26, '▁I': 27, '▁with': 28, 'n': 29, '▁on': 30, "'": 31, 'o': 32, '▁are': 33, '▁it': 34, 'en': 35, '▁be': 36, '▁The': 37, '▁as': 38, '▁your': 39, 'l': 40, '▁(': 41, '▁or': 42, '▁have': 43, '▁at': 44, '▁from': 45, '▁an': 46, '▁was': 47, '▁this': 48, 'er': 49, '▁la': 50, 'm': 51, 'r': 52, 'ing': 53, '▁can': 54, '!': 55, '▁will': 56, '▁by': 57, '?': 58, '▁not': 59, 're': 60, ')': 61, '▁we': 62, 'y': 63, '▁und': 64, '▁has': 65, '▁all': 66, '▁die': 67, '▁but': 68, '▁our': 69, '▁their': 70, '▁A': 71, '▁more': 72, '▁un': 73, '▁der': 74, 'c': 75, 'u': 76, 'in': 77, '▁so': 78, '▁they': 79, '▁one': 80, '▁about': 81, '▁my': 82, 'ul': 83, '▁which': 84, 'à': 85, '▁In': 86, '/': 87, 'he': 88, 'f': 89, '▁le': 90, '▁out': 91, '▁also': 9