In [13]:
import pandas as pd
import json

train_data = pd.read_csv(r"D:\event_extraction\data\train.tsv",sep = "\t")  
def format_data(train_data):
    processed_data = []
    for _, row in train_data.iterrows():
        input_text = row['event_mention']
        output_text = row['event'] if row['event'] != "ND" else "No Event"
        processed_data.append({"input_text": f"extract events: {input_text}", "output_text": output_text})
    return processed_data


processed_data = format_data(train_data)

with open(r"D:\event_extraction\data\train_data.json", "w") as f:
    json.dump(processed_data, f, indent=4)

print("Data preprocessing complete! Saved as t5_formatted_data.json.")


Data preprocessing complete! Saved as t5_formatted_data.json.


In [14]:
import pandas as pd
import json

test_data = pd.read_csv(r"D:\event_extraction\data\test.tsv",sep = "\t")  
def format_data(test_data):
    processed_data = []
    for _, row in test_data.iterrows():
        input_text = row['event_mention']
        output_text = row['event'] if row['event'] != "ND" else "No Event"
        processed_data.append({"input_text": f"extract events: {input_text}", "output_text": output_text})
    return processed_data


processed_data = format_data(train_data)

with open(r"D:\event_extraction\data\test_data.json", "w") as f:
    json.dump(processed_data, f, indent=4)

print("Data preprocessing complete! Saved as t5_formatted_data.json.")


Data preprocessing complete! Saved as t5_formatted_data.json.


In [15]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset

# Load the preprocessed dataset
dataset = load_dataset("json", data_files=r"D:\event_extraction\data\train_data.json")
testset = load_dataset("json", data_files=r"D:\event_extraction\data\test_data.json")

# Load T5 tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:


# Tokenize dataset
def tokenize_data(example):
    model_inputs = tokenizer(
        example["input_text"], 
        padding="max_length", 
        truncation=True, 
        max_length=512
    )
    
    labels = tokenizer(
        example["output_text"], 
        padding="max_length", 
        truncation=True, 
        max_length=512
    )["input_ids"]
    
    # Replace padding token id with -100 to ignore in loss computation
    labels = [label if label != tokenizer.pad_token_id else -100 for label in labels]
    model_inputs["labels"] = labels
    return model_inputs

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_data, batched=True)
tokenized_testset = testset.map(tokenize_data, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./t5_event_extractor",
    evaluation_strategy="steps",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="loss"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_testset["train"],  # Use testset for evaluation
)

# Train model
trainer.train()

# Save the model
model.save_pretrained("./t5_event_extractor")
tokenizer.save_pretrained("./t5_event_extractor")

print("Fine-tuning complete! Model saved.")


Map:   0%|          | 0/36620 [00:00<?, ? examples/s]

Map:   0%|          | 0/36620 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,0.6516,0.107638


In [11]:
import torch

def extract_events(sentence):
    model.eval()  # Set model to evaluation mode
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    input_text = f"extract events: {sentence}"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    
    # Generate output with beam search
    output_ids = model.generate(
        inputs.input_ids, 
        attention_mask=inputs.attention_mask, 
        max_length=512, 
        num_beams=3, 
        no_repeat_ngram_size=2, 
        early_stopping=True
    )
    
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    return output_text

# Example test
test_sentence = "MEK5/ERK5 signaling regulation il33 modulates endothelial cell migration and focal contact turnover."
print("Extracted Events:", extract_events(test_sentence))


Extracted Events: Extraction events: MEK5 signaling regulation il33 modulates endothelial cell migration and focal contact turnover.
