In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
from nltk.corpus import reuters

In [3]:
import nltk
from nltk.corpus import reuters
import pandas as pd

# Download necessary datasets
nltk.download('reuters')
nltk.download('punkt')

def reuters_to_dataframe():
    data = []
    for file_id in reuters.fileids():  # Loop through all articles
        categories = ', '.join(reuters.categories(file_id))  # Join categories with commas
        content = reuters.raw(file_id).strip()  # Strip unnecessary whitespace
        data.append({"File ID": file_id, "Categories": categories, "Content": content})
    
    # Convert the data into a DataFrame
    df = pd.DataFrame(data)
    return df

# Generate the table for the entire dataset
df_reuters = reuters_to_dataframe()

# Print the dataset summary and head
print(f"Total rows: {len(df_reuters)}")
print(df_reuters.head())  # Display the first few rows

# Optional: Save to a CSV file
df_reuters.to_csv("reuters_dataset_full.csv", index=False)


[nltk_data] Downloading package reuters to
[nltk_data]     /Users/sridhrutitikkisetti/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sridhrutitikkisetti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Total rows: 10788
      File ID                                    Categories  \
0  test/14826                                         trade   
1  test/14828                                         grain   
2  test/14829                                crude, nat-gas   
3  test/14832  corn, grain, rice, rubber, sugar, tin, trade   
4  test/14833                             palm-oil, veg-oil   

                                             Content  
0  ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...  
1  CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...  
2  JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...  
3  THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n  ...  
4  INDONESIA SEES CPO PRICE RISING SHARPLY\n  Ind...  


In [4]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, BertForTokenClassification, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd

# Tokenizer setup
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Prepare the Reuters dataset for training
def preprocess_reuters_data(df):
    sentences, labels = [], []
    causal_indicators = ["caused by", "led to", "due to", "triggered", "resulted in", "owing to"]

    for _, row in df.iterrows():
        content = row["Content"]
        category = row["Categories"]

        # Tokenize content into sentences
        for sentence in sent_tokenize(content):
            sentences.append(sentence)
            # Simulated label: Check for causal indicators
            label = int(any(indicator in sentence.lower() for indicator in causal_indicators))
            labels.append(label)

    return sentences, labels

# Load and preprocess the dataset
sentences, labels = preprocess_reuters_data(df_reuters)
print(f"Preprocessed {len(sentences)} sentences.")


tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Preprocessed 53792 sentences.


In [5]:
class ReutersDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]

        # Tokenize the sentence
        encoding = self.tokenizer(
            sentence,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Create Dataset and DataLoader
dataset = ReutersDataset(sentences, labels, tokenizer, max_len=128)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


In [6]:
from transformers import AdamW

# Load the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1} - Loss: {total_loss / len(dataloader)}")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Loss: 0.03399642606510377
Epoch 2 - Loss: 0.01021203353680375
Epoch 3 - Loss: 0.00817001307374709


In [7]:
# Directory to save the model
save_directory = "bert_event_trigger_model"

# Save the model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")


Model and tokenizer saved to bert_event_trigger_model


In [None]:
#to load the model again
from transformers import BertTokenizer, BertForSequenceClassification

# Load the saved model and tokenizer
loaded_model = BertForSequenceClassification.from_pretrained(save_directory)
loaded_tokenizer = BertTokenizer.from_pretrained(save_directory)

print("Model and tokenizer loaded successfully")
#to load the model again

In [None]:
#to use the trained model
# Example usage
input_sentence = "The market crash was caused by unexpected inflation."

# Tokenize the input
inputs = loaded_tokenizer(
    input_sentence, return_tensors="pt", truncation=True, padding="max_length", max_length=128
)

# Make predictions
loaded_model.eval()
with torch.no_grad():
    outputs = loaded_model(**inputs)

# Get predicted label
predicted_label = torch.argmax(outputs.logits, dim=1).item()
label_map = {0: "Non-Causal", 1: "Causal"}
print(f"Predicted label: {label_map[predicted_label]}")


In [8]:
# Load BERT for token classification
model_args = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=3)  # Adjust labels for arguments

# Create token-level labels (0: O, 1: Trigger, 2: Argument)
def generate_token_labels(sentence, causal_indicators):
    tokens = tokenizer.tokenize(sentence)
    labels = [0] * len(tokens)

    for indicator in causal_indicators:
        if indicator in sentence.lower():
            indicator_tokens = tokenizer.tokenize(indicator)
            start_idx = sentence.lower().find(indicator)

            for i, token in enumerate(tokens):
                token_start = sentence.find(token)
                if start_idx <= token_start < start_idx + len(indicator):
                    labels[i] = 1  # Mark as Trigger
                else:
                    labels[i] = 2  # Mark as Argument
    return tokens, labels

# Generate token-level labels
token_sentences, token_labels = [], []
for sentence in sentences:
    tokens, labels = generate_token_labels(sentence, ["caused by", "led to"])
    token_sentences.append(tokens)
    token_labels.append(labels)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from sklearn.metrics import classification_report

# Inference and evaluation
model.eval()
true_labels, pred_labels = [], []

for batch in dataloader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits.argmax(dim=1).cpu().numpy()
    true_labels.extend(batch["label"].cpu().numpy())
    pred_labels.extend(logits)

print(classification_report(true_labels, pred_labels, target_names=["Non-Causal", "Causal"]))


              precision    recall  f1-score   support

  Non-Causal       1.00      1.00      1.00     52682
      Causal       0.96      0.99      0.97      1110

    accuracy                           1.00     53792
   macro avg       0.98      0.99      0.99     53792
weighted avg       1.00      1.00      1.00     53792



              precision    recall  f1-score   support

  Non-Causal       1.00      1.00      1.00     52682
      Causal       0.96      0.99      0.97      1110

    accuracy                           1.00     53792
   macro avg       0.98      0.99      0.99     53792
weighted avg       1.00      1.00      1.00     53792

