In [1]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

def parse_rs3(file_path):
    """
    Parse a single .rs3 file to extract relations, segments, and groups.
    """
    # Parse the .rs3 file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Extract relations from the header
    relations = {}
    for rel in root.find("header").find("relations").findall("rel"):
        rel_name = rel.get("name")
        rel_type = rel.get("type")
        relations[rel_name] = rel_type

    # Extract segments and groups from the body
    segments = []
    groups = []
    for elem in root.find("body"):
        if elem.tag == "segment":
            segments.append({
                "file": os.path.basename(file_path),
                "id": elem.get("id"),
                "text": elem.text.strip() if elem.text else "",
                "parent": elem.get("parent"),
                "relname": elem.get("relname")
            })
        elif elem.tag == "group":
            groups.append({
                "file": os.path.basename(file_path),
                "id": elem.get("id"),
                "type": elem.get("type"),
                "parent": elem.get("parent"),
                "relname": elem.get("relname")
            })

    return relations, segments, groups

def parse_folder(folder_path):
    """
    Parse all .rs3 files in a folder and combine the extracted data into DataFrames.
    """
    all_relations = {}
    all_segments = []
    all_groups = []

    # Iterate over all files in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".rs3"):  # Process only .rs3 files
            file_path = os.path.join(folder_path, file_name)
            relations, segments, groups = parse_rs3(file_path)

            # Combine data
            all_relations[file_name] = relations
            all_segments.extend(segments)
            all_groups.extend(groups)

    # Convert to DataFrames
    segments_df = pd.DataFrame(all_segments)
    groups_df = pd.DataFrame(all_groups)

    return all_relations, segments_df, groups_df

# Example usage
folder_path = "pcc-main/rs3"
relations, segments_df, groups_df = parse_folder(folder_path)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Save to CSV for later use
segments_df.to_csv("parsed_segments.csv", index=False)
groups_df.to_csv("parsed_groups.csv", index=False)

# Example: Filter segments with a specific relation type
filtered_segments = segments_df[segments_df["relname"] == "cause"]


In [3]:
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize the text column
segments_df["tokenized"] = segments_df["text"].apply(
    lambda x: tokenizer(x, truncation=True, padding="max_length", max_length=512)
)




In [4]:
from sklearn.preprocessing import LabelEncoder

# Map relation names to numerical labels
label_encoder = LabelEncoder()
segments_df["label"] = label_encoder.fit_transform(segments_df["relname"])

# Save the mapping for future use
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))


In [5]:
from sklearn.model_selection import train_test_split

train_texts, test_texts, train_labels, test_labels = train_test_split(
    segments_df["text"], segments_df["label"], test_size=0.2, random_state=42
)


In [6]:
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize the training and test texts
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=512)



In [7]:
import torch

class RelationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Create datasets
train_dataset = RelationDataset(train_encodings, train_labels.tolist())
test_dataset = RelationDataset(test_encodings, test_labels.tolist())

In [8]:
from transformers import AutoModelForSequenceClassification

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(label_mapping)  # Number of unique labels
)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.o

In [9]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",           # Output directory
    evaluation_strategy="epoch",      # Evaluate after each epoch
    save_strategy="epoch",            # Save checkpoints after each epoch
    learning_rate=1e-4,               # Learning rate
    per_device_train_batch_size=16,   # Batch size for training
    per_device_eval_batch_size=16,    # Batch size for evaluation
    num_train_epochs=10,               # Number of epochs
    weight_decay=0.01,                # Weight decay
    save_total_limit=2,               # Save only the last 2 checkpoints
    load_best_model_at_end=True,      # Load the best model at the end of training
    metric_for_best_model="accuracy" # Use accuracy for evaluation
                   # Use GPU if available
)

In [10]:
from sklearn.metrics import accuracy_score
import numpy as np
def compute_metrics(eval_pred):
    """
    Compute accuracy without using the datasets library.
    """
    logits, labels = eval_pred  # Logits are raw predictions from the model
    predictions = np.argmax(logits, axis=-1)  # Take the class with the highest score for each sample
    accuracy = accuracy_score(labels, predictions)  # Compute accuracy score
    return {"accuracy": accuracy}

In [11]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 4177
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2620
  Number of trainable parameters = 278065180
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.471929,0.380861
2,2.425200,2.463716,0.380861
3,2.425200,2.469627,0.380861
4,2.393800,2.446098,0.380861
5,2.393800,2.451947,0.380861
6,2.389600,2.444575,0.380861
7,2.389600,2.442722,0.380861
8,2.390600,2.446479,0.380861
9,2.390600,2.440046,0.380861
10,2.363400,2.418998,0.380861


***** Running Evaluation *****
  Num examples = 1045
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-262
Configuration saved in ./results/checkpoint-262/config.json
Model weights saved in ./results/checkpoint-262/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-262/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-262/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-156] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1045
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-524
Configuration saved in ./results/checkpoint-524/config.json
Model weights saved in ./results/checkpoint-524/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-524/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-524/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-1560] due to args.save_total_limit
***** Running Evaluation ****