In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import numpy as np
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer


import subprocess


import sys

In [3]:
# Ensure the German SpaCy model is installed
try:
    nlp = spacy.load("de_core_news_sm")
except OSError:
    print("Downloading 'de_core_news_sm' model...")
    subprocess.run([sys.executable, "-m", "spacy", "download", "de_core_news_sm"], check=True)
    nlp = spacy.load("de_core_news_sm")

In [4]:
def parse_rst_to_segments(rst_content):
    """
    Parses the RST content in XML format and extracts segments as a list of dictionaries.
    Each dictionary contains the text of a segment.
    """
    segments = []
    root = ET.fromstring(rst_content)
    body = root.find("body")
    if body is not None:
        for segment in body.findall("segment"):
            text = segment.text.strip() if segment.text else ""
            if text:
                segments.append({"text": text})
    return segments

In [5]:
def extract_features(text):
    """
    Extracts linguistic and statistical features from text.
    """
    doc = nlp(text)
    
    # Part-of-speech tags (POS)
    pos_tags = [token.pos_ for token in doc]
    
    # Dependency relations
    dependencies = [token.dep_ for token in doc]
    
    # Sentence length
    sentence_length = len(doc)
    
    # Count of punctuation marks
    punctuation_count = sum(1 for token in doc if token.is_punct)
    
    return {
        "pos_tags": pos_tags,
        "dependencies": dependencies,
        "sentence_length": sentence_length,
        "punctuation_count": punctuation_count,
    }


In [6]:
def prepare_segmentation_data_full_files(rst_files):
    """
    Processes entire RST files as single segments while preserving segmentation labels.
    Ensures that input sequences do not exceed model max length.
    Also extracts linguistic and statistical features for each file.
    """
    tokens_list = []
    labels_list = []
    
    pos_list = []
    dep_list = []
    lengths = []
    punct_counts = []
    
    label_map = {"B-EDU": 0, "I-EDU": 1, "O": 2}  # Label mapping
    max_length = 512  # Model max length

    for rst_file in rst_files:
        try:
            with open(rst_file, 'r', encoding='utf-8') as file:
                rst_content = file.read()
                segments = parse_rst_to_segments(rst_content)
                
                full_text = " ".join([segment['text'] for segment in segments])
                tokens = tokenizer.tokenize(full_text)
                
                # Ensure tokens do not exceed max_length
                if len(tokens) > max_length:
                    tokens = tokens[:max_length - 2]  # Reserve space for special tokens
                
                labels = []
                current_index = 0
                for segment in segments:
                    segment_tokens = tokenizer.tokenize(segment['text'])
                    if segment_tokens:
                        labels.append(label_map['B-EDU'])
                        labels.extend([label_map['I-EDU']] * (len(segment_tokens) - 1))
                        current_index += len(segment_tokens)
                    
                    # Stop adding segments if we exceed max length
                    if current_index >= max_length:
                        break
                
                labels = labels[:max_length - 2]  # Truncate labels if necessary
                
                # Extract linguistic/statistical features
                features = extract_features(full_text)
                pos_list.append(features["pos_tags"])
                dep_list.append(features["dependencies"])
                lengths.append(features["sentence_length"])
                punct_counts.append(features["punctuation_count"])

                tokens_list.append(tokens)
                labels_list.append(labels)

        except Exception as e:
            print(f"Error processing {rst_file}: {e}")

    return {
        "tokens": tokens_list,
        "labels": labels_list,
        "pos_tags": pos_list,
        "dependencies": dep_list,
        "sentence_length": lengths,
        "punctuation_count": punct_counts
    }


In [7]:
def prepare_segmentation_data_from_folder_full(folder_path):
    """
    Prepares segmentation data from all RST files in a folder while preserving segmentation labels.
    """
    rst_files = [
        os.path.join(folder_path, file_name)
        for file_name in os.listdir(folder_path)
        if file_name.endswith(".rs3") and not file_name.startswith(".") and os.path.isfile(os.path.join(folder_path, file_name))
    ]
    
    return prepare_segmentation_data_full_files(rst_files)

In [8]:

# Initialize tokenizer
model_name = "xlm-roberta-large"  # or "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function_full(examples):
    """
    Tokenizes full-text inputs while maintaining correct segmentation label alignment.
    Ensures inputs do not exceed max sequence length.
    """
    max_length = 512
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        is_split_into_words=True,
        max_length=max_length,  # Ensure truncation
    )

    all_labels = []
    for i, labels in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = [-100] * len(word_ids)  # Initialize with -100 for padding
        
        previous_word_id = None
        label_index = 0

        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                continue
            
            if previous_word_id is None or word_id != previous_word_id:
                if label_index < len(labels):
                    aligned_labels[idx] = labels[label_index]
                label_index += 1
            else:
                if label_index - 1 < len(labels):
                    aligned_labels[idx] = labels[label_index - 1]

            previous_word_id = word_id

        all_labels.append(aligned_labels[:max_length])  # Ensure labels do not exceed max length

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs



In [9]:
rst_folder_path = "data/pcc-main/rs3_no_aug/"
segmentation_full_text= prepare_segmentation_data_from_folder_full(rst_folder_path)
segmentation_full_text_dataset = Dataset.from_dict(segmentation_full_text)
segmentation_fulltext_tokenized = segmentation_full_text_dataset.map(tokenize_function_full, batched=True, batch_size=16)


Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 454/454 [00:08<00:00, 53.07 examples/s]


In [10]:
# Train-test split

train_test_split_data = segmentation_fulltext_tokenized.train_test_split(test_size=0.2)

seg_train_test = DatasetDict({"train": train_test_split_data["train"], "test": train_test_split_data["test"]})



In [11]:
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, max_length=128)


In [12]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)  # Get the predicted class for each token
    
    # Flatten the arrays to compute metrics at the token level
    labels_flat = labels.flatten()
    preds_flat = preds.flatten()
    
    # Filter out ignored index (-100) if applicable
    mask = labels_flat != -100
    labels_filtered = labels_flat[mask]
    preds_filtered = preds_flat[mask]
    
    # Compute metrics
    precision, recall, f1, _ = precision_recall_fscore_support(labels_filtered, preds_filtered, average="weighted")
    accuracy = accuracy_score(labels_filtered, preds_filtered)
    
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [None]:

# Custom Trainer for Weighted Loss
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        outputs = model(**inputs)
        logits = outputs.logits
        labels = inputs["labels"]

        # Define class weights (adjust if needed)
        loss_weights = torch.tensor([2.0, 1.0, 1.0]).to(logits.device)

        # Compute weighted loss
        loss_function = torch.nn.CrossEntropyLoss(weight=loss_weights, ignore_index=-100)
        loss = loss_function(logits.view(-1, logits.size(-1)), labels.view(-1))

        return (loss, outputs) if return_outputs else loss
        
model_name = "xlm-roberta-large"
segmentation_model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=3, device_map="auto",)  # B-EDU, I-EDU, O
segmentation_model.gradient_checkpointing_enable()


# Step 6: Training Arguments
seg_training_args = TrainingArguments(
    output_dir="./Models/segmentation_full_text_model_large_feat",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,  # Reduced batch size
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    gradient_accumulation_steps=4,  # Gradient accumulation
    fp16=True,  # Mixed precision
)

# Step 7: Trainer Setup
seg_trainer = CustomTrainer(
    model=segmentation_model,
    args=seg_training_args,
    train_dataset=seg_train_test["train"],
    eval_dataset=seg_train_test["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



# Step 8: Training
seg_trainer.train()

# Step 9: Evaluation
seg_results = seg_trainer.evaluate()

print("Segmentation Evaluation:", seg_results)



# Save Model
seg_trainer.model.save_pretrained("./Models/segmentation_full_text_model_large_feat")


In [None]:
def prepare_segmentation_test_data(folder_path):
    """
    Prepare test data for the segmentation model from all files in a folder,
    ensuring the format is consistent with the training labels.
    Also extracts linguistic and statistical features per file.
    Skips hidden files and unreadable files.
    """
    all_tokens = []
    all_labels = []

    pos_list = []
    dep_list = []
    lengths = []
    punct_counts = []

    label_map = {"B-EDU": 0, "I-EDU": 1, "O": 2}  # Mapping of label strings to integers
    max_length = 512

    for file_name in os.listdir(folder_path):
        if file_name.startswith("."):
            continue

        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue

        file_tokens = []
        file_labels = []
        full_text_parts = []

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    parts = line.strip().split(" ")
                    text = " ".join(parts[1:-1]).strip("[]")
                    full_text_parts.append(text)

                    segment_tokens = tokenizer.tokenize(text)
                    if not segment_tokens:
                        continue
                    
                    # Ensure max length is not exceeded
                    if len(file_tokens) + len(segment_tokens) > max_length - 2:
                        break
                    
                    file_tokens.extend(segment_tokens)
                    file_labels.append(label_map['B-EDU'])  # First token as B-EDU
                    file_labels.extend([label_map['I-EDU']] * (len(segment_tokens) - 1))
            
            full_text = " ".join(full_text_parts)
            features = extract_features(full_text)

            pos_list.append(features["pos_tags"])
            dep_list.append(features["dependencies"])
            lengths.append(features["sentence_length"])
            punct_counts.append(features["punctuation_count"])

            all_tokens.append(file_tokens)
            all_labels.append(file_labels)
            
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
    
    return {
        "tokens": all_tokens,
        "labels": all_labels,
        "pos_tags": pos_list,
        "dependencies": dep_list,
        "sentence_length": lengths,
        "punctuation_count": punct_counts
    }


In [None]:
# Example usage
folder_path = "./data/Essays_dataset"  # Replace with your dataset folder path
test_data_full = prepare_segmentation_test_data(folder_path)  # Adjust chunk size as needed
tokenized_test_dataset = Dataset.from_dict(test_data_full)

# Apply tokenization function correctly
tokenized_test_dataset_full = tokenized_test_dataset.map(tokenize_function_full, batched=True)

In [None]:
print(test_data_full['labels'][0])

In [None]:
model = AutoModelForTokenClassification.from_pretrained("./Models/segmentation_full_text_model_large_feat")

model.eval()

In [None]:
# Predict on Test Data
# Tokenize the test data



# Predict on Test Data
predictions = []
no_in = []
with torch.no_grad():
    for input_ids, attention_mask in tqdm(zip(tokenized_test_dataset_full["input_ids"], tokenized_test_dataset_full["attention_mask"])):
        input_ids = torch.tensor([input_ids])
        attention_mask = torch.tensor([attention_mask])

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1).squeeze().tolist()

        # Filter out padding tokens
        filtered_preds = [
            p for p, mask in zip(preds, attention_mask.squeeze().tolist()) if mask == 1
        ]
        no_in_pred = [
            p for p, mask,in_id in zip(preds, attention_mask.squeeze().tolist(),input_ids.squeeze().tolist()) if mask == 1 and in_id not in [0,2,1]
        ]
        predictions.append(filtered_preds)
        no_in.append(no_in_pred)

# Map Predictions Back to Labels
inverse_label_map = {0: "B-EDU", 1: "I-EDU", 2: "O"}
predicted_labels = []
for pred in predictions:
    predicted_labels.append([inverse_label_map[label] for label in pred])

# Flatten labels for evaluation
true_labels_flat = []
predicted_labels_flat = []
for true, pred, attention_mask in zip(tokenized_test_dataset_full["labels"], predictions, tokenized_test_dataset_full["attention_mask"]):
    for t, p, mask in zip(true, pred, attention_mask):
        if mask == 1 and t != -100:  # Exclude padding and special tokens
            true_labels_flat.append(t)
            predicted_labels_flat.append(p)

# Compute Metrics
precision, recall, f1, _ = precision_recall_fscore_support(true_labels_flat, predicted_labels_flat, average="weighted")
accuracy = accuracy_score(true_labels_flat, predicted_labels_flat)


print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")

correct the input of the test data..
Make one with large
make example sythetic example and the example from manfred and 
this weekend we should start making the first half of the report or at least put the main structure!!

In [8]:

def segment_text(text, output_filename):
    """
    Segment text based on model predictions and save to a file.
    """
    encoded_text = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    
    with torch.no_grad():
        outputs = model(**encoded_text)
    
    predictions = torch.argmax(outputs.logits, dim=-1)
    label_map = {0: "B-EDU", 1: "I-EDU", 2: "O"}  # Adjust based on training labels
    predicted_labels = [label_map[label.item()] for label in predictions[0]]
    
    tokens = tokenizer.convert_ids_to_tokens(encoded_text["input_ids"][0])
    segmented_text = ""
    
    for token, label in zip(tokens, predicted_labels):
        if label == "B-EDU" and segmented_text:
            segmented_text += "\n**\n"  # New line for each segment
        segmented_text += token + " "
    
    segmented_text = segmented_text.strip()
    output_folder = 'segmented'
    output_path = os.path.join(output_folder, output_filename)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(segmented_text)
    
    return output_path


In [10]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

# Example German test text
test_text = """Franz Kafka, von neuem gewürdigt Vor gut zwanzig Jahren, im Sommer 1924, starb Franz Kafka im Alter von 44 Jahren.
Während der folgenden Jahre wuchs sein Ruf ständig in Deutschland und Österreich,
seit 1930 auch in Frankreich, England und Amerika. Merkwürdigerweise stimmen seine Bewunderer in diesen Ländern trotz starker Uneinigkeit über den eigentlichen Sinn seines Werkes in einem wesentlichen Punkte überein: 
alle sind betroffen von dem Neuartigen seiner Erzählerkunst, von etwas spezifisch Modernem,
das sonst nirgends in der gleichen Stärke und Unzweideutigkeit erscheint.
Dies ist erstaunlich,
da Kafka - in auffälligem Gegensatz zu anderen Lieblingsschriftstellern der Intellektuellen - keinerlei technische Experimente vornahm.
Ohne die deutsche Sprache in irgendeiner Weise zu verändern,
entkleidete er sie ihrer verwickelten Satzkonstruktionen,
bis sie klar und einfach wurde wie die Umgangssprache,
wenn sie von Nachlässigkeiten und Jargon gereinigt ist.
Die Einfachheit und mühelose Natürlichkeit seiner Sprache mögen darauf hinweisen, daß Kafkas Modernität und die Schwierigkeit seines Werkes wenig mit jener modernen Komplikation des inneren Lebens zu tun haben,
die immer auf der Suche nach neuen und einmaligen Techniken ist,
um neue und einmalige Gefühle auszudrücken.
Das gemeinsame Erlebnis der Leser Kafkas ist eine allgemeine, unbestimmbare Bezauberung, sogar bei Erzählungen,
die sie nicht verstehen, eine klare Erinnerung an merkwürdige und scheinbar unsinnige Bilder und Beschreibungen, -
bis sich ihnen eines Tages der verborgene Sinn mit der plötzlichen Deutlichkeit einer einfachen und unangreifbaren Wahrheit enthüllt.
Beginnen wir mit dem Roman Der Prozeß,
über den eine kleine Bibliothek von Auslegungen veröffentlicht worden ist.
Es ist die Geschichte eines Mannes,
dem der Prozeß gemacht wird nach Gesetzen,
die er nicht entdecken kann,
und der schließlich hingerichtet wird,
ohne daß er herausfinden konnte, um was es sich dabei handelte.
Auf der Suche nach dem wahren Grund seiner Qual erfährt er, daß dahinter »eine große Organisation sich befindet.
Eine Organisation, die nicht nur bestechliche Wächter, läppische Aufseher und Untersuchungsrichter ... beschäftigt,
sondern die weiterhin jedenfalls eine Richterschaft hohen und höchsten Grades unterhält, mit dem zahllosen, unumgänglichen Gefolge von Dienern, Schreibern, Gendarmen und andern Hilfskräften, vielleicht sogar Henkern ....«
Er nimmt sich einen Rechtsanwalt,
der ihm sofort sagt, das einzig Vernünftige sei, sich den bestehenden Zuständen anzupassen
und sie nicht zu kritisieren.
Er wendet sich um Rat an den Gefängnispfarrer,
und der Geistliche predigt die verborgene Größe des Systems
und befiehlt ihm, nicht nach der Wahrheit zu fragen,
denn »man muß nicht alles für wahr halten, man muß es nur für notwendig halten.«
»Trübselige Meinung«,
sagte K.
»Die Lüge wird zur Weltordnung gemacht.«
"""

In [21]:
model = AutoModelForTokenClassification.from_pretrained("Models/segmentation_model_base_4_chunks_features_aug/")


In [22]:
directory_test = '4_text_base.txt'
segment_text(test_text, directory_test)


'segmented/4_text_base.txt'

In [None]:
# Tokenize the text
encoded_text = tokenizer(test_text, truncation=True, padding=True, return_tensors="pt")

print("Tokenized Text:", encoded_text)


# Run inference
with torch.no_grad():
    outputs = model(**encoded_text)

# Get predictions
predictions = torch.argmax(outputs.logits, dim=-1)

# Convert back to labels
label_map = {0: "B-EDU", 1: "I-EDU", 2: "O"}  # Adjust this based on your training labels
predicted_labels = [label_map[label.item()] for label in predictions[0]]

# Print segmented output
tokens = tokenizer.convert_ids_to_tokens(encoded_text["input_ids"][0])
for token, label in zip(tokens, predicted_labels):
    print(f"{token}: {label}")
