# Process the datasets for masking 

In [13]:
from datasets import concatenate_datasets, load_dataset, Dataset
import os
import re
import json 
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch 
import numpy as np
import evaluate

In [14]:
def clean_sentence(sentence):
    suffix = r'(\$_\S*)'
    sentence = re.sub(suffix, '', sentence)
    sentence = sentence.replace("$$", "")
    sentence = sentence.replace("[", "")
    sentence = sentence.replace("]", "")
    sentence = sentence.replace("  ", " ")
    suffix2 = r'_[^\s]*'
    sentence = re.sub(suffix2, '', sentence)
    # remove spaces before punctuation
    pattern = r'\s+([.,;?!:])'
    sentence = re.sub(pattern, r'\1', sentence)
    # remove weird ``
    sentence = re.sub(r'``', '"', sentence)
    sentence = re.sub(r"''", '"', sentence)
    sentence = sentence.replace("\/", "")
    return sentence

def process_jsonl(input_file, output_file, col, gol):
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'a', encoding='utf-8') as outfile:
        
        for line in infile:
            data = json.loads(line)
            data[col] = clean_sentence(data[col])
            data[gol] = clean_sentence(data[gol])
            outfile.write(json.dumps(data) + '\n')

def add_other_golds(input_file, output_file, sentcol, goldcol, finalgoldcol): # only tiger + tüba
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'a', encoding='utf-8') as outfile:
        for line in infile:
            data = json.loads(line)
            if data[goldcol] != " ":
                json.dump({sentcol: data[sentcol], finalgoldcol: data[goldcol], "FCR": data["FCR"], "Gapping": data["Gapping"], "BCR": data["BCR"], "SGF": data["SGF"]}, outfile)
                outfile.write("\n")

In [15]:
# DELETE FILES BEFORE RUNNING THIS AGAIN!

# print("Getting other gold standards!")
# print("Tiger Train")
# input_file = '/Users/marisa/data/tiger_train.jsonl'
# output_file = '/Users/marisa/data/ALL_tiger_train.jsonl'

# add_other_golds(input_file, output_file, "Original sentence", "gold2 (LCO)", "Canonical form")
# add_other_golds(input_file, output_file, "Original sentence", "Canonical form", "Canonical form")

# print("Tiger Test")
# input_file = '/Users/marisa/data/tiger_test.jsonl'
# output_file = '/Users/marisa/data/ALL_tiger_test.jsonl'

# add_other_golds(input_file, output_file, "Original sentence", "gold2 (LCO)", "Canonical form")
# add_other_golds(input_file, output_file, "Original sentence", "Canonical form", "Canonical form")

# print("TüBa Train")
# input_file = '/Users/marisa/data/tüba_train.jsonl'
# output_file = '/Users/marisa/data/ALL_tüba_train.jsonl'

# add_other_golds(input_file, output_file, "Treebank-Sentence", "Gold_1", "Reconstructed-Sentence")
# add_other_golds(input_file, output_file, "Treebank-Sentence", "Gold_2", "Reconstructed-Sentence")
# add_other_golds(input_file, output_file, "Treebank-Sentence", "Gold_3", "Reconstructed-Sentence")
# add_other_golds(input_file, output_file, "Treebank-Sentence", "Reconstructed-Sentence", "Reconstructed-Sentence")
# print("TüBa Test")
# input_file = '/Users/marisa/data/tüba_test.jsonl'
# output_file = '/Users/marisa/data/ALL_tüba_test.jsonl'

# add_other_golds(input_file, output_file, "Treebank-Sentence", "Gold_1", "Reconstructed-Sentence")
# add_other_golds(input_file, output_file, "Treebank-Sentence", "Gold_2", "Reconstructed-Sentence")
# add_other_golds(input_file, output_file, "Treebank-Sentence", "Gold_3", "Reconstructed-Sentence")
# add_other_golds(input_file, output_file, "Treebank-Sentence", "Reconstructed-Sentence", "Reconstructed-Sentence")

In [16]:
train_data1 = os.path.expanduser("~/data/ALL_tiger_train.jsonl")
train_data2 = os.path.expanduser("~/data/ALL_tüba_train.jsonl")

train_dataset1 = load_dataset("json", data_files=train_data1, split='train')
train_dataset2 = load_dataset("json", data_files=train_data2, split='train')
train_dataset2 = train_dataset2.rename_column("Treebank-Sentence", "Original sentence")
train_dataset2 = train_dataset2.rename_column("Reconstructed-Sentence", "Canonical form")

# cols_to_check = ['BCR', 'FCR', 'Gapping', 'SGF']
# print(train_dataset1.num_rows)
# train_dataset1 = train_dataset1.filter(lambda row: not all(row[col] == "0" for col in cols_to_check))
# print(train_dataset1.num_rows)
# print(train_dataset2.num_rows)
# train_dataset2 = train_dataset2.filter(lambda row: not all(row[col] == "0" for col in cols_to_check))
# print(train_dataset2.num_rows)
train_dataset = concatenate_datasets([train_dataset1, train_dataset2])
print("Got train data")

t = "Original sentence"
g = "Canonical form"

test_data1 = os.path.expanduser("~/data/ALL_tiger_test.jsonl")
test_data2 = os.path.expanduser("~/data/ALL_tüba_test.jsonl")
test_dataset1 = load_dataset("json", data_files=test_data1, split='train')
test_dataset2 = load_dataset("json", data_files=test_data2, split='train')
test_dataset2 = test_dataset2.rename_column("Treebank-Sentence", "Original sentence")
test_dataset2 = test_dataset2.rename_column("Reconstructed-Sentence", "Canonical form")        
test_dataset = concatenate_datasets([test_dataset1, test_dataset2])
print("Got test data")

Got train data
Got test data


In [17]:
print(train_dataset)
print(test_dataset)

Dataset({
    features: ['Original sentence', 'Canonical form', 'FCR', 'Gapping', 'BCR', 'SGF'],
    num_rows: 6920
})
Dataset({
    features: ['Original sentence', 'Canonical form', 'FCR', 'Gapping', 'BCR', 'SGF'],
    num_rows: 1739
})


In [18]:
def insert_extra_id(incomplete, complete):
    incomplete_tokens = incomplete.split()
    complete_tokens = complete.split()
    
    result = []
    extra_id_counter = 0
    i = 0  # Pointer for incomplete_tokens
    j = 0  # Pointer for complete_tokens
    while j < len(complete_tokens):
        if i < len(incomplete_tokens) and incomplete_tokens[i] == complete_tokens[j]:
            result.append(complete_tokens[j])
            i += 1
        else:
            # Collect missing span until tokens match again
            while j < len(complete_tokens) and (i >= len(incomplete_tokens) or complete_tokens[j] != incomplete_tokens[i]):
                j += 1
            result.append(f'<extra_id_{extra_id_counter}>')
            extra_id_counter += 1
            continue
        j += 1
    
    return " ".join(result)

In [19]:
# Verarbeitung des gesamten Datensatzes
processed_dataset = []
for data in train_dataset:
    incomplete = data['Original sentence']
    complete = data['Canonical form']
    processed_input = insert_extra_id(incomplete, complete)
    
    processed_dataset.append({
        "Masked": processed_input,
        "Target": complete,
        "FCR": data["FCR"],
        "Gapping": data["Gapping"],
        "BCR": data["BCR"],
        "SGF": data["SGF"]
    })

# for entry in processed_dataset:
#     print("Input with extra_id:", entry['Masked'])
#     print("Target:", entry['Target'])
#     print("---")

masked_dataset = Dataset.from_list(processed_dataset)
print(masked_dataset[1000])

processed_test_dataset = []
for data in test_dataset:
    incomplete = data['Original sentence']
    complete = data['Canonical form']
    processed_input = insert_extra_id(incomplete, complete)
    
    processed_test_dataset.append({
        "Masked": processed_input,
        "Target": complete,
        "FCR": data["FCR"],
        "Gapping": data["Gapping"],
        "BCR": data["BCR"],
        "SGF": data["SGF"]
    })

masked_test_dataset = Dataset.from_list(processed_test_dataset)
print(masked_test_dataset[1000])

{'Masked': 'Menschen , die dem schmutzigen Krieg Hitlers entfliehen wollten , qualifiziert Schreiber öffentlich so ab : Man müsse sehen , daß die Masse der Deserteure Leute waren , `` die sich entweder drücken wollten - der berühmte Drückeberger - <extra_id_0> , <extra_id_1>', 'Target': "Menschen , die dem schmutzigen Krieg Hitlers entfliehen wollten , qualifiziert Schreiber öffentlich so ab : Man müsse sehen , daß die Masse der Deserteure Leute waren , `` die sich entweder drücken wollten - der berühmte Drückeberger - ,_g daß_g die_g Masse_g der_g Deserteure_g letzten Endes der Feigling waren_g , oder aber , und das ist viel wichtiger zu wissen ,_f daß_f die_f Masse_f der_f Deserteure_f Leute waren , die eine Strafverfolgung durch die Militärgerichte ... zu erwarten hatten , wegen ganz anderer Taten . '' ", 'FCR': '1', 'Gapping': '1', 'BCR': '0', 'SGF': '0'}
{'Masked': 'Die Durchschnitts-Komödien , -Thriller und -Love Stories - warum sollte jemand dafür seine Wohnung verlassen und <ex

In [20]:
# CLEAN SENTENCES
final_dataset = masked_dataset.map(lambda x: {'Masked': clean_sentence(x['Masked'])})
final_dataset = masked_dataset.map(lambda x: {'Target': clean_sentence(x['Target'])})

print(final_dataset[1000])

final_test_dataset = masked_test_dataset.map(lambda x: {'Masked': clean_sentence(x['Masked'])})
final_test_dataset = masked_test_dataset.map(lambda x: {'Target': clean_sentence(x['Target'])})

print(final_test_dataset[1000])



[A[A

[A[A

Map: 100%|██████████| 6920/6920 [00:00<00:00, 24331.58 examples/s]


[A[A

[A[A

Map: 100%|██████████| 6920/6920 [00:00<00:00, 26319.66 examples/s]


{'Masked': 'Menschen , die dem schmutzigen Krieg Hitlers entfliehen wollten , qualifiziert Schreiber öffentlich so ab : Man müsse sehen , daß die Masse der Deserteure Leute waren , `` die sich entweder drücken wollten - der berühmte Drückeberger - <extra_id_0> , <extra_id_1>', 'Target': 'Menschen, die dem schmutzigen Krieg Hitlers entfliehen wollten, qualifiziert Schreiber öffentlich so ab: Man müsse sehen, daß die Masse der Deserteure Leute waren, " die sich entweder drücken wollten - der berühmte Drückeberger -, daß die Masse der Deserteure letzten Endes der Feigling waren, oder aber, und das ist viel wichtiger zu wissen, daß die Masse der Deserteure Leute waren, die eine Strafverfolgung durch die Militärgerichte... zu erwarten hatten, wegen ganz anderer Taten. " ', 'FCR': '1', 'Gapping': '1', 'BCR': '0', 'SGF': '0'}




Map: 100%|██████████| 1739/1739 [00:00<00:00, 25351.37 examples/s]


 10%|█         | 692/6920 [05:08<46:13,  2.25it/s]


Map: 100%|██████████| 1739/1739 [00:00<00:00, 10082.32 examples/s]

{'Masked': 'Die Durchschnitts-Komödien , -Thriller und -Love Stories - warum sollte jemand dafür seine Wohnung verlassen und <extra_id_0> Geld ausgeben , wenn er sie frei Haus haben konnte ?', 'Target': 'Die Durchschnitts-Komödien, -Thriller und -Love Stories - warum sollte jemand dafür seine Wohnung verlassen und warum sollte jemand dafür Geld ausgeben, wenn er sie frei Haus haben konnte? ', 'FCR': '0', 'Gapping': '0', 'BCR': '0', 'SGF': '0'}





In [21]:
final_dataset = final_dataset.train_test_split(test_size=0.2)

final_dataset.save_to_disk("MaskedTrainTestDataset")
final_test_dataset.save_to_disk("MaskedEvalDataset")

Saving the dataset (1/1 shards): 100%|██████████| 5536/5536 [00:00<00:00, 219858.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1384/1384 [00:00<00:00, 194489.12 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1739/1739 [00:00<00:00, 564936.46 examples/s]


In [22]:
print(final_dataset["train"][0])

{'Masked': 'Gestern begann der Prozeß gegen Hintermänner des Brandanschlags auf ein Asylbewerberheim in Dolgenbrodt - und <extra_id_0> wurde vertagt .', 'Target': 'Gestern begann der Prozeß gegen Hintermänner des Brandanschlags auf ein Asylbewerberheim in Dolgenbrodt - und der Prozeß wurde vertagt.', 'FCR': '0', 'Gapping': '0', 'BCR': '0', 'SGF': '1'}


# First training 

In [23]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["bleu"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Tokenizer laden
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Tokenisierung
def tokenize_function(example):
    inputs = tokenizer(example["Masked"], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(example["Target"], padding="max_length", truncation=True, max_length=128)
    
    inputs["labels"] = targets["input_ids"]
    return inputs

# Tokenisiertes Dataset erstellen
tokenized_dataset = final_dataset.map(tokenize_function, batched=True)
print(tokenized_dataset)

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
log_dir = os.path.expanduser("~/models/" + "FirstMasking" + "/logs")
metric = evaluate.load("bleu")

training_args = TrainingArguments(
    output_dir="FirstMasking",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=0,
    save_strategy="epoch",
    logging_dir=log_dir,
    logging_strategy="steps",
    logging_steps=100,
    load_best_model_at_end=True,
    push_to_hub=False
)

# Trainer einrichten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],  # Kann auch ein separater Validierungsdatensatz sein
    tokenizer=tokenizer,
    #compute_metrics=compute_metrics,
)

# Training starten
trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 5536/5536 [00:01<00:00, 4561.39 examples/s]
Map: 100%|██████████| 1384/1384 [00:00<00:00, 4562.63 examples/s]


DatasetDict({
    train: Dataset({
        features: ['Masked', 'Target', 'FCR', 'Gapping', 'BCR', 'SGF', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5536
    })
    test: Dataset({
        features: ['Masked', 'Target', 'FCR', 'Gapping', 'BCR', 'SGF', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1384
    })
})


  1%|▏         | 100/6920 [00:27<30:57,  3.67it/s]

                                                  
  1%|▏         | 100/6920 [00:27<30:57,  3.67it/s]

{'loss': 3.0225, 'learning_rate': 4.9277456647398845e-05, 'epoch': 0.14}


  3%|▎         | 200/6920 [00:54<30:33,  3.66it/s]

                                                  
  3%|▎         | 200/6920 [00:54<30:33,  3.66it/s]

{'loss': 0.3835, 'learning_rate': 4.855491329479769e-05, 'epoch': 0.29}


  4%|▍         | 300/6920 [01:22<30:10,  3.66it/s]

                                                  
  4%|▍         | 300/6920 [01:22<30:10,  3.66it/s]

{'loss': 0.2303, 'learning_rate': 4.783236994219654e-05, 'epoch': 0.43}


  4%|▍         | 301/6920 [01:22<30:17,  3.64it/s]

KeyboardInterrupt: 

In [None]:
def predict(model, tokenizer, input_text):
    inputs = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True).to(device)
    outputs = model.generate(**inputs, num_beams=5, num_return_sequences=5)
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

test_input = "Meine Schwester fährt ein rotes Auto und <extra_id_0> wohnt in Bayern."
preds = predict(model, tokenizer, test_input)
for pred in preds:
    print(pred)