# Process the datasets for masking 

In [2]:
from datasets import concatenate_datasets, load_dataset, Dataset
import os
import re
import json 
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch 

In [3]:
def clean_sentence(sentence):
    suffix = r'(\$_\S*)'
    sentence = re.sub(suffix, '', sentence)
    sentence = sentence.replace("$$", "")
    sentence = sentence.replace("[", "")
    sentence = sentence.replace("]", "")
    sentence = sentence.replace("  ", " ")
    suffix2 = r'_[^\s]*'
    sentence = re.sub(suffix2, '', sentence)
    # remove spaces before punctuation
    pattern = r'\s+([.,;?!:])'
    sentence = re.sub(pattern, r'\1', sentence)
    # remove weird ``
    sentence = re.sub(r'``', '"', sentence)
    sentence = re.sub(r"''", '"', sentence)
    sentence = sentence.replace("\/", "")
    return sentence

def process_jsonl(input_file, output_file, col, gol):
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'a', encoding='utf-8') as outfile:
        
        for line in infile:
            data = json.loads(line)
            data[col] = clean_sentence(data[col])
            data[gol] = clean_sentence(data[gol])
            outfile.write(json.dumps(data) + '\n')

def add_other_golds(input_file, output_file, sentcol, goldcol, finalgoldcol): # only tiger + tüba
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'a', encoding='utf-8') as outfile:
        for line in infile:
            data = json.loads(line)
            if data[goldcol] != " ":
                json.dump({sentcol: data[sentcol], finalgoldcol: data[goldcol], "FCR": data["FCR"], "Gapping": data["Gapping"], "BCR": data["BCR"], "SGF": data["SGF"]}, outfile)
                outfile.write("\n")

In [4]:
# DELETE FILES BEFORE RUNNING THIS AGAIN!

# print("Getting other gold standards!")
# print("Tiger Train")
# input_file = '/Users/marisa/data/tiger_train.jsonl'
# output_file = '/Users/marisa/data/ALL_tiger_train.jsonl'

# add_other_golds(input_file, output_file, "Original sentence", "gold2 (LCO)", "Canonical form")
# add_other_golds(input_file, output_file, "Original sentence", "Canonical form", "Canonical form")

# print("Tiger Test")
# input_file = '/Users/marisa/data/tiger_test.jsonl'
# output_file = '/Users/marisa/data/ALL_tiger_test.jsonl'

# add_other_golds(input_file, output_file, "Original sentence", "gold2 (LCO)", "Canonical form")
# add_other_golds(input_file, output_file, "Original sentence", "Canonical form", "Canonical form")

# print("TüBa Train")
# input_file = '/Users/marisa/data/tüba_train.jsonl'
# output_file = '/Users/marisa/data/ALL_tüba_train.jsonl'

# add_other_golds(input_file, output_file, "Treebank-Sentence", "Gold_1", "Reconstructed-Sentence")
# add_other_golds(input_file, output_file, "Treebank-Sentence", "Gold_2", "Reconstructed-Sentence")
# add_other_golds(input_file, output_file, "Treebank-Sentence", "Gold_3", "Reconstructed-Sentence")
# add_other_golds(input_file, output_file, "Treebank-Sentence", "Reconstructed-Sentence", "Reconstructed-Sentence")
# print("TüBa Test")
# input_file = '/Users/marisa/data/tüba_test.jsonl'
# output_file = '/Users/marisa/data/ALL_tüba_test.jsonl'

# add_other_golds(input_file, output_file, "Treebank-Sentence", "Gold_1", "Reconstructed-Sentence")
# add_other_golds(input_file, output_file, "Treebank-Sentence", "Gold_2", "Reconstructed-Sentence")
# add_other_golds(input_file, output_file, "Treebank-Sentence", "Gold_3", "Reconstructed-Sentence")
# add_other_golds(input_file, output_file, "Treebank-Sentence", "Reconstructed-Sentence", "Reconstructed-Sentence")

In [5]:
train_data1 = os.path.expanduser("~/data/ALL_tiger_train.jsonl")
train_data2 = os.path.expanduser("~/data/ALL_tüba_train.jsonl")

train_dataset1 = load_dataset("json", data_files=train_data1, split='train')
train_dataset2 = load_dataset("json", data_files=train_data2, split='train')
train_dataset2 = train_dataset2.rename_column("Treebank-Sentence", "Original sentence")
train_dataset2 = train_dataset2.rename_column("Reconstructed-Sentence", "Canonical form")

cols_to_check = ['BCR', 'FCR', 'Gapping', 'SGF']
print(train_dataset1.num_rows)
train_dataset1 = train_dataset1.filter(lambda row: not all(row[col] == "0" for col in cols_to_check))
print(train_dataset1.num_rows)
print(train_dataset2.num_rows)
train_dataset2 = train_dataset2.filter(lambda row: not all(row[col] == "0" for col in cols_to_check))
print(train_dataset2.num_rows)
train_dataset = concatenate_datasets([train_dataset1, train_dataset2])
print("Got train data")

t = "Original sentence"
g = "Canonical form"
batchsize = 4
prefix = "reconstruct the ellipsis in this sentence: "
epochs = 10 #5
        

3997
3582
2923
1098
Got train data


In [6]:
print(train_dataset)

Dataset({
    features: ['Original sentence', 'Canonical form', 'FCR', 'Gapping', 'BCR', 'SGF'],
    num_rows: 4680
})


In [7]:
def insert_extra_id(incomplete, complete):
    incomplete_tokens = incomplete.split()
    complete_tokens = complete.split()
    
    result = []
    extra_id_counter = 0
    i = 0  # Pointer for incomplete_tokens
    j = 0  # Pointer for complete_tokens
    while j < len(complete_tokens):
        if i < len(incomplete_tokens) and incomplete_tokens[i] == complete_tokens[j]:
            result.append(complete_tokens[j])
            i += 1
        else:
            # Collect missing span until tokens match again
            while j < len(complete_tokens) and (i >= len(incomplete_tokens) or complete_tokens[j] != incomplete_tokens[i]):
                j += 1
            result.append(f'<extra_id_{extra_id_counter}>')
            extra_id_counter += 1
            continue
        j += 1
    
    return " ".join(result)

In [8]:
# Verarbeitung des gesamten Datensatzes
processed_dataset = []
for data in train_dataset:
    incomplete = data['Original sentence']
    complete = data['Canonical form']
    processed_input = insert_extra_id(incomplete, complete)
    
    processed_dataset.append({
        "Masked": processed_input,
        "Target": complete
    })

# for entry in processed_dataset:
#     print("Input with extra_id:", entry['Masked'])
#     print("Target:", entry['Target'])
#     print("---")

masked_dataset = Dataset.from_list(processed_dataset)
print(masked_dataset[1000])

{'Masked': 'Sein Vorgänger habe keine Abfindung erhalten und <extra_id_0> beziehe auch bis zum Auslaufen seines Vertrages im Herbst kein Gehalt .', 'Target': 'Sein Vorgänger habe keine Abfindung erhalten und sein_f Vorgänger_f  beziehe auch bis zum Auslaufen seines Vertrages im Herbst kein Gehalt . '}


In [9]:
# CLEAN SENTENCES
final_dataset = masked_dataset.map(lambda x: {'Masked': clean_sentence(x['Masked'])})
final_dataset = masked_dataset.map(lambda x: {'Target': clean_sentence(x['Target'])})

print(final_dataset[1000])

Map: 100%|██████████| 4680/4680 [00:00<00:00, 40941.72 examples/s]
Map: 100%|██████████| 4680/4680 [00:00<00:00, 38954.53 examples/s]

{'Masked': 'Sein Vorgänger habe keine Abfindung erhalten und <extra_id_0> beziehe auch bis zum Auslaufen seines Vertrages im Herbst kein Gehalt .', 'Target': 'Sein Vorgänger habe keine Abfindung erhalten und sein Vorgänger beziehe auch bis zum Auslaufen seines Vertrages im Herbst kein Gehalt. '}





In [10]:
final_dataset = final_dataset.train_test_split(test_size=0.2)

In [11]:
# Tokenizer laden
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Tokenisierung
def tokenize_function(example):
    inputs = tokenizer(example["Masked"], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(example["Target"], padding="max_length", truncation=True, max_length=128)
    
    inputs["labels"] = targets["input_ids"]
    return inputs

# Tokenisiertes Dataset erstellen
tokenized_dataset = final_dataset.map(tokenize_function, batched=True)
print(tokenized_dataset)

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
log_dir = os.path.expanduser("~/models/" + "FirstMasking" + "/logs")

training_args = TrainingArguments(
    output_dir="FirstMasking",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir=log_dir,
    logging_strategy="steps",
    logging_steps=100,
    load_best_model_at_end=True,
    push_to_hub=False
)

# Trainer einrichten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],  # Kann auch ein separater Validierungsdatensatz sein
    tokenizer=tokenizer,
)

# Training starten
trainer.train()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 3744/3744 [00:00<00:00, 4728.64 examples/s]
Map: 100%|██████████| 936/936 [00:00<00:00, 4724.03 examples/s]


DatasetDict({
    train: Dataset({
        features: ['Masked', 'Target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3744
    })
    test: Dataset({
        features: ['Masked', 'Target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 936
    })
})


  7%|▋         | 100/1404 [00:27<05:52,  3.69it/s]

{'loss': 2.9317, 'learning_rate': 4.643874643874644e-05, 'epoch': 0.21}


 14%|█▍        | 200/1404 [00:54<05:27,  3.68it/s]

{'loss': 0.3243, 'learning_rate': 4.287749287749288e-05, 'epoch': 0.43}


 21%|██▏       | 300/1404 [01:22<05:00,  3.67it/s]

{'loss': 0.2617, 'learning_rate': 3.931623931623932e-05, 'epoch': 0.64}


 28%|██▊       | 400/1404 [01:49<04:32,  3.68it/s]

{'loss': 0.1835, 'learning_rate': 3.575498575498576e-05, 'epoch': 0.85}


                                                  
 33%|███▎      | 468/1404 [02:15<04:14,  3.67it/s]

{'eval_loss': 0.14146994054317474, 'eval_runtime': 7.6273, 'eval_samples_per_second': 122.718, 'eval_steps_per_second': 15.34, 'epoch': 1.0}


 36%|███▌      | 500/1404 [02:25<04:05,  3.68it/s]

{'loss': 0.1518, 'learning_rate': 3.2193732193732194e-05, 'epoch': 1.07}


 43%|████▎     | 600/1404 [02:52<03:39,  3.67it/s]

{'loss': 0.1484, 'learning_rate': 2.863247863247863e-05, 'epoch': 1.28}


 50%|████▉     | 700/1404 [03:19<03:10,  3.70it/s]

{'loss': 0.157, 'learning_rate': 2.5071225071225073e-05, 'epoch': 1.5}


 57%|█████▋    | 800/1404 [03:47<02:43,  3.70it/s]

{'loss': 0.1417, 'learning_rate': 2.150997150997151e-05, 'epoch': 1.71}


 64%|██████▍   | 900/1404 [04:14<02:16,  3.70it/s]

{'loss': 0.1551, 'learning_rate': 1.794871794871795e-05, 'epoch': 1.92}


                                                  
 67%|██████▋   | 936/1404 [04:31<02:06,  3.69it/s]

{'eval_loss': 0.12826012074947357, 'eval_runtime': 7.6035, 'eval_samples_per_second': 123.101, 'eval_steps_per_second': 15.388, 'epoch': 2.0}


 71%|███████   | 1000/1404 [04:49<01:49,  3.69it/s]

{'loss': 0.167, 'learning_rate': 1.4387464387464389e-05, 'epoch': 2.14}


 78%|███████▊  | 1100/1404 [05:16<01:22,  3.69it/s]

{'loss': 0.1124, 'learning_rate': 1.0826210826210826e-05, 'epoch': 2.35}


 85%|████████▌ | 1200/1404 [05:43<00:55,  3.70it/s]

{'loss': 0.1214, 'learning_rate': 7.264957264957266e-06, 'epoch': 2.56}


 93%|█████████▎| 1300/1404 [06:10<00:28,  3.70it/s]

{'loss': 0.1491, 'learning_rate': 3.7037037037037037e-06, 'epoch': 2.78}


100%|█████████▉| 1400/1404 [06:38<00:01,  3.70it/s]

{'loss': 0.1515, 'learning_rate': 1.4245014245014247e-07, 'epoch': 2.99}


                                                   
100%|██████████| 1404/1404 [06:46<00:00,  3.69it/s]

{'eval_loss': 0.12606953084468842, 'eval_runtime': 7.6005, 'eval_samples_per_second': 123.15, 'eval_steps_per_second': 15.394, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1404/1404 [06:47<00:00,  3.44it/s]

{'train_runtime': 407.8614, 'train_samples_per_second': 27.539, 'train_steps_per_second': 3.442, 'train_loss': 0.3676083913428491, 'epoch': 3.0}





TrainOutput(global_step=1404, training_loss=0.3676083913428491, metrics={'train_runtime': 407.8614, 'train_samples_per_second': 27.539, 'train_steps_per_second': 3.442, 'train_loss': 0.3676083913428491, 'epoch': 3.0})

In [25]:
def predict(model, tokenizer, input_text):
    inputs = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True).to(device)
    outputs = model.generate(**inputs, num_beams=5, num_return_sequences=5)
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

test_input = "Meine Schwester fährt ein rotes Auto und <extra_id_0> wohnt in Bayern."
preds = predict(model, tokenizer, test_input)
for pred in preds:
    print(pred)



Meine Schwester fährt ein rotes Auto und meine Schwester wohnt in Bayern
Meine Schwester fährt ein rotes Auto und mein Schwester wohnt in Bayern
Mein Schwester fährt ein rotes Auto und mein Schwester wohnt in Bayern
Mein Schwester fährt ein rotes Auto und meine Schwester wohnt in Bayern
Meine Schwester fährt ein rotes Auto und Meine Schwester wohnt in Bayern
