In [1]:
import os
import random
import datasets

import numpy as np
import pandas as pd

from datetime import datetime
from IPython.display import display, HTML
from datasets import Dataset, DatasetDict, load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, \
Seq2SeqTrainingArguments, Seq2SeqTrainer, MarianMTModel, MarianTokenizer

In [2]:
# model = "Helsinki-NLP/opus-mt-en-de"
model = "IKEA-MT-development/Models/en_GB-de_DE/IKEA-MT_en-GB_de-DE_2022-08-01 08:36:40.937862/"

In [3]:
df = pd.read_csv('IKEA-MT-development/Data/en_GB-de_DE/cleaned_data.csv', engine='python')

df = df[['en_GB', 'de_DE']]
df = df[(df['en_GB'].notnull()) & (df['de_DE'].notnull())]

df.drop_duplicates()

data = Dataset.from_pandas(pd.DataFrame({'translation': df.to_dict('records')}))

In [4]:
train_test_valid = data.train_test_split(test_size=0.0015)

test_valid = train_test_valid['test'].train_test_split(test_size=0.5)

dataset = DatasetDict({
    'train': train_test_valid['train'],
    'validation': test_valid['test'],
    'test': test_valid['train']})

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 3887030
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2920
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2920
    })
})

In [6]:
df = pd.DataFrame(data[[random.randint(0, len(data)) for i in range(5)]])

display(HTML(df.to_html()))

Unnamed: 0,translation
0,"{'de_DE': '80 der Hundert größten Unternehmen der Europäischen Union werden durch nur drei Agenturen kontrolliert . Gerade deshalb sollte das Rating von Unternehmen auch von diesen selbst besser nachvollzogen werden können .', 'en_GB': 'Of the hundred largest enterprises in the European Union , 80 are monitored by only three agencies , which should , therefore , be able to do a better job of rating them .'}"
1,"{'de_DE': 'Schätzungen zufolge wurden 45 000 Soldaten gegen 5 000 Rebellen in die Provinz entsandt .', 'en_GB': 'An estimated 45 000 troops have been sent into the province against 5000 rebels .'}"
2,"{'de_DE': 'Die Erfahrungen der Gerichtshöfe , die wir bereits hatten , der Kriegsverbrechertribunale für Ruanda und Jugoslawien , machen deutlich , wie unzureichend das gegenwärtige System bei der Ahndung solcher von mir genannter Verbrechen ist .', 'en_GB': 'Experience from those courts we have set up , namely the war crimes tribunals for Rwanda and Yugoslavia , clearly show that the present system is inadequate for dealing with the type of crimes I have just mentioned .'}"
3,"{'de_DE': 'Das RP7 ist zentraler Bestandteil der Lissabon-Strategie für Wachstum und Arbeitsplätze .', 'en_GB': 'FP7 is a central part of the Lisbon Strategy for growth and jobs .'}"
4,"{'de_DE': 'Booking .com: Hotel Holiday Inn Manhattan Sixth Avenue , Manhattan (New York) , Vereinigte Staaten - 537 Gästebewertungen .', 'en_GB': 'Booking .com: hotel Holiday Inn Manhattan Sixth Avenue , Manhattan (New York) , United States of America - 549 Guest reviews .'}"


In [7]:
tokenizer = AutoTokenizer.from_pretrained(model)

In [8]:
def processing(dataset, source_len=128, target_len=128, source="en_GB", target="de_DE"):
    """
    Output: Generates tokenized data using the attributes of the base model
    :param dataset: raw dataset to transform
    :param source_len: maximum sentence length for source string
    :param target_len: maximum sentence length for target string
    :param source: source language
    :param target: target language
    """
    inputs = [s[source] for s in dataset["translation"]]
    targets = [s[target] for s in dataset["translation"]]
    
    model_inputs = tokenizer(inputs, max_length=source_len, truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=target_len, truncation=True)
        
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

In [9]:
tokenized_dataset = dataset.map(processing, batched=True)

  0%|          | 0/3888 [00:00<?, ?ba/s]

KeyboardInterrupt: 

In [26]:
trained_model = AutoModelForSeq2SeqLM.from_pretrained(model)

Downloading:   0%|          | 0.00/284M [00:00<?, ?B/s]

In [27]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=trained_model)

In [28]:
metric = load_metric("sacrebleu")

def compute_metrics(predictions):
    """
    Output: evaluation metrics to track model performance in training
    :param predictions: output of predictions to decode
    """
    def process_text(predictions, labels):
        preds = [pred.strip() for pred in predictions]
        labels = [[label.strip()] for label in labels]
        return preds, labels
    
    preds, labels = predictions
    
    if isinstance(preds, tuple):
        preds = preds[0]
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = process_text(decoded_preds, decoded_labels)
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()} #round results
    
    return result

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [78]:
time = str(datetime.now())

batch_size = 16
model_name = 'IKEA_MT_en-de'

args = Seq2SeqTrainingArguments(
    f"{model_name}_en_GB-to-de_DE_{time}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=20,
    num_train_epochs=5,
    predict_with_generate=True    
)

trainer = Seq2SeqTrainer(
    trained_model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation. If translation are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3887030
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 303675


Epoch,Training Loss,Validation Loss


Saving model checkpoint to IKEA_MT_en-de_en_GB-to-de_DE_2022-07-29 10:21:41.553928/checkpoint-500
Configuration saved in IKEA_MT_en-de_en_GB-to-de_DE_2022-07-29 10:21:41.553928/checkpoint-500/config.json
Model weights saved in IKEA_MT_en-de_en_GB-to-de_DE_2022-07-29 10:21:41.553928/checkpoint-500/pytorch_model.bin
tokenizer config file saved in IKEA_MT_en-de_en_GB-to-de_DE_2022-07-29 10:21:41.553928/checkpoint-500/tokenizer_config.json
Special tokens file saved in IKEA_MT_en-de_en_GB-to-de_DE_2022-07-29 10:21:41.553928/checkpoint-500/special_tokens_map.json
Saving model checkpoint to IKEA_MT_en-de_en_GB-to-de_DE_2022-07-29 10:21:41.553928/checkpoint-1000
Configuration saved in IKEA_MT_en-de_en_GB-to-de_DE_2022-07-29 10:21:41.553928/checkpoint-1000/config.json
Model weights saved in IKEA_MT_en-de_en_GB-to-de_DE_2022-07-29 10:21:41.553928/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in IKEA_MT_en-de_en_GB-to-de_DE_2022-07-29 10:21:41.553928/checkpoint-1000/tokenizer_conf

In [79]:
trainer.save_model('IKEA-MT_en-GB_de-DE_' + time)

Saving model checkpoint to IKEA-MT_en-GB_de-DE_2022-08-01 08:36:40.937862
Configuration saved in IKEA-MT_en-GB_de-DE_2022-08-01 08:36:40.937862/config.json
Model weights saved in IKEA-MT_en-GB_de-DE_2022-08-01 08:36:40.937862/pytorch_model.bin
tokenizer config file saved in IKEA-MT_en-GB_de-DE_2022-08-01 08:36:40.937862/tokenizer_config.json
Special tokens file saved in IKEA-MT_en-GB_de-DE_2022-08-01 08:36:40.937862/special_tokens_map.json


## Inference

In [5]:
src_text = "AO57/58"
model_name = "../EU_IKEA_clean_data_2022-06-28 00:03:00.355629/"

model = MarianMTModel.from_pretrained(model_name)

tokenizer = MarianTokenizer.from_pretrained(model_name, output_loading_info=False)

def inference(src_text):
    """
    Output: Translated text of the source text
    :param src_text: source string
    """   
    translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=False))
    translated_txt = [tokenizer.decode(s, skip_special_tokens=True) for s in translated]

    return translated_txt

In [7]:
src_text = "printscreen"

inference(src_text)

['Druckbildschirm']

In [8]:
src_text = "We have received your incident report. Your ticket number is <XXXXX> (shows as a hyperlink to the actual ticket), You don’t need to do anything at the moment – we will get back to you soon."

inference(src_text)

['Wir haben Euren Vorfallbericht erhalten. Eure Ticketnummer lautet <XXXXXX> (zeigt als Hyperlink zum aktuellen Ticket), ihr müsst im Moment nichts unternehmen – wir melden euch in Kürze.']

In [80]:
test_set = pd.read_csv('test_sample.csv')
# test_set = pd.read_excel('NowIT-stefan.xlsx')
# test_set['en_GB'] = test_set['Source text'].apply(lambda s: s.replace('\t', '').replace('\n', ''))
# test_set['en_GB'] = test_set['en_GB'].apply(lambda s: s.replace("\\", ''))
# test_set['en_GB'] = test_set['en_GB'].apply(lambda s: s.replace(".", ' ').replace(",", ' ').replace("!", ' '))

model_path = "IKEA-MT_en-GB_de-DE_2022-08-01 08:36:40.937862/"
model = MarianMTModel.from_pretrained(model_path)

tokenizer = MarianTokenizer.from_pretrained(model_path)

def inference(src_text):

    translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=False))
    translated_txt = [tokenizer.decode(s, skip_special_tokens=True) for s in translated]

    return translated_txt

test_set['Translation'] =  test_set['en_GB'].apply(lambda s: inference(s)[0])

loading configuration file IKEA-MT_en-GB_de-DE_2022-08-01 08:36:40.937862/config.json
Model config MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-en-de",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      58100
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 58100,
  "decoder_vocab_size": 58101,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init

In [81]:
test_set.to_excel('new_model_v4.xlsx')

In [49]:
test_set['en_GB'] = test_set['Source text'].apply(lambda s: s.replace('\t', '').replace('\n', ''))

In [58]:
test_set['en_GB'] = test_set['en_GB'].apply(lambda s: s.replace("/", ''))

In [62]:
test_set['en_GB'].iloc[0]

'1.Incident  Received (caller and watch list user)'

In [82]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation. If translation are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2920
  Batch size = 64


{'eval_loss': 1.5674965381622314,
 'eval_bleu': 27.983,
 'eval_gen_len': 33.1442,
 'eval_runtime': 245.5699,
 'eval_samples_per_second': 11.891,
 'eval_steps_per_second': 0.187}