# Experiments
This notebook contains some of the experiments that were performed for the master's thesis.


In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_from_disk, concatenate_datasets
from tatoeba import analysis as tatoeba_analysis
from rude_nmt import analysis as rude_nmt_analysis

## Merge datasets
Because generating the translations takes a long time, they are generated separately for each translation direction. If the two resulting datasets have not been merged yet, run the following cells to combine them into a single dataset for the remaining analyses.

In [None]:
# Load the two labelled datasets, one per translation direction.
# NOTE(review): judging by the paths, ko_data comes from the de->ko run and
# de_data from the ko->de run — confirm against the labelling scripts.
ko_data = load_from_disk("./data/tatoeba_de_ko_labelled")
de_data = load_from_disk("./data/tatoeba_ko_de_labelled")

In [None]:
# Columns that exist in both datasets are taken from ko_data only; the columns
# that are unique to de_data are then appended next to them (axis=1).
# NOTE(review): a column-wise concatenation presumably pairs rows by position,
# so both datasets are assumed to contain the same samples in the same order — confirm.
_de_columns = set(de_data.column_names)
remove_cols = [col for col in ko_data.column_names if col in _de_columns]
disjunct_de = de_data.remove_columns(remove_cols)
merged_data = concatenate_datasets([ko_data, disjunct_de], axis=1)

In [None]:
# Persist the merged dataset; the exploration section loads it from this path.
merged_data.save_to_disk("./data/tatoeba_merged")

## Explore Dataset
First, the dataset is explored to obtain some base-level statistics.

In [None]:
# Load the merged dataset from the path the merge step saves to.
ds = load_from_disk("./data/tatoeba_merged")

In [None]:
# Print the dataset's string representation for a quick look at its contents.
print(ds)

In [None]:
# Formality plot for the "de_formality" column (presumably the German formality labels — confirm).
tatoeba_analysis.get_formality_plot(ds, "de_formality")

In [None]:
# Formality plot for the "ko_formality" column (presumably the Korean formality labels — confirm).
# NOTE(review): ax_annotate_vals is defined in tatoeba.analysis; presumably an
# annotation position tuned by hand for this plot — confirm.
tatoeba_analysis.get_formality_plot(ds, "ko_formality", ax_annotate_vals=(0.3,2500))

Remove all ambiguous samples in case this has not already been done during the labeling process.

In [None]:
# Keep a sample only when neither its German nor its Korean formality label
# is "ambiguous".
ds = ds.filter(
    lambda ex: ex["de_formality"] != "ambiguous" and ex["ko_formality"] != "ambiguous",
    num_proc=os.cpu_count(),
)
print(ds)

In [None]:
# Cross formality plot of "ko_formality" against "de_formality"; "ambiguous"
# is passed as a value to exclude.
tatoeba_analysis.get_cross_formality_plot(ds, "ko_formality", "de_formality", exclude_vals=["ambiguous"], form_col_desc="Korean formality", cross_col_desc="German formality")

## Analyze the translations
As the next step, the quality of the translations is analyzed.

In [None]:
# Formality plot for the "de_formality_nmt" column (presumably the formality
# labels of the German machine translations — confirm).
tatoeba_analysis.get_formality_plot(ds, "de_formality_nmt")

In [None]:
# Formality plot for the "ko_formality_nmt" column (presumably the formality
# labels of the Korean machine translations — confirm).
tatoeba_analysis.get_formality_plot(ds, "ko_formality_nmt", ax_annotate_vals=(0.3,2500))

In [None]:
# Build a separate dataset without the samples whose "de_formality_nmt" label
# is "ambiguous"; `ds` itself is not modified.
# NOTE(review): only the German NMT column is filtered here, "ko_formality_nmt"
# is not — confirm this is intended.
cross_ds = ds.filter(lambda ex: ex["de_formality_nmt"] != "ambiguous", num_proc=os.cpu_count())

In [None]:
# Cross formality plot of the two "_nmt" formality columns on the filtered
# cross_ds; plot_title is presumably also used as the output name — confirm.
tatoeba_analysis.get_cross_formality_plot(cross_ds, "ko_formality_nmt", "de_formality_nmt", form_col_desc="Korean formality", cross_col_desc="German formality", plot_title="form_distribution_nmt")

In [None]:
# the comet score has to be upscaled to fit into the same range as BLEU and chrF
def upscale_comet(example):
    """Multiply both COMET columns of a single example by 100.

    Args:
        example: mapping with the numeric entries "comet_ko" and "comet_de".

    Returns:
        The same ``example`` object, with both entries replaced by their
        value times 100. All other entries are left as they are.
    """
    for column in ("comet_ko", "comet_de"):
        example[column] = example[column] * 100
    return example

In [None]:
# Apply upscale_comet to every example.
# NOTE: this cell is not idempotent — `ds` is overwritten with the scaled
# result, so running the cell a second time multiplies the COMET columns by
# 100 again. Reload `ds` before re-running.
ds = ds.map(upscale_comet, num_proc=os.cpu_count())

In [None]:
# Plot the BLEU, chrF and (upscaled) COMET columns of the Korean translations.
# plt_name is presumably the name the figure is stored under — confirm.
rude_nmt_analysis.plot_translation_metrics(ds, ["bleu_ko", "chrf_ko", "comet_ko"], ["BLEU", "chrF", "COMET"], show=True, plt_name="translation_metrics_ko")

In [None]:
# Plot the BLEU, chrF and (upscaled) COMET columns of the German translations.
# plt_name is presumably the name the figure is stored under — confirm.
rude_nmt_analysis.plot_translation_metrics(ds, ["bleu_de", "chrf_de", "comet_de"], ["BLEU", "chrF", "COMET"], show=True, plt_name="translation_metrics_de")

In [None]:
# Sankey plot from the "ko_formality" column to the "ko_formality_nmt" column
# (presumably reference label vs. label of the machine translation — confirm).
rude_nmt_analysis.plot_sankey(ds, "ko_formality", "ko_formality_nmt", show=True, plt_name="sankey_ko_formality")

In [None]:
# Sankey plot from the "de_formality" column to the "de_formality_nmt" column
# (presumably reference label vs. label of the machine translation — confirm).
rude_nmt_analysis.plot_sankey(ds, "de_formality", "de_formality_nmt", show=True, plt_name="sankey_de_formality")

Remove all samples whose COMET score is not above 0.4 in both directions (the threshold is 40 here, since the scores were upscaled by 100 above).

In [None]:
# Keep a sample only when both upscaled COMET scores are strictly above 40.
ds = ds.filter(
    lambda ex: all(ex[column] > 40 for column in ("comet_ko", "comet_de")),
    num_proc=os.cpu_count(),
)

In [None]:
# Print the dataset again to see what is left after the COMET filter.
print(ds)

In [None]:
# Formality plot for "de_formality" on the COMET-filtered dataset.
# save=False presumably keeps the figure from being written to disk — confirm.
tatoeba_analysis.get_formality_plot(ds, "de_formality", save=False)

In [None]:
# Formality plot for "ko_formality" on the COMET-filtered dataset.
# save=False presumably keeps the figure from being written to disk — confirm.
tatoeba_analysis.get_formality_plot(ds, "ko_formality", ax_annotate_vals=(0.3,2500), save=False)

Save the COMET-filtered dataset to disk for use in the attribution analysis.

In [None]:
# Persist the filtered dataset; the attribution section loads it from this path.
ds.save_to_disk("./data/tatoeba_filtered")

## Attributions
Analyze the attributions for the translations.

In [2]:
# Optionally load the filtered dataset from disk, so this section can be run
# without re-running the filtering cells above.
ds = load_from_disk("./data/tatoeba_filtered")

In [3]:
# Print the first example to inspect the available columns and their values.
print(ds[0])

{'id': 1, 'source': 'Carol, hier ist Vincent. Sag nichts. Hör bloß zu.', 'target': '캐럴, 나야 빈센트 말하지 말고 들어', 'ko_nmt': '캐롤, 여기  Vincent이 있습니다. 말하지 마세요. 그냥 들어보세요.', 'chrf_ko': 10.119, 'bleu_ko': 9.38, 'comet_ko': 76.2, 'upos_tags_source': ['PROPN', 'PUNCT', 'ADV', 'AUX', 'PROPN', 'PUNCT', 'VERB', 'PRON', 'PUNCT', 'VERB', 'ADV', 'ADP', 'PUNCT'], 'pos_tags_source': ['NE', '$,', 'ADV', 'VAFIN', 'NE', '$.', 'VVIMP', 'PIS', '$.', 'VVIMP', 'ADV', 'PTKVZ', '$.'], 'ws_tokens_source': ['Carol', ',', 'hier', 'ist', 'Vincent', '.', 'Sag', 'nichts', '.', 'Hör', 'bloß', 'zu', '.'], 'sent_ids_source': [0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2], 'de_formality': 'underspecified', 'de_formality_map': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'upos_tags_target': ['NOUN', 'PUNCT', 'ADV', 'PROPN', 'VERB', 'CCONJ', 'SCONJ'], 'pos_tags_target': ['ncn', 'sp', 'npp+jca', 'nq', 'pvg+ecx', 'px+ecc', 'pvg+ecs'], 'ws_tokens_target': ['캐럴', ',', '나야', '빈센트', '말하지', '말고', '들어'], 'sent_ids_target': [0, 0, 0, 0, 0, 0, 0], '

In [4]:
import inseq
from transformers import (MBartForConditionalGeneration, MBart50TokenizerFast)

In [5]:
# Load the pretrained mBART-50 many-to-many checkpoint; output_attentions=True
# is handed to from_pretrained.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", output_attentions=True)

In [6]:
# Tokenizer for the same checkpoint, configured for German source text
# ("de_DE") and Korean target text ("ko_KR").
de_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="de_DE", tgt_lang="ko_KR")

In [7]:
# Wrap the loaded model and tokenizer in an inseq attribution model that uses
# the "input_x_gradient" method.
attr_model = inseq.load_model(model, "input_x_gradient", tokenizer=de_tokenizer)

In [9]:
# Attribute the stored Korean translation of the first sample to its German
# source tokens, merge subword pieces, and display the result.
out = attr_model.attribute(
    input_texts=ds[0]["source"],
    generated_texts=ds[0]["ko_nmt"],
    attribute_target=False,
    batch_size=5,
    step_scores=["probability"],
).aggregate(
    aggregator=inseq.data.aggregator.AggregatorPipeline(
        [inseq.data.aggregator.SubwordAggregator]
    )
)
out.show()

Attributing with input_x_gradient...: 100%|██████████| 20/20 [00:11<00:00,  1.66it/s]


Unnamed: 0_level_0,ko_KR,"▁캐롤,",▁여기,▁Vincent이,▁있습니다.,▁말하지,▁마세요.,▁그냥,▁들어보세요.,</s>
de_DE,0.102,0.071,0.058,0.069,0.071,0.059,0.064,0.058,0.063,0.13
"▁Carol,",0.212,0.438,0.101,0.089,0.101,0.053,0.066,0.042,0.043,0.114
▁hier,0.041,0.041,0.102,0.062,0.07,0.025,0.026,0.017,0.019,0.036
▁ist,0.036,0.039,0.066,0.049,0.064,0.024,0.029,0.017,0.018,0.029
▁Vincent.,0.159,0.175,0.401,0.421,0.199,0.095,0.093,0.054,0.051,0.143
▁Sag,0.086,0.052,0.072,0.069,0.132,0.263,0.178,0.072,0.125,0.085
▁nichts.,0.062,0.036,0.052,0.047,0.077,0.163,0.127,0.051,0.045,0.06
▁Hör,0.068,0.035,0.036,0.043,0.086,0.087,0.146,0.198,0.25,0.107
▁bloß,0.087,0.044,0.045,0.048,0.106,0.125,0.161,0.341,0.16,0.127
▁zu.,0.081,0.035,0.03,0.053,0.057,0.06,0.069,0.114,0.179,0.122
</s>,0.066,0.033,0.036,0.051,0.036,0.046,0.04,0.038,0.047,0.046
probability,0.0,0.08,0.149,0.034,0.325,0.046,0.261,0.311,0.082,0.872


In [10]:
# Second attribution model for the same checkpoint, this time loaded by name;
# the language codes are passed via tokenizer_kwargs (presumably forwarded to
# the tokenizer that inseq creates — confirm).
attr_model2 = inseq.load_model("facebook/mbart-large-50-many-to-many-mmt", "input_x_gradient", tokenizer_kwargs={"src_lang": "de_DE", "tgt_lang": "ko_KR"})

In [16]:

# Alternatives that were tried and are currently disabled: batched input via
# generated_texts=ds[0:10]["ko_nmt"], and the extra arguments layers=(0,-1)
# and heads=(0,-1).
out = attr_model2.attribute(
    input_texts=ds[0]["source"],
    generated_texts=ds[0]["ko_nmt"],
    attribute_target=False,
    batch_size=5,
    step_scores=["probability"]
)
out = out.aggregate(aggregator=inseq.data.aggregator.AggregatorPipeline([inseq.data.aggregator.SubwordAggregator, inseq.data.aggregator.SequenceAttributionAggregator]))
# Drop the first and last entry along both axes and transpose, which yields
# one list per remaining generated token holding the scores of the remaining
# source tokens (in the displayed table the dropped entries are the language
# tag and </s>).
out.sequence_attributions[0].source_attributions[1:-1].T[1:-1].tolist()


Attributing with input_x_gradient...: 100%|██████████| 20/20 [00:11<00:00,  1.61it/s]


[[0.43840715289115906,
  0.04073164984583855,
  0.03927171230316162,
  0.17463932931423187,
  0.05206599459052086,
  0.036157362163066864,
  0.035129863768815994,
  0.04412499815225601,
  0.03491465747356415],
 [0.10082988440990448,
  0.10249965637922287,
  0.06634380668401718,
  0.40117913484573364,
  0.0718628540635109,
  0.05241023376584053,
  0.03628076612949371,
  0.04467998072504997,
  0.02951953560113907],
 [0.08911564201116562,
  0.06162300333380699,
  0.04919227212667465,
  0.42056745290756226,
  0.06929496675729752,
  0.04737542197108269,
  0.04316408932209015,
  0.047713398933410645,
  0.05252497270703316],
 [0.10068227350711823,
  0.07031651586294174,
  0.06434743851423264,
  0.19930046796798706,
  0.13165660202503204,
  0.0774613693356514,
  0.08638675510883331,
  0.10635272413492203,
  0.05673668161034584],
 [0.05263480916619301,
  0.02512262761592865,
  0.023653635755181313,
  0.0949077233672142,
  0.2626975476741791,
  0.16323798894882202,
  0.08677950501441956,
  0.125

In [None]:
from transformers import NllbTokenizerFast, AutoModelForSeq2SeqLM

In [None]:
# Load the distilled 600M NLLB-200 checkpoint and its tokenizer, configured
# for German source text ("deu_Latn") and Korean target text ("kor_Hang").
nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", output_attentions=True)
nllb_tokenizer = NllbTokenizerFast.from_pretrained("facebook/nllb-200-distilled-600M", src_lang="deu_Latn", tgt_lang="kor_Hang")

In [None]:
# Wrap the NLLB model in an inseq attribution model that uses the "attention"
# method. The wrapper is stored under the same name as the model it wraps, so
# the guard keeps this cell safe to re-run: without it, a second run would
# hand the wrapper itself back to load_model instead of the underlying model.
if not isinstance(nllb_model, inseq.AttributionModel):
    nllb_model = inseq.load_model(nllb_model, "attention", tokenizer=nllb_tokenizer)

In [None]:
# Attention attribution for the first ten source sentences, letting the model
# generate the Korean translations itself.
# The target-language id is looked up with convert_tokens_to_ids, because the
# tokenizer's lang_code_to_id mapping is deprecated/removed in newer
# transformers releases while the language codes remain regular vocabulary
# tokens.
out = nllb_model.attribute(
    input_texts=ds[0:10]["source"],
    generation_args={"forced_bos_token_id": nllb_tokenizer.convert_tokens_to_ids("kor_Hang")},
    attribute_target=False,
    layers=(0,-1),
    heads=(0,-1),
    batch_size=5
)
out.show()

In [None]:
#"Helsinki-NLP/opus-mt-en-de"
# Attention attribution on GPT-2 for a single prompt.
# NOTE(review): this rebinds `model` and `out`, which earlier cells use for the
# mBART model and its attributions; re-running those earlier cells afterwards
# without reloading would pick up the GPT-2 objects instead.
model = inseq.load_model("gpt2", "attention")
out = model.attribute(
    "translate this to german: hello world",
    layers=(0,-1),
    heads=(0,-1),
)
out.show()