# Exploring the German-Korean Tatoeba dataset
[GitHub](https://github.com/Helsinki-NLP/Tatoeba-Challenge)

[Hugging Face](https://huggingface.co/datasets/Helsinki-NLP/tatoeba_mt)

In [None]:
from tatoeba import preprocess, analysis
from rude_nmt import label_german, label_korean
import re
import os
import pandas as pd
from datasets import load_from_disk

In [None]:
# Obtain the subtitle dataset through the project's preprocessing helper.
# NOTE(review): force_renew=False presumably reuses an already prepared copy
# instead of rebuilding it - confirm in tatoeba.preprocess.
subtitle_data = preprocess.get_subtitle_dataset(force_renew=False)

In [None]:
# Load a dataset that was previously saved to disk. This rebinds
# `subtitle_data`, replacing whatever it held before this cell ran.
subtitle_data = load_from_disk("data/unknown_labelled")

In [None]:
# Report the number of examples explicitly: a bare `len(...)` expression is
# only echoed by the notebook when it is the last statement of the cell, so
# its value was silently discarded here.
print(len(subtitle_data))
# Summary of the dataset object itself.
print(subtitle_data)


In [None]:
def _has_long_side(ex):
    """Return True if the German or the Korean side has more than 100 tokens."""
    # The German side is checked first and the Korean side only if needed.
    return any(len(ex[column]) > 100 for column in ("de_ws_tokens", "ko_ws_tokens"))


# Keep only the examples with an over-long side, filtering in parallel with
# one worker per available CPU core.
long_data = subtitle_data.filter(_has_long_side, num_proc=os.cpu_count())

In [None]:
# Token count of the German side for every over-long example, printed from
# largest to smallest.
de_long = [len(tokens) for tokens in long_data["de_ws_tokens"]]
print(sorted(de_long, reverse=True))

In [None]:
# Token count of the German side for every over-long example, printed from
# largest to smallest.
de_long = [len(tokens) for tokens in long_data["de_ws_tokens"]]
print(sorted(de_long, reverse=True))

In [None]:
# Token count of the Korean side for every over-long example, printed from
# largest to smallest.
ko_long = [len(tokens) for tokens in long_data["ko_ws_tokens"]]
print(sorted(ko_long, reverse=True))

In [None]:
# Token count of the Korean side for every over-long example, printed from
# largest to smallest.
ko_long = [len(tokens) for tokens in long_data["ko_ws_tokens"]]
print(sorted(ko_long, reverse=True))

In [None]:
# Run the German annotator first, then the Korean one, each over the whole
# dataset with 8 worker processes and reusing cached results when available.
for annotate in (label_german.annotate_formality, label_korean.annotate_formality):
    subtitle_data = subtitle_data.map(
        annotate,
        load_from_cache_file=True,
        num_proc=8,
    )

In [None]:
# Trial run of the POS tagger on a two-row slice (rows 1 and 2) of the dataset.
# NOTE(review): slicing presumably yields a dict of column lists, i.e. the
# batch format get_pos_tags expects - confirm.
doc = label_german.get_pos_tags(subtitle_data[1:3])

In [None]:
# Inspect the result of the trial tagging run.
print(doc)

In [None]:
# Apply the German POS tagger to the whole dataset in batches.
# load_from_cache_file=False forces recomputation instead of reusing a cached
# result from an earlier run.
subtitle_data = subtitle_data.map(label_german.get_pos_tags, load_from_cache_file=False, batched=True)

In [None]:
# Export the dataset to a CSV file under ./data.
subtitle_data.to_csv("./data/subtitle_data.csv")

In [None]:
# Collect one-word sentences with the project's analysis helper and print them.
# NOTE(review): the variable name suggests this concerns the target side -
# confirm against analysis.get_one_word_sentences.
one_word_trg = analysis.get_one_word_sentences(subtitle_data)
print(one_word_trg)

In [None]:
# Convert the dataset to a pandas DataFrame for the analysis below.
dat = subtitle_data.to_pandas()

In [None]:
# Print a summary of the DataFrame: columns, dtypes, non-null counts, memory.
dat.info()

In [None]:
# Store the German formality labels as a pandas categorical column.
dat["de_formality"] = dat["de_formality"].astype('category')

In [None]:
# Bar chart of the German formality label distribution; every bar is annotated
# with its share of all rows in percent, and the figure is saved as a PNG.
rows = len(dat.index)
ax = dat["de_formality"].value_counts().plot(kind="bar")
for p in ax.patches:
    # Anchor the label at the top centre of the bar and lift it by a few
    # points, so placement no longer depends on hand-tuned offsets tied to the
    # data scale and the label width.
    ax.annotate(
        str(round(p.get_height() / rows * 100, 2)),
        (p.get_x() + p.get_width() / 2, p.get_height()),
        xytext=(0, 3),
        textcoords="offset points",
        ha="center",
        va="bottom",
    )

fig = ax.get_figure()
fig.savefig("german.png", bbox_inches="tight")

In [None]:
# Bar chart of the Korean formality label distribution; every bar is annotated
# with its share of all rows in percent, and the figure is saved as a PNG.
rows = len(dat.index)
ax = dat["ko_formality"].value_counts().plot(kind="bar")
for p in ax.patches:
    # Anchor the label at the top centre of the bar and lift it by a few
    # points, so placement no longer depends on hand-tuned offsets tied to the
    # data scale and the label width.
    ax.annotate(
        str(round(p.get_height() / rows * 100, 2)),
        (p.get_x() + p.get_width() / 2, p.get_height()),
        xytext=(0, 3),
        textcoords="offset points",
        ha="center",
        va="bottom",
    )

fig = ax.get_figure()
fig.savefig("korean.png", bbox_inches="tight")

In [None]:
# Let pandas display up to 500 rows before truncating the output.
pd.options.display.max_rows = 500

In [None]:
# Total number of rows in the DataFrame.
print(len(dat.index))

In [None]:
# Absolute frequency of each German formality label.
dat["de_formality"].value_counts()

In [None]:
# Absolute frequency of each Korean formality label.
dat["ko_formality"].value_counts()

In [None]:
# Preview the first 10 rows.
dat.head(10)

In [None]:
# Select the rows whose Korean formality label matches "underspecified" at the
# start of the string. na=False makes rows with a missing label count as
# non-matching; without it the mask would contain NaN, which boolean indexing
# rejects.
amb_dat = dat[dat["ko_formality"].str.match("underspecified", na=False)]

In [None]:
# Preview up to the first 100 rows of the "underspecified" selection.
amb_dat.head(100)