# Colab

In [13]:
# Mount Google Drive inside the Colab VM at /content/drive.
# force_remount=True requests a fresh mount when this cell is re-run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [14]:
# Switch to the project folder on Drive so that the relative "./dataset/..."
# paths used by later cells resolve.
%cd /content/drive/MyDrive/uni_bonn/nlp/final_project

/content/drive/MyDrive/uni_bonn/nlp/final_project


# Install modules

In [1]:
# Install the third-party packages this notebook needs in the Colab runtime.
# NOTE(review): versions are not pinned; the recorded output shows
# datasets 2.16.1 and validators 0.22.0 being installed. Consider pinning
# them (and using %pip) so a re-run resolves the same versions.
!pip install datasets
!pip install 'transformers[torch]'
!pip install fasttext
!pip install PyDictionary
!pip install validators

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Installing collected 

Collecting validators
  Downloading validators-0.22.0-py3-none-any.whl (26 kB)
Installing collected packages: validators
Successfully installed validators-0.22.0


# Import modules

In [2]:
from datasets import load_dataset
import spacy
import nltk
import string
import re
from PyDictionary import PyDictionary
from collections import Counter

# Two English word lists: a PyDictionary handle and the lower-cased NLTK
# "words" corpus as a set.
# NOTE(review): neither name is referenced in the visible part of the
# notebook (the statistics functions use WordNet instead) - confirm whether
# they are still needed.
english_vocab_2 = PyDictionary()
nltk.download('words')
english_vocab = set(w.lower() for w in nltk.corpus.words.words())

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [3]:
# Fetch the WordNet corpus; the statistics functions look tokens up in it
# through `wn.synsets`.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
from nltk.corpus import wordnet as wn

In [5]:
# Sanity check: a purely numeric string has no WordNet synsets (recorded
# result: 0), so a token whose lemma is such a number is counted as
# "meaningless" by the statistics functions.
len(wn.synsets("40000"))

0

In [6]:
import time

In [7]:
import pickle

In [8]:
# Module-level spaCy pipeline used by the statistics functions for
# tokenization and lemmatization.
nlp = spacy.load("en_core_web_sm")

In [9]:
def add_to_dict(x, d):
  """Increment the occurrence count stored under key ``x`` in ``d``, in place.

  A key that is not present yet starts from zero, so its first occurrence
  leaves ``d[x] == 1``. Nothing is returned.
  """
  d[x] = d.get(x, 0) + 1

# Data Exploration

In [16]:
def produce_stat_of_file(filepath):
  """Print and return token statistics for a text file with one sentence per line.

  Every non-blank line is tokenized with the module-level spaCy pipeline
  ``nlp``. Tokens are counted by their lower-cased surface form. A token is
  treated as "meaningless" (out of vocabulary) when WordNet (``wn``) has no
  synset for its lower-cased lemma.

  Args:
    filepath: Path of the text file to analyse (read as UTF-8).

  Returns:
    dict with the keys
      ``corpus_freq``      - dict mapping lower-cased token text to its count,
      ``oov_vocab``        - set of lower-cased token texts without a synset,
      ``num_of_token``     - total number of tokens,
      ``num_of_line``      - number of non-blank lines processed,
      ``num_of_oov_token`` - number of token occurrences without a synset.
  """
  current_corpus_freq = dict()
  oov_vocab = set()
  num_of_token = 0
  num_of_line = 0
  num_of_oov_token = 0

  # Explicit encoding so the result does not depend on the platform default.
  with open(filepath, 'r', encoding='utf-8') as f:
    # Iterate to the real end of file. The previous readline()/break loop
    # stopped at the first blank line and silently ignored the rest.
    for raw_line in f:
      line = raw_line.rstrip()
      if not line:
        continue  # blank lines carry no tokens and are not counted
      num_of_line += 1

      for token in nlp(line):
        num_of_token += 1
        current_lemma = token.lemma_
        current_text = token.text.lower()

        add_to_dict(current_text, current_corpus_freq)

        # The lemma is looked up, but the surface form is what gets recorded.
        if len(wn.synsets(current_lemma.lower())) == 0:
          oov_vocab.add(current_text)
          num_of_oov_token += 1

  # Word types that occur exactly once in this file.
  singletons = sum(1 for count in current_corpus_freq.values() if count == 1)

  # Guard against a file without any non-blank line.
  average_tokens = num_of_token / num_of_line if num_of_line else 0.0

  print(filepath)
  print("Number of lines:", num_of_line)
  print("Number of tokens:", num_of_token)
  print("Average tokens per line", average_tokens)
  print("Number of meaningless token", num_of_oov_token)
  print("Number of word", len(current_corpus_freq))
  print("Number of meaningless word", len(oov_vocab))
  print("Singletons:", singletons)
  print("\n")

  stats = dict(corpus_freq=current_corpus_freq, oov_vocab=oov_vocab, num_of_token=num_of_token, num_of_line=num_of_line, num_of_oov_token=num_of_oov_token)

  return stats

In [17]:
def produce_total_stats(list_of_stats_dict):
  """Merge several per-file statistics dicts into one, print it and return it.

  Args:
    list_of_stats_dict: Iterable of dicts with the keys ``corpus_freq``,
      ``oov_vocab``, ``num_of_token``, ``num_of_line`` and
      ``num_of_oov_token`` (the shape returned by ``produce_stat_of_file``
      and by this function itself). The inputs are not modified.

  Returns:
    dict with the same keys: word counts are summed per word, the OOV sets
    are united and the three counters are added up.
  """
  # Accumulate in place; the previous version rebuilt two Counters and a dict
  # from the full table on every iteration.
  merged_freq = Counter()
  oov_vocab = set()
  num_of_token = 0
  num_of_line = 0
  num_of_oov_token = 0

  for stats_dict in list_of_stats_dict:
    merged_freq.update(stats_dict['corpus_freq'])
    oov_vocab.update(stats_dict['oov_vocab'])
    num_of_token += stats_dict['num_of_token']
    num_of_line += stats_dict['num_of_line']
    num_of_oov_token += stats_dict['num_of_oov_token']

  # Hand back a plain dict, as before.
  current_corpus_freq = dict(merged_freq)

  # Word types that occur exactly once across all merged inputs.
  singletons = sum(1 for count in current_corpus_freq.values() if count == 1)

  # Guard against an empty input list (or inputs without any line).
  average_tokens = num_of_token / num_of_line if num_of_line else 0.0

  print("Number of lines:", num_of_line)
  print("Number of tokens:", num_of_token)
  print("Average tokens per line", average_tokens)
  print("Number of meaningless token", num_of_oov_token)
  print("Number of word", len(current_corpus_freq))
  print("Number of meaningless word", len(oov_vocab))
  print("Singletons:", singletons)
  print("\n")

  stats = dict(corpus_freq=current_corpus_freq, oov_vocab=oov_vocab, num_of_token=num_of_token, num_of_line=num_of_line, num_of_oov_token=num_of_oov_token)
  return stats

## Formal domain (target)

In [18]:
# Per-file statistics for the formal side of Family_Relationships:
# the training file first, then the four tune references, then the four
# test references (same order as the printed reports).
fr_formal_dir = "./dataset/GYAFC_Corpus/Family_Relationships"
fr_formal_train = produce_stat_of_file(fr_formal_dir + "/train/formal")
(fr_formal_tune_ref0,
 fr_formal_tune_ref1,
 fr_formal_tune_ref2,
 fr_formal_tune_ref3) = [
    produce_stat_of_file(fr_formal_dir + "/tune/formal.ref" + str(ref_idx))
    for ref_idx in range(4)
]
(fr_formal_test_ref0,
 fr_formal_test_ref1,
 fr_formal_test_ref2,
 fr_formal_test_ref3) = [
    produce_stat_of_file(fr_formal_dir + "/test/formal.ref" + str(ref_idx))
    for ref_idx in range(4)
]

./dataset/GYAFC_Corpus/Family_Relationships/train/formal
Number of lines: 51967
Number of tokens: 681466
Average tokens per line 13.113437373717936
Number of meaningless token 252244
Number of word 13969
Number of meaningless word 2321
Singletons: 6381


./dataset/GYAFC_Corpus/Family_Relationships/tune/formal.ref0
Number of lines: 2788
Number of tokens: 34369
Average tokens per line 12.327474892395983
Number of meaningless token 12761
Number of word 2772
Number of meaningless word 198
Singletons: 1425


./dataset/GYAFC_Corpus/Family_Relationships/tune/formal.ref1
Number of lines: 2788
Number of tokens: 34050
Average tokens per line 12.213055954088953
Number of meaningless token 12524
Number of word 2787
Number of meaningless word 212
Singletons: 1489


./dataset/GYAFC_Corpus/Family_Relationships/tune/formal.ref2
Number of lines: 2788
Number of tokens: 34288
Average tokens per line 12.29842180774749
Number of meaningless token 12719
Number of word 2844
Number of meaningless word 206
Sin

In [20]:
# For every tune/test reference, count the word types that never occur in
# the Family_Relationships formal training file.
fr_formal_train_vocab = set(fr_formal_train['corpus_freq'].keys())
for oov_label, split_stats in (
    ("OOV tune ref0:", fr_formal_tune_ref0),
    ("OOV tune ref1:", fr_formal_tune_ref1),
    ("OOV tune ref2:", fr_formal_tune_ref2),
    ("OOV tune ref3:", fr_formal_tune_ref3),
    ("OOV test ref0:", fr_formal_test_ref0),
    ("OOV test ref1:", fr_formal_test_ref1),
    ("OOV test ref2:", fr_formal_test_ref2),
    ("OOV test ref3:", fr_formal_test_ref3),
):
  print(oov_label, len(set(split_stats['corpus_freq'].keys()) - fr_formal_train_vocab))

OOV tune ref0: 262
OOV tune ref1: 272
OOV tune ref2: 261
OOV tune ref3: 264
OOV test ref0: 137
OOV test ref1: 154
OOV test ref2: 152
OOV test ref3: 148


In [21]:
# Per-file statistics for the formal side of Entertainment_Music:
# the training file first, then the four tune references, then the four
# test references (same order as the printed reports).
em_formal_dir = "./dataset/GYAFC_Corpus/Entertainment_Music"
em_formal_train = produce_stat_of_file(em_formal_dir + "/train/formal")
(em_formal_tune_ref0,
 em_formal_tune_ref1,
 em_formal_tune_ref2,
 em_formal_tune_ref3) = [
    produce_stat_of_file(em_formal_dir + "/tune/formal.ref" + str(ref_idx))
    for ref_idx in range(4)
]
(em_formal_test_ref0,
 em_formal_test_ref1,
 em_formal_test_ref2,
 em_formal_test_ref3) = [
    produce_stat_of_file(em_formal_dir + "/test/formal.ref" + str(ref_idx))
    for ref_idx in range(4)
]

./dataset/GYAFC_Corpus/Entertainment_Music/train/formal
Number of lines: 52595
Number of tokens: 664090
Average tokens per line 12.626485407358114
Number of meaningless token 244513
Number of word 24028
Number of meaningless word 8755
Singletons: 12003


./dataset/GYAFC_Corpus/Entertainment_Music/tune/formal.ref0
Number of lines: 2877
Number of tokens: 36262
Average tokens per line 12.604101494612443
Number of meaningless token 13378
Number of word 4455
Number of meaningless word 688
Singletons: 2517


./dataset/GYAFC_Corpus/Entertainment_Music/tune/formal.ref1
Number of lines: 2877
Number of tokens: 36139
Average tokens per line 12.561348627042058
Number of meaningless token 13341
Number of word 4439
Number of meaningless word 688
Singletons: 2542


./dataset/GYAFC_Corpus/Entertainment_Music/tune/formal.ref2
Number of lines: 2877
Number of tokens: 36129
Average tokens per line 12.557872784150156
Number of meaningless token 13336
Number of word 4428
Number of meaningless word 681
Singl

In [22]:
# Inspect the word types of the Entertainment_Music formal training file
# that have no WordNet synset.
# NOTE(review): this prints the whole set (8755 entries according to the
# report above); consider showing only a sample.
print(em_formal_train["oov_vocab"])

{'cdusa', 'animae', 'www.dellhoroscope.com', 'hyun', 'philly', 'weenenes', 'choppa', 'crizzzib', 'proxytap.com', "i'm", 'brigida', 'fubu', 'michelle', '1.5', 'tagworld', '1990', 'debz', 'www.tocka.com.mk', 'applebees', 'andy', 'crocs', 'ramones', 'knightley', 'www.punkrockvids.com', 'bevieve', 'chobits', 'farrah', 'akm-rocking@yahoo.com', 'torres', 'mapquest', 'kasem', 'puscifer', 'heartwarming', 'brittania.co.uk', 'alcia', 'valerramma', 'eggman', '1812', 'chamilitary', 'jamie', 'gretchen', 'btjunkie.com', 'baird', 'bigtalltom', 'voltron', 'bittorents', 'v103', 'lynyrd', 'rhinannon', 'kenison', 'krutch', 'narusegawa', 'wo', 'ferrell', 'badonkadonk', 'yun', 'duex', 'elford', 'dead.¨', 'trish', 'brode', 'remixed', 'simon´s', 'him', 'metalica', 'daughtry', 'www.myspace.com/peoplemusic', 'helloween', 'cdl', 'raboti', 'santana', 'movae', 'fiona', 'ott', 'sk8baorder', '995', 'these', 'darren', 'bandra', 'lamborghini', 'judgemental', 'repopulate', 'royksopp', 'coleen', 'a.c.', 'deadalive', 'n

In [23]:
# For every tune/test reference, count the word types that never occur in
# the Entertainment_Music formal training file.
em_formal_train_vocab = set(em_formal_train['corpus_freq'].keys())
for oov_label, split_stats in (
    ("OOV tune ref0:", em_formal_tune_ref0),
    ("OOV tune ref1:", em_formal_tune_ref1),
    ("OOV tune ref2:", em_formal_tune_ref2),
    ("OOV tune ref3:", em_formal_tune_ref3),
    ("OOV test ref0:", em_formal_test_ref0),
    ("OOV test ref1:", em_formal_test_ref1),
    ("OOV test ref2:", em_formal_test_ref2),
    ("OOV test ref3:", em_formal_test_ref3),
):
  print(oov_label, len(set(split_stats['corpus_freq'].keys()) - em_formal_train_vocab))

OOV tune ref0: 527
OOV tune ref1: 525
OOV tune ref2: 530
OOV tune ref3: 545
OOV test ref0: 286
OOV test ref1: 269
OOV test ref2: 254
OOV test ref3: 251


In [24]:
# Merge the statistics of all nine Family_Relationships formal files
# (train + 4 tune references + 4 test references) into one summary.
fr_formal_stats = produce_total_stats([
    fr_formal_train,
    fr_formal_tune_ref0,
    fr_formal_tune_ref1,
    fr_formal_tune_ref2,
    fr_formal_tune_ref3,
    fr_formal_test_ref0,
    fr_formal_test_ref1,
    fr_formal_test_ref2,
    fr_formal_test_ref3,
])

Number of lines: 68447
Number of tokens: 887437
Average tokens per line 12.965316230075825
Number of meaningless token 328538
Number of word 15061
Number of meaningless word 2680
Singletons: 6667




In [25]:
# Merge the statistics of all nine Entertainment_Music formal files
# (train + 4 tune references + 4 test references) into one summary.
em_formal_stats = produce_total_stats([
    em_formal_train,
    em_formal_tune_ref0,
    em_formal_tune_ref1,
    em_formal_tune_ref2,
    em_formal_tune_ref3,
    em_formal_test_ref0,
    em_formal_test_ref1,
    em_formal_test_ref2,
    em_formal_test_ref3,
])

Number of lines: 69767
Number of tokens: 881052
Average tokens per line 12.62849198044921
Number of meaningless token 324631
Number of word 25776
Number of meaningless word 9443
Singletons: 12293




In [26]:
# Combine both domain summaries into the overall formal-side statistics.
formal_stats = produce_total_stats([
    em_formal_stats,
    fr_formal_stats
])

Number of lines: 138214
Number of tokens: 1768489
Average tokens per line 12.795295700869666
Number of meaningless token 653169
Number of word 31133
Number of meaningless word 11330
Singletons: 14516




In [27]:
# Bundle every formal-side statistics dict under its variable name so the
# whole collection can be serialized as one object (the pickle dump in the
# next cell is currently commented out).
all_formal_stats = dict(
    formal_stats=formal_stats,
    em_formal_stats=em_formal_stats,
    fr_formal_stats=fr_formal_stats,
    fr_formal_train=fr_formal_train,
    fr_formal_tune_ref0=fr_formal_tune_ref0,
    fr_formal_tune_ref1=fr_formal_tune_ref1,
    fr_formal_tune_ref2=fr_formal_tune_ref2,
    fr_formal_tune_ref3=fr_formal_tune_ref3,
    fr_formal_test_ref0=fr_formal_test_ref0,
    fr_formal_test_ref1=fr_formal_test_ref1,
    fr_formal_test_ref2=fr_formal_test_ref2,
    fr_formal_test_ref3=fr_formal_test_ref3,
    em_formal_train=em_formal_train,
    em_formal_tune_ref0=em_formal_tune_ref0,
    em_formal_tune_ref1=em_formal_tune_ref1,
    em_formal_tune_ref2=em_formal_tune_ref2,
    em_formal_tune_ref3=em_formal_tune_ref3,
    em_formal_test_ref0=em_formal_test_ref0,
    em_formal_test_ref1=em_formal_test_ref1,
    em_formal_test_ref2=em_formal_test_ref2,
    em_formal_test_ref3=em_formal_test_ref3,
)

In [None]:
# with open("./formal_stats.pkl", 'wb') as file:
#     pickle.dump(all_formal_stats, file)

## Informal domain (source)

In [28]:
# Per-file statistics for the informal side of Family_Relationships,
# computed in the order train, tune, test.
fr_informal_dir = "./dataset/GYAFC_Corpus/Family_Relationships"
fr_informal_train, fr_informal_tune, fr_informal_test = [
    produce_stat_of_file(fr_informal_dir + "/" + split_name + "/informal")
    for split_name in ("train", "tune", "test")
]

./dataset/GYAFC_Corpus/Family_Relationships/train/informal
Number of lines: 51967
Number of tokens: 656066
Average tokens per line 12.624665653202994
Number of meaningless token 239226
Number of word 17891
Number of meaningless word 8238
Singletons: 10075


./dataset/GYAFC_Corpus/Family_Relationships/tune/informal
Number of lines: 2788
Number of tokens: 34191
Average tokens per line 12.263629842180775
Number of meaningless token 12429
Number of word 3128
Number of meaningless word 820
Singletons: 1782


./dataset/GYAFC_Corpus/Family_Relationships/test/informal
Number of lines: 1332
Number of tokens: 16951
Average tokens per line 12.725975975975976
Number of meaningless token 6188
Number of word 2089
Number of meaningless word 488
Singletons: 1237




In [29]:
# Count the word types of the tune and test files that never occur in the
# Family_Relationships informal training file.
fr_informal_train_vocab = set(fr_informal_train['corpus_freq'].keys())
for oov_label, split_stats in (
    ("OOV tune:", fr_informal_tune),
    ("OOV test:", fr_informal_test),
):
  print(oov_label, len(set(split_stats['corpus_freq'].keys()) - fr_informal_train_vocab))

OOV tune: 536
OOV test: 268


In [30]:
# Per-file statistics for the informal side of Entertainment_Music,
# computed in the order train, tune, test.
em_informal_dir = "./dataset/GYAFC_Corpus/Entertainment_Music"
em_informal_train, em_informal_tune, em_informal_test = [
    produce_stat_of_file(em_informal_dir + "/" + split_name + "/informal")
    for split_name in ("train", "tune", "test")
]

./dataset/GYAFC_Corpus/Entertainment_Music/train/informal
Number of lines: 52595
Number of tokens: 638538
Average tokens per line 12.14065975853218
Number of meaningless token 240502
Number of word 29095
Number of meaningless word 15679
Singletons: 16515


./dataset/GYAFC_Corpus/Entertainment_Music/tune/informal
Number of lines: 2877
Number of tokens: 34862
Average tokens per line 12.117483489746263
Number of meaningless token 12793
Number of word 4620
Number of meaningless word 1322
Singletons: 2806


./dataset/GYAFC_Corpus/Entertainment_Music/test/informal
Number of lines: 1416
Number of tokens: 17447
Average tokens per line 12.32132768361582
Number of meaningless token 6432
Number of word 2910
Number of meaningless word 760
Singletons: 1783




In [31]:
# Count the word types of the tune and test files that never occur in the
# Entertainment_Music informal training file.
em_informal_train_vocab = set(em_informal_train['corpus_freq'].keys())
for oov_label, split_stats in (
    ("OOV tune:", em_informal_tune),
    ("OOV test:", em_informal_test),
):
  print(oov_label, len(set(split_stats['corpus_freq'].keys()) - em_informal_train_vocab))

OOV tune: 803
OOV test: 381


In [32]:
# Merge train, tune and test statistics of the Family_Relationships
# informal side into one summary.
fr_informal_stats = produce_total_stats([
    fr_informal_train,
    fr_informal_tune,
    fr_informal_test,
])

Number of lines: 56087
Number of tokens: 707208
Average tokens per line 12.609125109205342
Number of meaningless token 257843
Number of word 18691
Number of meaningless word 8755
Singletons: 10545




In [33]:
# Merge train, tune and test statistics of the Entertainment_Music
# informal side into one summary.
em_informal_stats = produce_total_stats([
    em_informal_train,
    em_informal_tune,
    em_informal_test,
])

Number of lines: 56888
Number of tokens: 690847
Average tokens per line 12.143984671635494
Number of meaningless token 259727
Number of word 30272
Number of meaningless word 16393
Singletons: 17170




In [34]:
# Combine both domain summaries into the overall informal-side statistics.
informal_stats = produce_total_stats([
    em_informal_stats,
    fr_informal_stats
])

Number of lines: 112975
Number of tokens: 1398055
Average tokens per line 12.37490595264439
Number of meaningless token 517570
Number of word 39549
Number of meaningless word 22820
Singletons: 22623




In [35]:
# Bundle every informal-side statistics dict under its variable name so the
# whole collection can be serialized as one object (the pickle dump in the
# next cell is currently commented out).
all_informal_stats = dict(
    informal_stats=informal_stats,
    em_informal_stats=em_informal_stats,
    fr_informal_stats=fr_informal_stats,
    fr_informal_train=fr_informal_train,
    fr_informal_tune=fr_informal_tune,
    fr_informal_test=fr_informal_test,
    em_informal_train=em_informal_train,
    em_informal_tune=em_informal_tune,
    em_informal_test=em_informal_test,
)

In [36]:
# with open("./informal_stats.pkl", 'wb') as file:
#     pickle.dump(all_informal_stats, file)

In [37]:
# with open("./informal_stats.pkl", "rb") as file:
#     test = pickle.load(file)

In [38]:
# print(test['informal_stats'])

# EDA with CoEdit

In [10]:
# Load the 'de-en' configuration of the 'wmt14' dataset via `datasets`.
# NOTE(review): the section heading says "CoEdit" but this cell loads
# WMT14 - confirm which dataset is intended here.
wmt14 = load_dataset('wmt14', 'de-en')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.97k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.62k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/41.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/658M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/919M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/80.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

In [11]:
# Show the available splits with their features and row counts.
print(wmt14)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 4508785
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})


In [12]:
def produce_stat_of_dataset(dataset):
  """Print and return token statistics for the English side of a translation dataset.

  Each row's English text is tokenized with the module-level spaCy pipeline
  ``nlp``. Tokens are counted by their lower-cased surface form. A token is
  treated as "meaningless" (out of vocabulary) when WordNet (``wn``) has no
  synset for its lower-cased lemma.

  Args:
    dataset: Iterable of rows in which ``row['translation']['en']`` holds the
      English text. Every row counts as one line.

  Returns:
    dict with the keys ``corpus_freq`` (dict of lower-cased token text to
    count), ``oov_vocab`` (set of lower-cased token texts without a synset),
    ``num_of_token``, ``num_of_line`` and ``num_of_oov_token``.
  """
  current_corpus_freq = dict()
  oov_vocab = set()
  num_of_token = 0
  num_of_line = 0
  num_of_oov_token = 0

  for row in dataset:
    line = row['translation']['en']
    num_of_line += 1

    for token in nlp(line):
      num_of_token += 1
      current_lemma = token.lemma_
      current_text = token.text.lower()

      add_to_dict(current_text, current_corpus_freq)

      # The lemma is looked up, but the surface form is what gets recorded.
      if len(wn.synsets(current_lemma.lower())) == 0:
        oov_vocab.add(current_text)
        num_of_oov_token += 1

  # Word types that occur exactly once in the dataset.
  singletons = sum(1 for count in current_corpus_freq.values() if count == 1)

  # Guard against an empty dataset.
  average_tokens = num_of_token / num_of_line if num_of_line else 0.0

  print("Number of lines:", num_of_line)
  print("Number of tokens:", num_of_token)
  print("Average tokens per line", average_tokens)
  print("Number of meaningless token", num_of_oov_token)
  print("Number of word", len(current_corpus_freq))
  print("Number of meaningless word", len(oov_vocab))
  print("Singletons:", singletons)
  print("\n")

  stats = dict(corpus_freq=current_corpus_freq, oov_vocab=oov_vocab, num_of_token=num_of_token, num_of_line=num_of_line, num_of_oov_token=num_of_oov_token)

  # Previously the dict was built but never returned, so callers got None.
  return stats

In [13]:
# Run the statistics over the English side of the full WMT14 training split
# (4,508,785 rows according to the recorded output); every row goes through
# the spaCy pipeline.
train_stat = produce_stat_of_dataset(wmt14['train'])

Number of lines: 4508785
Number of tokens: 118163923
Average tokens per line 26.20748671759687
Number of meaningless token 45971794
Number of word 774748
Number of meaningless word 691933
Singletons: 425417




In [None]:
# Take the English text of training row 20 as a sample sentence.
text = wmt14['train'][20]['translation']['en']

In [None]:
# Show the sample sentence.
print(text)