In [1]:
from time import time
from re import findall
from langdetect import detect
import torch
from nemo.collections.nlp.models import MTEncDecModel
from nemo.utils import logging
import contextlib
import csv
import json
import os

# Choose the mixed-precision context manager: CUDA AMP's autocast when a GPU is
# available and this torch build exposes it, otherwise a do-nothing stand-in so
# `with autocast():` works either way.
if not (torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast')):
    @contextlib.contextmanager
    def autocast():
        """No-op replacement used when CUDA AMP autocast is unavailable."""
        yield
else:
    autocast = torch.cuda.amp.autocast

from nltk import download, sent_tokenize
# Sentence-tokenizer data needed by `sent_tokenize` when long inputs are split.
download('punkt')

# Character-length settings read by `translate_text`.
_TEXT_LEN_LIMIT = 5000        # total input length above which a warning is logged
_TEXT_SPLIT_THRESHOLD = 1024  # a single text longer than this is split before translation
_SPLIT_LEN = 512              # target length of the pieces produced by splitting

# Restore onto the GPU when one is present; otherwise fall back to the CPU so
# the notebook still loads on CPU-only machines (the autocast selection above
# already handles the no-CUDA case).
_MODEL_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model = MTEncDecModel.restore_from("models/aayn_base.nemo", map_location=_MODEL_DEVICE)




[nltk_data] Downloading package punkt to /home/matejk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[NeMo W 2024-06-12 10:11:07 nlp_overrides:802] Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/apex
    Megatron-based models require Apex to function correctly.


[NeMo I 2024-06-12 10:11:55 tokenizer_utils:179] Getting YouTokenToMeTokenizer with model: /tmp/tmpykh0snj_/abc9d4e95fee471e8ff84ca15d818568_en_tokenizer.64000.BPE.model with r2l: False.
[NeMo I 2024-06-12 10:11:55 tokenizer_utils:179] Getting YouTokenToMeTokenizer with model: /tmp/tmpykh0snj_/d0f346b27aa14336bca69b91867db036_sl_tokenizer.64000.BPE.model with r2l: False.


[NeMo W 2024-06-12 10:11:55 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    src_file_name: /data/cjvt/v1.2.6/train.en
    tgt_file_name: /data/cjvt/v1.2.6/train.sl
    use_tarred_dataset: true
    tar_files: null
    metadata_file:
    - /data/cjvt/v1.2.6/en-sl/metadata.tokens.1024.json
    lines_per_dataset_fragment: 1000000
    num_batches_per_tarfile: 100
    shard_strategy: scatter
    tokens_in_batch: 1024
    clean: true
    max_seq_length: 512
    min_seq_length: 1
    cache_ids: false
    cache_data_per_node: false
    use_cache: false
    shuffle: true
    num_samples: -1
    drop_last: false
    pin_memory: true
    num_workers: 8
    reverse_lang_direction: false
    load_from_tarred_dataset: false
    metadata_path: null
    tar_shuffle_n: 100
    n_preproc_jobs: -2
    tar_file_prefix: parallel
    concat_sampling_tec

[NeMo I 2024-06-12 10:12:02 nlp_overrides:1110] Model MTEncDecModel was successfully restored from /mnt/c/Users/kranj/Desktop/mag-delo/models/aayn_base.nemo.


In [2]:
def _split_into_chunks(passage, max_len):
    """Cut `passage` into pieces of at most `max_len` characters.

    Pieces end at a whitespace boundary whenever one exists inside the window.
    A run without any whitespace in the window is cut after exactly `max_len`
    characters instead of being skipped, so no text is lost.
    """
    pattern = rf'(.{{1,{max_len}}})(?:\s|$)|(.{{{max_len}}})'
    # With two groups `findall` yields (soft, hard) tuples; exactly one is non-empty.
    return [soft or hard for soft, hard in findall(pattern, passage)]


def translate_text(item):
    """Translate one string or a list of strings with the global `model`.

    Texts longer than `_TEXT_SPLIT_THRESHOLD` characters are split into groups
    of sentences (sentences are added until the joined group reaches
    `_SPLIT_LEN` characters). A group that is still longer than the threshold
    is cut into chunks of at most `_SPLIT_LEN` characters. The translated
    pieces of a split text are joined back with single spaces.

    Args:
        item: a single string, or a list of strings.

    Returns:
        A list with one translated string per input text. A single string
        input therefore yields a one-element list (later cells index `[0]`).

    A warning is logged when the total input length exceeds `_TEXT_LEN_LIMIT`;
    the text is translated regardless.
    """
    text = [item] if isinstance(item, str) else item

    text_len = sum(len(_text) for _text in text)
    if text_len > _TEXT_LEN_LIMIT:
        logging.warning(f'{text}, text length exceded {text_len}c [max {_TEXT_LEN_LIMIT}c]')

    text_batch = []        # flat list of pieces sent to the model
    text_batch_split = []  # (start, end) ranges in text_batch that belong to one split text
    for _text in text:
        if len(_text) <= _TEXT_SPLIT_THRESHOLD:
            text_batch.append(_text)
            continue

        _split_start = len(text_batch)
        _sent = sent_tokenize(_text)
        i = 0
        while i < len(_sent):
            # Grow the group one sentence at a time until it reaches _SPLIT_LEN.
            j = i + 1
            while j < len(_sent) and len(' '.join(_sent[i:j])) < _SPLIT_LEN:
                j += 1
            _group = ' '.join(_sent[i:j])
            if len(_group) > _TEXT_SPLIT_THRESHOLD:
                text_batch.extend(_split_into_chunks(_group, _SPLIT_LEN))
            else:
                text_batch.append(_group)
            i = j
        text_batch_split.append((_split_start, len(text_batch)))

    # Nothing to translate (e.g. an empty list): skip the model call.
    translation_batch = model.translate(text_batch) if text_batch else []

    # Re-assemble: unsplit texts map one-to-one, split texts are re-joined.
    translation = []
    _start = 0
    for _split_start, _split_end in text_batch_split:
        translation.extend(translation_batch[_start:_split_start])
        translation.append(' '.join(translation_batch[_split_start:_split_end]))
        _start = _split_end
    translation.extend(translation_batch[_start:])

    torch.cuda.empty_cache()
    # `text` is always a list here, so the result is always a list as well.
    return translation


In [1]:
import numpy as np
from collections import defaultdict
from tqdm import tqdm


def process(anc, res, labels, filters):
    """Flatten, filter and regroup one batch of threads.

    Args:
        anc: one ancestor sequence per thread.
        res: for each thread, the sequence of its responses.
        labels: for each thread, the labels aligned with its responses.
        filters: predicates taking an (ancestors, response, label) triple;
            a triple is kept only if every predicate returns a truthy value.

    Returns:
        Three parallel lists (ancestors, responses, labels) with one entry per
        distinct ancestor sequence, in order of first appearance. Surviving
        responses and labels are collected under their ancestor sequence.
    """
    # One triple per individual response.
    triples = [
        (ancestors, response, label)
        for ancestors, responses, response_labels in zip(anc, res, labels)
        for response, label in zip(responses, response_labels)
    ]

    # Run the predicates one after another over the shrinking list.
    for keep in filters:
        triples = [triple for triple in triples if keep(triple)]

    # Ancestor sequences are turned into tuples so they can serve as dict keys.
    grouped = {}
    for ancestors, response, label in triples:
        kept_responses, kept_labels = grouped.setdefault(tuple(ancestors), ([], []))
        kept_responses.append(response)
        kept_labels.append(label)

    anc_ = [list(key) for key in grouped]
    res_ = [pair[0] for pair in grouped.values()]
    lab_ = [pair[1] for pair in grouped.values()]
    return anc_, res_, lab_


def filter_length(x, min_length=20, max_length=200):
    """Accept a triple whose response (x[1]) and every ancestor (items of x[0])
    have a length strictly between `min_length` and `max_length`."""
    def within_bounds(text):
        return min_length < len(text) < max_length

    if not within_bounds(x[1]):
        return False
    return np.all([within_bounds(ancestor) for ancestor in x[0]])

def filter_numbers(x, max_numbers=5):
    """Accept a triple whose response (x[1]) and every ancestor (items of x[0])
    each contain fewer than `max_numbers` digit characters 0-9."""
    def digit_count(text):
        return sum(text.count(digit) for digit in "0123456789")

    if digit_count(x[1]) >= max_numbers:
        return False
    return np.all([digit_count(ancestor) < max_numbers for ancestor in x[0]])

def filter_chars(x, max_chars=1, chars="@#$%^_+={}|<>;"):
    """Accept a triple whose response (x[1]) and every ancestor (items of x[0])
    each contain fewer than `max_chars` occurrences of the symbols in `chars`."""
    def symbol_count(text):
        return sum(text.count(symbol) for symbol in chars)

    if symbol_count(x[1]) >= max_chars:
        return False
    return np.all([symbol_count(ancestor) < max_chars for ancestor in x[0]])




def filter_dataset(docs, labels, batch_size=2056, filters = [filter_length, filter_numbers, filter_chars]):
    """Filter a whole dataset by running `process` over consecutive batches.

    Args:
        docs: mapping with "ancestors" and "responses" entries, aligned with `labels`.
        labels: per-thread label sequences; its length sets the number of threads.
        batch_size: number of threads handed to `process` at a time.
        filters: predicates forwarded unchanged to `process`.

    Returns:
        A pair `(out, labels)` where `out` is a dict with the filtered
        "ancestors" and "responses" lists and `labels` is the matching label list.
        Regrouping by ancestors happens inside each batch, not across batches.
    """
    total = len(labels)
    ancestors, responses = docs["ancestors"], docs["responses"]

    kept_anc, kept_res, kept_lab = [], [], []
    for start in tqdm(range(0, total, batch_size)):
        stop = start + batch_size
        batch_anc, batch_res, batch_lab = process(
            ancestors[start:stop], responses[start:stop], labels[start:stop], filters
        )
        kept_anc.extend(batch_anc)
        kept_res.extend(batch_res)
        kept_lab.extend(batch_lab)

    return {"ancestors": kept_anc, "responses": kept_res}, kept_lab


import pickle
from tqdm import tqdm

# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open("data/english/train", "rb") as file:
    train = pickle.load(file)
with open("data/english/test", "rb") as file:
    test = pickle.load(file)

with open("data/english/train_labels", "rb") as file:
    train_labels = pickle.load(file)
with open("data/english/test_labels", "rb") as file:
    test_labels = pickle.load(file)

train, train_labels = filter_dataset(train, train_labels)
test, test_labels = filter_dataset(test, test_labels)

# Merge train and test into new lists. The previous in-place `+=` extended the
# lists stored inside `train`, so `train` silently ended up holding the test
# data as well.
res = train["responses"] + test["responses"]
labels = train_labels + test_labels
# Each ancestor chain becomes a single string with " > " between its parts.
anc = [" > ".join(a) for a in train["ancestors"] + test["ancestors"]]

len(anc), len(res), len(labels)

100%|███████████████████████████████████████████████████████████████████████████████████| 63/63 [00:04<00:00, 13.69it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [00:01<00:00, 13.90it/s]


(120741, 120741, 120741)

In [None]:
# Translate each thread as one string: the ancestor chain followed by its
# responses, all separated by " ; ". Every appended entry is the list returned
# by `translate_text`, so the translated string itself sits at index 0.
translated_train = []
for i, (a, r) in tqdm(enumerate(zip(anc, res)), total=len(anc)):
    thread = a + " ; " + " ; ".join(r)
    translated_train.append(translate_text(thread))

# with open("data/slovene/train_filtered", "wb") as file:
#     pickle.dump(translated_train, file)

# test_translated = translate_docs(test)
# with open("data/test_filtered_translated", "wb") as file:
#     pickle.dump(test_translated, file)

# train_translated = translate_docs(train)
# with open("data/train_filtered_translated", "wb") as file:
#     pickle.dump(train_translated, file)
#translate_text(test)


In [18]:
# Load the Onion records; each record provides an "input" text and a "label".
with open("data/onion_data", "rb") as file:
    onion = pickle.load(file)

onion_data = [i["input"] for i in onion]
onion_labels = [i["label"] for i in onion]

onion_translated = []
for i, o in tqdm(enumerate(onion_data), total=len(onion_data)):
    onion_translated.append(translate_text(o))
    if i % 1000 != 0:
        continue
    # Checkpoint everything translated so far (every 1000 items, including i == 0).
    with open("data/slovene/onion_translated", "wb") as file:
        pickle.dump(onion_translated, file)

# Final save with the complete list of translations.
with open("data/slovene/onion_translated", "wb") as file:
    pickle.dump(onion_translated, file)

100%|███████████████████████████████████████████████████████████████████████████| 28619/28619 [1:39:10<00:00,  4.81it/s]


In [2]:
from langdetect import detect_langs

# Load the translated SARC threads (index 0 of every stored entry) and their labels.
with open("data/slovene/sarc_filtered_translated", "rb") as file:
    sarc_translated = [entry[0] for entry in pickle.load(file)]
with open("data/slovene/sarc_filtered_labels", "rb") as file:
    sarc_labels = pickle.load(file)

# Split every translated thread on ";" (after collapsing ";;" into ";"):
# the first piece is the ancestor text, the remaining pieces are the responses.
split_threads = [thread.replace(";;", ";").split(";") for thread in sarc_translated]
anc2 = [pieces[0] for pieces in split_threads]
res2 = [pieces[1:] for pieces in split_threads]

# Row-wise pairing of the English data (`anc`, `res` from the earlier cell)
# with the translated pieces and the labels.
d = zip(anc, res, anc2, res2, sarc_labels)
def f(x):
    """Keep a parallel SARC row only if its translated side is confidently Slovene.

    Args:
        x: tuple `(a1, r1, a2, r2, l)` holding the original ancestor text and
            responses, the translated ancestor text and responses, and the labels.

    Returns:
        `(x, confidence)` when both the translated ancestor text and the joined
        translated responses are detected as "sl" with probability above 0.99
        and the response and label counts agree (`len(r1) == len(r2) == len(l)`);
        `confidence` is the lower of the two probabilities. Otherwise `None`,
        which is also returned when language detection raises an error.
    """
    a1, r1, a2, r2, l = x
    detections = []
    try:
        # Ancestor text first, then all translated responses joined together.
        for text in (a2, " ".join(r2)):
            top = detect_langs(text)[0]
            detections.append((top.lang, top.prob))
    except Exception:
        # Detection failures drop the row. Unlike a bare `except:`, this lets
        # KeyboardInterrupt/SystemExit through so the run can be stopped.
        return None
    (lang, prob), (lang2, prob2) = detections
    if lang == lang2 == "sl" and min(prob, prob2) > 0.99 and len(r1) == len(r2) == len(l):
        return x, min(prob, prob2)
    return None

from multiprocessing import Pool
# langdetect is non-deterministic unless its seed is fixed; pin it so the 0.99
# confidence filter keeps the same rows on every run.
# NOTE(review): assumes the pool workers inherit this setting (fork start method) — confirm.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

with Pool(16) as pool:
    d = pool.map(f, d)

# Collect the rows accepted by `f` (rejected rows come back as None).
dataset = {key: list() for key in ("eng_anc", "eng_res", "slo_anc", "slo_res", "labels", "prob")}
for d_ in d:
    if not d_: continue
    (a1, r1, a2, r2, l), prob = d_
    dataset["eng_anc"].append(a1.replace(">", ""))
    dataset["eng_res"].append(r1)
    dataset["slo_anc"].append(a2.replace(">", ""))
    dataset["slo_res"].append(r2)
    dataset["labels"].append([int(l_) for l_ in l])
    dataset["prob"].append(prob)

# Order every column by ascending detection confidence. The lists are indexed
# directly: np.array(..., dtype=object) builds a 2-D array when all rows happen
# to have equal length, which would turn the rows into ndarrays instead of lists.
s = np.argsort(dataset["prob"])
for k in dataset:
    dataset[k] = [dataset[k][idx] for idx in s]

with open("data/parallel/SARC_PARALLEL", "wb") as file:
    pickle.dump(dataset, file)


In [4]:
# Original Onion records and their stored translations, read back from disk.
with open("data/onion_data", "rb") as file:
    onion_original = pickle.load(file)
with open("data/slovene/onion_translated", "rb") as file:
    onion_translated = pickle.load(file)

# Pair each original record with its translation, position by position.
d = zip(onion_original, onion_translated)

def f(x):
    """Keep an Onion row only if its translation is confidently Slovene.

    Args:
        x: pair `(o, t)` where `o` is the original record (with "input" and
            "label" entries) and `t` is the stored translation, whose first
            element is the translated string.

    Returns:
        `(input_text, translated_text, label, probability)` when the translated
        text is detected as "sl" with probability above 0.99; otherwise `None`,
        which is also returned when language detection raises an error.
    """
    o, t = x
    t = t[0]
    l = o["label"]
    e = o["input"]
    try:
        t_ = detect_langs(t)[0]
        lang, prob = t_.lang, t_.prob
    except Exception:
        # Detection failures drop the row. Unlike a bare `except:`, this lets
        # KeyboardInterrupt/SystemExit through so the run can be stopped.
        return None
    if lang == "sl" and prob > 0.99:
        return e, t, l, prob
    return None

# langdetect is non-deterministic unless its seed is fixed; pin it so the 0.99
# confidence filter keeps the same rows on every run.
# NOTE(review): assumes the pool workers inherit this setting (fork start method) — confirm.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

with Pool(16) as pool:
    d = pool.map(f, d)

# Collect the rows accepted by `f` (rejected rows come back as None).
dataset_onion = {key: list() for key in ("eng", "slo", "labels", "prob")}
for d_ in d:
    if not d_: continue
    e, t, l, prob = d_
    dataset_onion["eng"].append(e)
    dataset_onion["slo"].append(t)
    dataset_onion["labels"].append(l)
    dataset_onion["prob"].append(prob)

# Order every column by ascending detection confidence. The lists are indexed
# directly so the pickled values stay plain Python objects; going through
# np.array(...) would store NumPy scalar types instead.
s = np.argsort(dataset_onion["prob"])
for k in dataset_onion:
    dataset_onion[k] = [dataset_onion[k][idx] for idx in s]


with open("data/parallel/ONION_PARALLEL", "wb") as file:
    pickle.dump(dataset_onion, file)
