In [1]:
import torch
torch.cuda.is_available()

True

In [10]:
import pdftotext

def convert_to_txt(pdf_file, txt_file):
    # Load your PDF
    with open(pdf_file, "rb") as f:
        pdf = pdftotext.PDF(f)

#     # Read some individual pages
#     print(pdf[0])
#     print(pdf[1])
    with open(txt_file, "w") as f:
        for page in pdf:
            f.write(page)

convert_to_txt("data/eng-t4t_all.pdf", "data/eng.txt")

In [11]:
convert_to_txt("data/txuNT_all.pdf", "data/txu.txt")

In [69]:
def parallelize(txt1, txt2, res_file):
    with open(txt1, encoding="utf-8") as f_en, open(txt2, encoding="utf-8") as f_txu:
        eng_lines = f_en.readlines()
        txu_lines = f_txu.readlines()

    with open(res_file, "w", encoding="utf-8") as out:
        for en, tx in zip(eng_lines, txu_lines):
            out.write(f"{en.strip()}\t{tx.strip()}\n")
        
parallelize("data/eng.txt", "data/txu.txt", "data/parallel.txt")

In [1]:
eng_data = []
txu_data = []
with open("data/parallel.txt", "r") as f:
    for line in f:
        line.strip('\n')
        eng, txu = line.split('\t')
        eng_data.append(eng)
        txu_data.append(txu)
        
eng_data[5]
txu_data[5]

'copyright © 2012 Wycliffe Bible Translators, Inc.\n'

In [2]:
from datasets import load_dataset

data = load_dataset(
    "csv",
    data_files="data/parallel.txt",
    delimiter="\t",
    column_names=["en","txu"],
    split="train"
)

data = data.train_test_split(test_size=0.1)

In [3]:
print(data)

DatasetDict({
    train: Dataset({
        features: ['en', 'txu'],
        num_rows: 54225
    })
    test: Dataset({
        features: ['en', 'txu'],
        num_rows: 6026
    })
})


In [4]:
from datasets import DatasetDict
import nlpaug.augmenter.word as naw

train_dataset = data['train']
test_dataset = data['test']
aug = naw.SynonymAug(aug_src='wordnet')

def augment_batch(batch):
    return {
        'en': [aug.augment(text) for text in batch['en']],
        'txu': batch['txu']
    }

train_aug_dataset = train_dataset.map(augment_batch, batched=True, batch_size=32)
test_aug_dataset = test_dataset.map(augment_batch, batched=True, batch_size=32)

Map:   0%|          | 0/54225 [00:00<?, ? examples/s]

Map:   0%|          | 0/6026 [00:00<?, ? examples/s]

In [5]:
from datasets import concatenate_datasets

def join_en(example):
    if isinstance(example["en"], list):
        return {"en": " ".join(example["en"])}
    return example

train_aug_dataset = train_aug_dataset.map(join_en)
test_aug_dataset = test_aug_dataset.map(join_en)

train_dataset = concatenate_datasets([train_dataset, train_aug_dataset])
test_dataset = concatenate_datasets([test_dataset, test_aug_dataset])

Map:   0%|          | 0/54225 [00:00<?, ? examples/s]

Map:   0%|          | 0/6026 [00:00<?, ? examples/s]

In [6]:
import nlpaug.augmenter.word as naw

aug = naw.RandomWordAug(action="swap")

def augment_batch(batch):
    return {
        'en': [aug.augment(text) for text in batch['en']],
        'txu': batch['txu']
    }

train_aug_dataset = train_dataset.map(augment_batch, batched=True, batch_size=32)
test_aug_dataset = test_dataset.map(augment_batch, batched=True, batch_size=32)

Map:   0%|          | 0/108450 [00:00<?, ? examples/s]

Map:   0%|          | 0/12052 [00:00<?, ? examples/s]

In [7]:
train_aug_dataset = train_aug_dataset.map(join_en)
test_aug_dataset = test_aug_dataset.map(join_en)

train_dataset = concatenate_datasets([train_dataset, train_aug_dataset])
test_dataset = concatenate_datasets([test_dataset, test_aug_dataset])

Map:   0%|          | 0/108450 [00:00<?, ? examples/s]

Map:   0%|          | 0/12052 [00:00<?, ? examples/s]

In [8]:
train_dataset

Dataset({
    features: ['en', 'txu'],
    num_rows: 216900
})

In [9]:
from datasets import DatasetDict

dataset = DatasetDict({
    'train': train_dataset.shuffle(seed=42),
    'test': data['test']
})

dataset.save_to_disk("augmented_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/216900 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6026 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")

def preprocess(example):
    # prefix determine translation direction
    prefix = "<en>"
    enc = tokenizer(prefix + " " + example["en"],
                    truncation=True, max_length=128)
    dec = tokenizer(example["txu"],
                    truncation=True, max_length=128)
    return {
        "input_ids": enc.input_ids,
        "attention_mask": enc.attention_mask,
        "labels": dec.input_ids
    }

# filter null lines
data = data.filter(lambda x: x["en"] is not None and x["txu"] is not None)
tokenized = data.map(preprocess, remove_columns=["en","txu"])

Filter:   0%|          | 0/54225 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6026 [00:00<?, ? examples/s]

Map:   0%|          | 0/41519 [00:00<?, ? examples/s]

Map:   0%|          | 0/4609 [00:00<?, ? examples/s]

In [7]:
print(tokenized)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 41519
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4609
    })
})


In [None]:
from transformers import pipeline


In [6]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from torch.cuda.amp import GradScaler, autocast

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

In [5]:
from concurrent.futures import ProcessPoolExecutor, as_completed

def preprocess_text_parallel(texts, n_jobs = None, method = 'process'):
    if n_jobs is None:
        n_jobs = cpu_count()
    
    if method == 'process':
        with ProcessPoolExecutor(max_workers=n_jobs) as executor:
            results = list(executor.map(preprocess_text, texts))
    elif method == 'thread':
        with ThreadPoolExecutor(max_workers=n_jobs) as executor:
            results = list(executor.map(preprocess_text, texts))
    else:
        raise ValueError("method deve ser 'process' ou 'thread'")
    
    return results


In [7]:
from multiprocessing import Pool, cpu_count
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

def preprocess_text_vectorized(texts):
    """
    Otimize using vectorization
    """
    # Batch transforms
    texts_lower = [text.lower() for text in texts]
    
    # Remove digits and specials chars in batch
    cleaned_texts = []
    for text in texts_lower:
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'[^\w\s]', '', text)
        cleaned_texts.append(text)
    
    # parallel tokenize
    return preprocess_text_parallel(cleaned_texts, method='thread')

eng_tokens = preprocess_text_vectorized(eng_data)
print(eng_tokens[:10])

[['ii'], [], ['translation', 'for', 'translators'], [], ['a', 'bible', 'translation', 'for', 'bible', 'translators', 'which', 'makes', 'implied', 'information', 'explicit', 'in', 'the'], ['text', 'as', 'an', 'aid', 'to', 'the', 'translator', 'who', 'may', 'need', 'that', 'information', 'to', 'correctly', 'translate', 'into', 'a'], ['particular', 'language'], [], ['copyright', 'ellis', 'w', 'deibler', 'jr'], ['language', 'english']]


In [8]:
len(eng_tokens)

60251

In [9]:
txu_tokens = preprocess_text_vectorized(txu_data)
len(txu_tokens)

60251

In [100]:
txu_tokens[200]

['nhym', 'kam', 'djekonij', 'arỳm', 'xarati']

In [101]:
eng_tokens[200]

['that',
 'are',
 'different',
 'from',
 'every',
 'other',
 'languagejust',
 'like',
 'every',
 'language',
 'has',
 'a',
 'set',
 'of']

In [12]:
%%bash
pip install ipywidgets
jupyter nbextension enable --py widgetsnbextension

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.6/216.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m4.9 MB/s[0m eta [36m0:00

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [13]:
%%bash
jupyter nbextension install --py widgetsnbextension
jupyter nbextension enable --py widgetsnbextension

Installing /usr/local/lib/python3.10/dist-packages/widgetsnbextension/static -> jupyter-js-widgets
Up to date: /usr/local/share/jupyter/nbextensions/jupyter-js-widgets/extension.js.map
Up to date: /usr/local/share/jupyter/nbextensions/jupyter-js-widgets/extension.js
Up to date: /usr/local/share/jupyter/nbextensions/jupyter-js-widgets/extension.js.LICENSE.txt
- Validating: [32mOK[0m

    To initialize this nbextension in the browser every time the notebook (or other app) loads:
    
          jupyter nbextension enable widgetsnbextension --py
    
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [14]:
from tqdm.notebook import tqdm
from transformers import NllbTokenizer

tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
tokenizer("How was your day?").input_ids

[256047, 13374, 1398, 4260, 4039, 248130, 2]