## Facebook NLLB Translator

<br>

Development Environment
<br>Data Exploration
<br>NHNDQ/nllb-finetuned-en2ko

<br>

### Development Environment

In [6]:
import datetime
import glob
import json
# import jsonlines
from pathlib import Path

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

### Data Exploration

In [2]:
# Load the lmqg/qg_squad question-generation dataset through the `datasets` library.
data = load_dataset("lmqg/qg_squad")

In [3]:
data

DatasetDict({
    train: Dataset({
        features: ['answer', 'paragraph_question', 'question', 'sentence', 'paragraph', 'sentence_answer', 'paragraph_answer', 'paragraph_sentence'],
        num_rows: 75722
    })
    validation: Dataset({
        features: ['answer', 'paragraph_question', 'question', 'sentence', 'paragraph', 'sentence_answer', 'paragraph_answer', 'paragraph_sentence'],
        num_rows: 10570
    })
    test: Dataset({
        features: ['answer', 'paragraph_question', 'question', 'sentence', 'paragraph', 'sentence_answer', 'paragraph_answer', 'paragraph_sentence'],
        num_rows: 11877
    })
})

In [18]:
# Load one JSON Lines shard into a DataFrame: each non-blank line is parsed
# as a single JSON record and becomes one row.
jsonl_data = "sample/train00.jsonl"
# Open as UTF-8 explicitly so decoding does not depend on the platform's
# default encoding.
with open(jsonl_data, encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
    jsonl_df = pd.DataFrame(json.loads(line) for line in f if line.strip())

In [22]:
# Keep only the two text columns that are translated later in the notebook.
sample_jsonl_df = jsonl_df[['paragraph_answer', 'question']]

In [23]:
# Write the two-column sample back out as JSON Lines (one record per line).
# The context manager closes the handle when the block ends, so the buffered
# data is guaranteed to be on disk before any later cell reads the file.
# NOTE: the file lands in sample/, so the later glob over ./sample/*.jsonl
# also picks it up and translates it.
with open('sample/test_train00.jsonl', 'w') as f:
    print(sample_jsonl_df.to_json(orient='records', lines=True), file=f, flush=False)

In [None]:
# Adjust pandas display options, then render the two text columns for inspection.
# NOTE(review): max_colwidth = 0 is presumably meant to disable cell truncation —
# confirm against the installed pandas version.
pd.set_option('display.max_colwidth', 0)
# max_rows = None removes the row limit, so every row of jsonl_df is rendered;
# consider .head() / .sample() if the output becomes too large.
pd.options.display.max_rows = None
jsonl_df[['paragraph_answer', 'question']]

### NHNDQ/nllb-finetuned-en2ko

In [15]:
def sorted_list(path_list):
    """Return the paths ordered by length, with ties broken alphabetically.

    Args:
        path_list: Iterable of path strings.

    Returns:
        A new list; the input is left unmodified.
    """
    # A single sort on a (length, value) key yields the same order as an
    # alphabetical sort followed by a stable sort on length.
    return sorted(path_list, key=lambda path: (len(path), path))

In [None]:
# Build the English -> Korean translation pipeline used by the cells below.
from transformers import pipeline 
from transformers.pipelines.pt_utils import KeyDataset
import datasets 
# Turn off the progress bars of the `datasets` library.
datasets.disable_progress_bar()

# Device index handed to pipeline(device=...).
# NOTE(review): presumably selects the second accelerator on the author's
# machine — adjust for hosts with a different device layout.
device_num = 1
translator = pipeline('translation', model='NHNDQ/nllb-finetuned-en2ko', # Model fine-tuned from facebook/nllb-200-distilled-600M
                      device=device_num, src_lang='eng_Latn', tgt_lang='kor_Hang', max_length=512)

In [2]:
# Smoke test: translate one English headline and print the result.
text = 'Lockheed Martin Delivers Initial 5G Testbed To U.S. Marine Corps And Begins Mobile Network Experimentation'
output = translator(text, max_length=512)
# The result is indexed as a sequence of dicts carrying a 'translation_text' key.
print(output[0]['translation_text'])

록히드마틴이 미국 해병대에 최초 5G 테스트베드를 전달하고 모바일 네트워크 실험에 나선다.


In [22]:
def _translate_column(split, column, max_length=1000, batch_size=64):
    """Translate one text column of a dataset split with the global ``translator``.

    Args:
        split: Dataset split to read from (here ``dataset['train']``).
        column: Name of the column whose values are translated.
        max_length: Length limit forwarded to the translation pipeline.
        batch_size: Batch size forwarded to the translation pipeline.

    Returns:
        list: Every ``translation_text`` value produced by the pipeline,
        in iteration order.
    """
    translations = []
    # Each item yielded by the pipeline is itself an iterable of dicts that
    # carry the translated string under 'translation_text'.
    for candidates in translator(KeyDataset(split, column), max_length=max_length, batch_size=batch_size):
        translations.extend(candidate['translation_text'] for candidate in candidates)
    return translations


# Translate every JSON Lines shard under ./sample and save one Excel workbook
# per shard under ./translation.
# NOTE: the glob matches every *.jsonl file in ./sample, including derived
# files written there by earlier cells.
jsonl_paths = glob.glob("./sample/*.jsonl")
jsonl_paths = sorted_list(jsonl_paths)
print("The number of jsonl file:", len(jsonl_paths))
print()

# Create the output directory up front so to_excel does not fail on a fresh checkout.
output_dir = Path("./translation")
output_dir.mkdir(parents=True, exist_ok=True)

for idx, jsonl_path in enumerate(jsonl_paths):

    # Derive names from the path components rather than replacing substrings,
    # which would also rewrite "sample"/"jsonl" occurring inside a file name.
    data_file = Path(jsonl_path).stem
    save_path = output_dir / f"{data_file}.xlsx"

    # Progress log: four shards per printed line.
    if idx % 4 == 0 and idx != 0:
        print()

    print(idx, data_file, end="  |  ")

    # The "json" loader exposes the file's records under the 'train' split.
    dataset = load_dataset("json", data_files=jsonl_path)
    train_split = dataset['train']

    paragraph_answer_translation = _translate_column(train_split, 'paragraph_answer')
    question_translation = _translate_column(train_split, 'question')

    translation_df = pd.DataFrame({'paragraph_answer': paragraph_answer_translation,
                                   'question': question_translation})

    translation_df.to_excel(save_path, index=False)

The number of jsonl file: 32

0 dev00
1 dev01




2 dev02
3 dev03
4 test00
5 test01
6 test02
7 test03
8 train00
9 train01
10 train02
11 train03
12 train04
13 train05
14 train06
15 train07
16 train08
17 train09
18 train10
19 train11
20 train12
21 train13
22 train14


Your input_length: 955 is bigger than 0.9 * max_length: 1000. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


23 train15
24 train16
25 train17
26 train18
27 train19
28 train20
29 train21
30 train22
31 test_train00


### Reference

<b>HuggingFace</b>
<br>[lmqg/qg_squad](https://huggingface.co/datasets/lmqg/qg_squad)
<br>[NHNDQ/nllb-finetuned-en2ko](https://huggingface.co/NHNDQ/nllb-finetuned-en2ko)
<br>[facebook/nllb-200-distilled-600M](https://huggingface.co/facebook/nllb-200-distilled-600M)

<br><b>Blog</b>
<br>[Meta에 항상 감사하십시오. NLLB-200 모델을 이용한 기계번역](https://int-i.github.io/python/2023-09-05/nllb-en-ko-translation/)