In [1]:
!pip install datasets evaluate transformers sacrebleu
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import TFAutoModelForSeq2SeqLM
from transformers import create_optimizer
import tensorflow as tf
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np
from tqdm import tqdm
from IPython.display import clear_output
clear_output()

In [2]:
raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")

Downloading builder script:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading and preparing dataset kde4/en-fr (download: 6.72 MiB, generated: 24.46 MiB, post-processed: Unknown size, total: 31.18 MiB) to /root/.cache/huggingface/datasets/kde4/en-fr-lang1=en,lang2=fr/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac...


Downloading data:   0%|          | 0.00/7.05M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/210173 [00:00<?, ? examples/s]

Dataset kde4 downloaded and prepared to /root/.cache/huggingface/datasets/kde4/en-fr-lang1=en,lang2=fr/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

In [4]:
raw_datasets['train'][0:3]

{'id': ['0', '1', '2'],
 'translation': [{'en': 'Lauri Watts', 'fr': 'Lauri Watts'},
  {'en': '& Lauri. Watts. mail;', 'fr': '& Lauri. Watts. mail;'},
  {'en': 'ROLES_OF_TRANSLATORS', 'fr': '& traducteurJeromeBlanc;'}]}

In [5]:
raw_datasets=raw_datasets['train'].train_test_split(train_size=150000,test_size=10000)
intermediated=raw_datasets['test'].train_test_split(train_size=500,test_size=9500)
raw_datasets['test']=intermediated['train']
raw_datasets['validation']=intermediated['test']
#we select small test set as compute bleu take large time

In [13]:
data_for_test=raw_datasets['test']['translation'][:10]

In [14]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"

tokenizer=AutoTokenizer.from_pretrained(model_checkpoint)


In [16]:
max_length=128
def tokeizer_func(example):
    
    en=[i['en'] for i in example['translation']]
    fr=[i['fr'] for i in example['translation']]
    
    return tokenizer(en,text_target=fr,max_length=max_length,truncation =True)

In [17]:
raw_datasets=raw_datasets.map(tokeizer_func,batched=True,remove_columns=raw_datasets['train'].column_names)

  0%|          | 0/150 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

Exception ignored in: <function tqdm.__del__ at 0x7b1764670700>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/tqdm/std.py", line 1144, in __del__
    def __del__(self):
KeyboardInterrupt: 


In [18]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9500
    })
})

In [19]:
model=TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint,from_pt=True)
model.summary()

Downloading pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFMarianMTModel.

All the weights of TFMarianMTModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


Model: "tf_marian_mt_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  75133952  
                                                                 
 final_logits_bias (BiasLaye  multiple                 59514     
 r)                                                              
                                                                 
Total params: 75,193,466
Trainable params: 75,133,952
Non-trainable params: 59,514
_________________________________________________________________


In [20]:
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer,model=model,return_tensors='tf')

In [21]:
sample=data_collator([raw_datasets['train'][i]for i in [1,2]])
print(sample.keys())
print(tokenizer.decode(sample['decoder_input_ids'][0]))

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])
<pad> La police à employer pour les menus dans les applications.


In [22]:
raw_datasets['train']

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 150000
})

In [23]:
train_data=raw_datasets['train'].to_tf_dataset(columns=['input_ids', 'attention_mask'], batch_size=32
                                               , shuffle=True, collate_fn=data_collator
                                              ,label_cols='labels')
validation_data=raw_datasets['validation'].to_tf_dataset(columns=['input_ids', 'attention_mask'], batch_size=32
                                               , shuffle=True, collate_fn=data_collator
                                              ,label_cols='labels')
test_data=raw_datasets['test'].to_tf_dataset(columns=['input_ids', 'attention_mask'], batch_size=32
                                               , shuffle=False, collate_fn=data_collator
                                              ,label_cols='labels')

In [24]:
num_epochs = 3
num_train_steps = len(train_data) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")
model.evaluate(test_data)



1.6725298166275024

In [25]:
import evaluate
metric=evaluate.load('sacrebleu')

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [26]:
def compute_bleu(validation_data):
    all_label=[]
    predictions=[]
    count=0
    for batch,labels in test_data:

        prediction=model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_new_tokens=128,)
        prediction=tokenizer.batch_decode(prediction,skip_special_tokens=True)
        label=labels.numpy()
        label=np.where(label!=-100,label,tokenizer.pad_token_id)

        label=tokenizer.batch_decode(label,skip_special_tokens=True)

        prediction=[i.strip() for i in prediction]
        label=[i.strip() for i in label]
        all_label.extend(label)
        predictions.extend(prediction)
        count+=1
        print(count)
    return all_label,predictions
    

In [None]:
all_label,predictions=compute_bleu(test_data)

In [19]:
metric.compute(predictions=predictions, references=all_label)

{'score': 19.084623940865942,
 'counts': [3437, 2317, 1660, 1198],
 'totals': [11151, 10651, 10216, 9839],
 'precisions': [30.822347771500315,
  21.753825931837387,
  16.24902114330462,
  12.176034149811972],
 'bp': 1.0,
 'sys_len': 11151,
 'ref_len': 5463}

In [28]:
%%time
model.fit(
    train_data,
    validation_data=validation_data,
    epochs=num_epochs,
)


Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 1h 27min 24s, sys: 32min 10s, total: 1h 59min 34s
Wall time: 2h 24min 31s


<keras.callbacks.History at 0x7b161ac2e530>

In [29]:
model.evaluate(test_data)



0.8840053677558899

In [30]:
all_label,predictions=compute_bleu(test_data)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16


In [31]:
metric.compute(predictions=predictions, references=all_label)

{'score': 31.843589599426945,
 'counts': [3813, 2776, 2146, 1704],
 'totals': [8540, 8040, 7602, 7212],
 'precisions': [44.64871194379391,
  34.527363184079604,
  28.22941331228624,
  23.62728785357737],
 'bp': 1.0,
 'sys_len': 8540,
 'ref_len': 5274}

In [35]:
transformers.

In [48]:
from transformers import pipeline
translation_model=pipeline('translation',model=model,tokenizer=tokenizer)



In [52]:
print('english sentence',data_for_test[0]['en'])
print('expected output',data_for_test[0]['fr'])
print('translation from model',translation_model(data_for_test[0]['en']))


english sentence Construct the parallel line through this point
expected output Construire la droite parallèle passant par ce point
translation from model [{'translation_text': 'Construire la droite parallèle passant par ce point'}]


In [58]:
print('english sentence',data_for_test[1]['en'])
print('expected output',data_for_test[1]['fr'])
print('translation from model',translation_model(data_for_test[1]['en']))


english sentence Select Gimp Curves File to Load
expected output Sélectionner le fichier des courbes GIMP à charger
translation from model [{'translation_text': 'Sélectionner le fichier des courbes GIMP à charger'}]


In [None]:
#you can increase accuracy by increase number of eboch ,increase training data 