In [None]:
# !pip install datasets
# !pip install bleu
# !pip install evaluate

In [None]:
# connect to google drive to store the 
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
import torch
import evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
df = pd.read_csv('/kaggle/input/translation/translation_train.csv')
df.head()

Unnamed: 0,English,Arabic
0,I have been dealt four aces.,لقد رميت اربع اوراق من نوع أس.
1,rejoice over the verdict.,ابتهاج نتيجة اصدار الحكم
2,today the people of south africa marched in su...,اليوم شعب جنوب افريقيا يدعم الفلسطينيين الذين ...
3,earlier this month the ministry of health decl...,وقد صرحت وزارة الصحة باكرا هذا الشهر ان اكثر م...
4,sharek posted this video titled sharek partici...,نشر هذا الفيديو تحت عنوان شارك داعيا الشعب للا...


In [9]:
df.shape

(23406, 2)

In [10]:
train_df, test_df = train_test_split(df, test_size=.2, random_state=42)
train_df.shape, test_df.shape

((18724, 2), (4682, 2))

In [11]:
# Convert DataFrames to Dataset objects
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [12]:
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ar-en")

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [13]:
source_lang = "English"
target_lang = "Arabic"
prefix = "translate English to Arabic: "


def preprocess_function(examples):
    """ 
        encode input text to input ids and attention mask 
        input:
            source language (English or Arabic)
            target language (Arabic or English)
        output:
            encoding of the source and target languages
            
            source: input_ids, attension_maks
            target: input_ids

        
    """
    inputs = [prefix + example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [14]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/18724 [00:00<?, ? examples/s]



In [15]:
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4682 [00:00<?, ? examples/s]

In [16]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [17]:
# !pip install accelerate -U
# !pip install transformers[torch]

In [18]:
# training configurations 
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    fp16=True,
)

In [19]:
# device to run the model on
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## why BLEU?

- **The BLEU score** is a number between zero and one that measures the similarity of the machine-translated text to a set of high quality reference translations.

In [None]:
# blue metrics for evaluation 
metric = evaluate.load("bleu")

In [20]:
trainer = Seq2SeqTrainer(
    model=model.to(device),
    args=training_args,
    train_dataset= tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics= metric
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [21]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,2.9473,2.528955
2,2.5794,2.304389
3,2.4068,2.175877
4,2.2893,2.097761
5,2.2217,2.052235
6,2.1454,2.039114


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}


TrainOutput(global_step=3516, training_loss=2.393181538283486, metrics={'train_runtime': 1763.6778, 'train_samples_per_second': 63.699, 'train_steps_per_second': 1.994, 'total_flos': 3759694137262080.0, 'train_loss': 2.393181538283486, 'epoch': 6.0})

In [24]:
def translate_text(text, max_length=5000):
    """
    Generates a summary for the given text using a pre-trained model.

    Args:
        text (str): The text to be summarized.
        max_length (int): The maximum length of the input text for the model.

    Returns:
        str: The generated summary of the input text.
    """
    # Encode the input text using the tokenizer. The 'pt' indicates PyTorch tensors.
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=max_length, truncation=False)

    # Move the encoded text to the same device as the model (e.g., GPU or CPU)
    inputs = inputs.to(device)

    # Generate translate IDs with the model. num_beams controls the beam search width.
    translate_ids = model.generate(inputs, max_length=2000, num_beams=30, early_stopping=False)

    # Decode the generated IDs to text.
    translation = tokenizer.decode(translate_ids[0], skip_special_tokens=True)

    # Return the generated translation
    return translation

In [33]:
# Prompt the user to enter text for summarization
text = input('Enter the text: ')
print()

# Call the summarize_text function to generate a summary of the input text
summary = translate_text(text)

Enter the text:  miguelmorenatti es the king of belgium will abdicate today while the king of spain keeps on thinking that abdicate means an auto rental firm note abdicate in spanish is abdicar.





In [34]:
summary

'ميجويلموريناتتي بلجيوم سيبديكاتي يوم اليوم وماذا يقول على على اعبديقي انكليزي اوتو رنتال في الاسبانية هي ابديكار'