# Task 3: machine translation with pre-trained models

In [3]:
# List everything under /kaggle/input/ to confirm which datasets are available to this notebook
!ls /kaggle/input/

english-to-french


In [2]:
import pandas as pd

# Each file is parsed with a tab separator into a single named column, so the
# commas that occur inside sentences are not treated as field delimiters
# (presumably the files hold one sentence per line -- confirm against the raw data).
df_english = pd.read_csv(
    "/kaggle/input/english-to-french/small_vocab_en.csv",
    sep='\t',
    names=['english'],
)
df_french = pd.read_csv(
    "/kaggle/input/english-to-french/small_vocab_fr.csv",
    sep='\t',
    names=['french'],
)

# Put the two single-column frames side by side, aligned on the row index.
df = pd.concat([df_english, df_french], axis="columns")
df.head(n=2)

Unnamed: 0,english,french
0,"new jersey is sometimes quiet during autumn , ...",new jersey est parfois calme pendant l' automn...
1,the united states is usually chilly during jul...,les états-unis est généralement froid en juill...


# Data pre-processing/cleaning

In [3]:
import string
import re

# Character class matching any single character from string.punctuation.
# Kept as a plain (uncompiled) string so it can be handed both to `re`
# and to pandas' `.str.replace(..., regex=True)`.
punctuation_pattern = "[" + re.escape(string.punctuation) + "]"

# Compiled pattern matching any character that is NOT in string.printable.
printable_pattern = re.compile("[^" + re.escape(string.printable) + "]")


def clean_sentences(sentence):
    """Return ``sentence`` as text without non-printable characters or punctuation.

    The argument is passed through ``str()`` first, so non-string values are
    accepted. Characters outside ``string.printable`` are removed, then every
    character in ``string.punctuation``. Letter case and whitespace are left
    exactly as they were.
    """
    printable_only = printable_pattern.sub('', str(sentence))
    return re.sub(punctuation_pattern, '', printable_only)

In [12]:
# Normalise both columns in the same order: cast to text, drop punctuation,
# lower-case, then trim surrounding whitespace.
# Only the English column is additionally passed through `printable_pattern`;
# the French column is not, so characters outside string.printable
# (e.g. accented letters) are kept there.
df['english'] = (
    df['english']
    .astype(str)
    .str.replace(punctuation_pattern, '', regex=True)
    .str.lower()
    .apply(lambda text: printable_pattern.sub('', text))
    .str.strip()
)
df['french'] = (
    df['french']
    .astype(str)
    .str.replace(punctuation_pattern, '', regex=True)
    .str.lower()
    .str.strip()
)

In [4]:
!pip install transformers



In [6]:
from transformers import MarianTokenizer, TFMarianMTModel

# Hugging Face checkpoint used for both steps of the pipeline:
#   - tokenizing English text into model inputs
#   - translating English to French
model_name = "Helsinki-NLP/opus-mt-en-fr"

# The tokenizer and the model are loaded from the same checkpoint name.
model = TFMarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-fr.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [7]:
"""
    takes sentence, clean it, tokenize it, pad it
    then converted to french (probability vectors)
    returns french word (decoded)
"""

def translate_english_to_french(input_seq):
    """Translate a batch of English sentences to French.

    Each sentence is cleaned with ``clean_sentences``, the batch is tokenized
    with padding, ``model.generate`` produces the output sequences, and the
    tokenizer decodes them back to text with special tokens removed.

    Parameters
    ----------
    input_seq : str or iterable
        The sentences to translate. A single ``str`` is treated as a batch of
        one sentence.

    Returns
    -------
    list
        The decoded sentences returned by ``tokenizer.batch_decode``; an empty
        list when ``input_seq`` contains no sentences.
    """
    # A bare string is itself iterable: without this guard it would be split
    # into characters and every character translated as its own "sentence".
    if isinstance(input_seq, str):
        input_seq = [input_seq]

    seq = [clean_sentences(sentence) for sentence in input_seq]

    # Nothing to translate: return early rather than handing the tokenizer and
    # the model an empty batch. The check is done on the list (not on
    # `input_seq`) because the truth value of a pandas Series is ambiguous.
    if not seq:
        return []

    inputs = tokenizer(seq, return_tensors="tf", padding=True)
    translated = model.generate(**inputs)

    return tokenizer.batch_decode(translated, skip_special_tokens=True)

In [8]:
# Quick sanity check on one hand-written sentence. The translator works on a
# batch, so the sentence is wrapped in a one-element list.
english_sentence = "california is usually quiet during march  and it is usually hot in june"
translation = translate_english_to_french(input_seq=[english_sentence])

print(
    f"english (given)     : {english_sentence}",
    f"french (prediction) : {translation}",
    sep="\n",
)

english (given)     : california is usually quiet during march  and it is usually hot in june
french (prediction) : ['californie est généralement calme pendant la marche et il est habituellement chaud en juin']


# BLEU score

In [9]:
!pip install sacrebleu



In [15]:
# Draw a reproducible random sample of sentence pairs to evaluate on.
import numpy as np

SAMPLE_SIZE = 100   # number of sentence pairs to score
SAMPLE_SEED = 42    # fixed seed so a re-run evaluates the same rows

# A seeded Generator makes the sample (and therefore the score computed from
# it) repeatable; replace=False prevents the same row from being drawn twice.
# The size is capped at len(df) because sampling without replacement cannot
# return more rows than exist.
rng = np.random.default_rng(SAMPLE_SEED)
sentences_idx = rng.choice(df.index, size=min(SAMPLE_SIZE, len(df)), replace=False)

# Explicit label-based lookup of the sampled rows.
actual_en = df.loc[sentences_idx, 'english']
actual_fr = df.loc[sentences_idx, 'french']
predicted_fr = translate_english_to_french(actual_en)

In [16]:
import sacrebleu

# corpus_bleu receives the predictions plus a list of reference streams;
# a single stream is supplied here (the sampled French column as a list).
# NOTE(review): `actual_fr` comes from the cleaned French column (punctuation
# removed, lower-cased) while `predicted_fr` is raw decoder output -- confirm
# both sides are normalised the same way before trusting the score.
reference_streams = [actual_fr.to_list()]
bleu = sacrebleu.corpus_bleu(predicted_fr, reference_streams)
print(f"belu score: {bleu.score:.2f}")

belu score: 46.51
