In [None]:
!pip install transformers

# Translation Models


Machine translation is a pivotal field within natural language processing (NLP) that focuses on automating the conversion of text or speech from one language to another. It relies on sophisticated models and techniques to accomplish this challenging task effectively. One of the cornerstone methods in machine translation is the sequence-to-sequence (seq2seq) model, which employs deep neural networks to encode input text and then decode it into the target language. This technique has revolutionized translation tasks by learning to capture complex linguistic nuances and contextual information. Additionally, Transformer-based models, including the well-known BERT and GPT-3, have also made significant strides in translation, leveraging attention mechanisms to excel across various language pairs and domains. The choice of model depends on the specific translation requirements, the language pairs, and the quality of the available training data. In this Colab notebook, we have given a basic demo of how to use the dataset and build a simple seq2seq model using an RNN. Your task is to improve the model as much as you can, make predictions on the given test dataset, and write code to compute the BLEU score of your predictions against the original translations.






# 1. Importing Libraries

In [None]:
import pandas as pd
import numpy as np
#from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional,LSTM, Dropout,Attention
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.callbacks import ModelCheckpoint
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
import re
import string

# 2. Loading and pre-processing the data

### A) Loading the dataset

In [None]:
# Read the train and test CSV splits from the Colab upload directory.
train_csv, test_csv = "/content/nlp_intel_train.csv", "/content/nlp_intel_test.csv"
eng_fr, eng_fr_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

### B) Dropping NaN texts

In [None]:
# Discard every row that has a missing value in any column
# (row-wise, how="any", all columns considered -- the pandas defaults).
eng_fr = eng_fr.dropna()
eng_fr_test = eng_fr_test.dropna()

### C) Removing non-alphabetic characters

In [None]:
# Normalisation passes, applied in this order to every sentence of both columns.
exclude = set(string.punctuation)
text_passes = (
    lambda s: s.lower(),                                       # case-fold
    lambda s: re.sub("'", "", s),                              # drop apostrophes
    lambda s: ''.join(ch for ch in s if ch not in exclude),    # drop ASCII punctuation
    lambda s: re.sub("[1234567890]", "", s),                   # drop digits
    lambda s: s.strip(),                                       # trim both ends
    lambda s: re.sub(" +", " ", s),                            # squeeze runs of spaces
)

for lang_col in ("en", "fr"):
    for text_pass in text_passes:
        eng_fr[lang_col] = eng_fr[lang_col].apply(text_pass)

eng_fr

Unnamed: 0.1,Unnamed: 0,en,fr
0,1000,in he founded the astronomy club of rimouski i...,en il fonde le club dastronomie de rimouski au...
1,1001,the club was very active and they twice organi...,le club est très actif et organise à deux occa...
2,1002,in lemay initiated the first joint meeting of ...,en il est linstigateur à québec du congrès ann...
3,1003,the conference took place in quebec city and w...,le congrès est un franc succès et regroupe pas...
4,1004,from to he was the national president of the r...,de à il est président national de la société r...
...,...,...,...
17995,18995,imports of shrimp and prawn recorded also a sh...,en une forte baisse des importations japonaise...
17996,18996,the volume of import decreased by from mt in t...,en effet entre et le volume des importations a...
17997,18997,the market for northern shrimp pandalus boreal...,de plus le marché mondial des crevettes nordiq...
17998,18998,imports of molluscs almost of this being surf ...,entre et les importations de mollusques dont l...


### D) Making a list of all sentences

In [None]:
# Plain Python lists of the cleaned sentences: X is English, Y is French.
X, Y = (eng_fr[lang_col].tolist() for lang_col in ("en", "fr"))

### E) Functions for pre-processing the sentences

 - Function to make a corpus from the available sentences



In [None]:
def to_corpus(sent_list):
  """Concatenate sentences into one lower-cased corpus string.

  Args:
    sent_list: iterable of sentence strings.

  Returns:
    A single string in which every sentence is lower-cased and followed by
    one space (so a non-empty result ends with a space); "" for empty input.
  """
  # A single join avoids rebuilding the growing string once per sentence.
  return "".join(sentence.lower() + " " for sentence in sent_list)

- Function to train the tokenizer

In [None]:
def train_tokenizer(file_path):
  """Train a WordPiece tokenizer on a plain-text corpus file.

  Args:
    file_path: path of the text file to learn the vocabulary from.

  Returns:
    The trained tokenizer. It uses the `Whitespace` pre-tokenizer and a
    post-processor that wraps a single sentence as "[CLS] ... [SEP]".
  """
  # Special tokens receive ids in the order listed. "[PAD]" comes first so it
  # gets id 0 -- the value `pad_sequences` fills with by default and the id
  # that `logits_to_text` strips -- instead of id 0 colliding with "[UNK]".
  special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

  tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
  tokenizer.pre_tokenizer = Whitespace()
  trainer = WordPieceTrainer(special_tokens=special_tokens)
  tokenizer.train([file_path], trainer)

  # Ids are looked up after training, so the template follows the learned vocab.
  tokenizer.post_processor = TemplateProcessing(
      single="[CLS] $A [SEP]",
      pair="[CLS] $A [SEP] $B:1 [SEP]:1",
      special_tokens=[
          ("[CLS]", tokenizer.token_to_id("[CLS]")),
          ("[SEP]", tokenizer.token_to_id("[SEP]")),
      ],
  )
  return tokenizer

- Function to form sequence of integers for each sentence

In [None]:
def sequences(tokenizer,sent_list,max_len=55):
  """Encode sentences into fixed-length rows of token ids.

  Args:
    tokenizer: trained tokenizer whose `encode(text).ids` yields token ids.
    sent_list: iterable of sentence strings; each is lower-cased first.
    max_len: number of ids per row (default 55, the value that used to be
      hard-coded). Shorter rows are zero-padded at the end.

  Returns:
    The padded id matrix returned by `pad_sequences`, one row per sentence.
  """
  token_ids = [tokenizer.encode(sent.lower()).ids for sent in sent_list]
  # pad_sequences already returns a NumPy array, so no extra np.array copy.
  # NOTE(review): truncation is left at the Keras default, so sentences longer
  # than max_len lose their leading tokens -- confirm this is intended.
  return pad_sequences(token_ids, maxlen=max_len, padding='post')

- Function to form a dictionary of words

In [None]:
def vocabulary(path_vocab):
    """Load a saved tokenizer JSON file and return its id -> token lookup.

    Args:
        path_vocab: path to a JSON file holding a token -> id mapping under
            ``["model"]["vocab"]``.

    Returns:
        dict mapping each id to its token string.
    """
    import json

    # The context manager closes the handle; JSON is read as UTF-8 so accented
    # tokens decode the same way regardless of the platform default encoding.
    with open(path_vocab, encoding="utf-8") as vocab_file:
        token_to_id = json.load(vocab_file)["model"]["vocab"]
    return {token_id: token for token, token_id in token_to_id.items()}

### F) Making a corpus out of the sentences

In [None]:
# Write one corpus file per language. The `with` blocks flush and close each
# file, so the complete text is on disk before the tokenizer trainer reads it.
with open("x.txt", "w", encoding="utf-8") as english_corpus_file:
  english_corpus_file.write(to_corpus(X))

with open("y.txt", "w", encoding="utf-8") as french_corpus_file:
  french_corpus_file.write(to_corpus(Y))

2368969

### G) Training and saving the tokenizer with the corpus made

In [None]:
# Train one tokenizer per language, then save each so its vocabulary can be
# reloaded from JSON. The corpus paths are relative, matching where x.txt and
# y.txt are written, instead of assuming the working directory is /content.
tokenizer_eng = train_tokenizer("x.txt")
tokenizer_fr = train_tokenizer("y.txt")

tokenizer_eng.save("english_vocab.json", pretty=True)
tokenizer_fr.save("french_vocab.json", pretty=True)

### H) Forming a sequence for each sentence

In [None]:
# Fixed-length id matrices: English from X, French from Y.
prepoc_english_sentences = sequences(tokenizer_eng, X)
prepoc_french_sentences = sequences(tokenizer_fr, Y)

### I) Creating a dictionary of words and ids

In [None]:
# id -> token lookup tables rebuilt from the saved tokenizer files.
dict_eng = vocabulary("english_vocab.json")
dict_fr = vocabulary("french_vocab.json")

# Vocabulary sizes reported by each trained tokenizer.
vocabulary_size_english, vocabulary_size_french = (
    tok.get_vocab_size() for tok in (tokenizer_eng, tokenizer_fr)
)


# 3. Building and Training the seq2seq model

### A) Building the model

In [None]:
def build_model(vocab_size_fr,vocab_size_eng,learning_rate=0.003):
    """Build and compile the stacked bidirectional-GRU translation model.

    Every input position is embedded, passed through two bidirectional GRU
    stages (both with return_sequences=True) and mapped to a softmax over the
    output vocabulary.

    Args:
        vocab_size_fr: size of the input vocabulary (embedding rows).
        vocab_size_eng: size of the output vocabulary (softmax width).
        learning_rate: Adam learning rate; defaults to 0.003, the value that
            used to be hard-coded.

    Returns:
        The compiled Keras model (sparse categorical cross-entropy loss,
        accuracy metric).
    """
    model = Sequential([
        Embedding(vocab_size_fr, 256),
        Bidirectional(GRU(256, return_sequences=True)),   # Bidirectional LSTM would be better
        Dense(1024, activation='selu'),
        Dropout(0.5),                                     # Adding an attention layer will also optimise the model
        Bidirectional(GRU(512, return_sequences=True)),
        Dense(1024, activation='selu'),
        Dense(vocab_size_eng, activation='softmax'),
    ])
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

### B) Training the model

In [None]:
# Model input: the French id matrix, padded at the end to 55 columns.
tmp_x = pad_sequences(prepoc_french_sentences, maxlen=55, padding='post')

model = build_model(vocabulary_size_french, vocabulary_size_english)
model.summary()

# Fit French ids -> English ids, holding out 20% of the rows for validation.
model.fit(
    tmp_x,
    prepoc_english_sentences,
    batch_size=64,
    epochs=50,
    validation_split=0.2,
)

# 4. Predictions

### A) Function to Convert sequence to sentence

In [None]:
def logits_to_text(logits,dict_lan):
  """Turn per-position scores into a space-separated token string.

  Args:
    logits: 2-D array of scores, one row per position and one column per
      token id.
    dict_lan: mapping from token id to token string.

  Returns:
    The arg-max token of every position joined by spaces, with trailing
    id-0 positions removed; "" when every position is id 0.
  """
  pred = list(np.argmax(logits, 1))
  # Guard on `pred` so an all-zero prediction empties the list instead of
  # raising IndexError on pred[-1].
  while pred and pred[-1] == 0:
    pred.pop()
  return ' '.join(dict_lan[token_id] for token_id in pred)

### B) Prediction from training set

In [None]:
i = 1  # row of the training set to inspect

print("Prediction:")
# Score the selected French row, then decode the arg-max ids into English tokens.
sample_scores = model.predict(tmp_x[[i]])[0]
test_val = logits_to_text(sample_scores, dict_eng)
print(test_val)

actual_val = X[i]
print("\nCorrect Translation:", actual_val, sep="\n")
print("\nOriginal text:", Y[i], sep="\n")

### C) Testing set - preprocessing and prediction.

In [None]:
# Normalisation passes, applied in this order to every test sentence of both columns.
exclude = set(string.punctuation)
test_text_passes = (
    lambda s: s.lower(),                                       # case-fold
    lambda s: re.sub("'", "", s),                              # drop apostrophes
    lambda s: ''.join(ch for ch in s if ch not in exclude),    # drop ASCII punctuation
    lambda s: re.sub("[1234567890]", "", s),                   # drop digits
    lambda s: s.strip(),                                       # trim both ends
    lambda s: re.sub(" +", " ", s),                            # squeeze runs of spaces
)

for lang_col in ("en", "fr"):
    for text_pass in test_text_passes:
        eng_fr_test[lang_col] = eng_fr_test[lang_col].apply(text_pass)

eng_fr_test

In [None]:
# Plain Python lists of the cleaned test sentences: X_test is English, Y_test is French.
X_test, Y_test = (eng_fr_test[lang_col].tolist() for lang_col in ("en", "fr"))

In [None]:
# Encode the test sentences with the tokenizers trained on the training corpus.
test_english_sentences = sequences(tokenizer_eng, X_test)
test_french_sentences = sequences(tokenizer_fr, Y_test)

In [None]:
# Model input for the test split: French ids padded at the end to 55 columns.
tmp_test = pad_sequences(test_french_sentences, maxlen=55, padding='post')
i = 1998  # row of the test set to inspect

print("Prediction:")
# Score the selected French row, then decode the arg-max ids into English tokens.
sample_scores = model.predict(tmp_test[[i]])[0]
test_val = logits_to_text(sample_scores, dict_eng)
print(test_val)

actual_val = X_test[i]
print("\nCorrect Translation:", actual_val, sep="\n")
print("\nOriginal text:", Y_test[i], sep="\n")

# 5. Metrics

In [None]:
import nltk
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

# Sentence-level BLEU of the most recent prediction against its reference.
hypothesis = test_val.split()
reference = actual_val.split()

# Without smoothing, a single n-gram order with zero overlap (typically the
# 4-grams) collapses the score to ~0 and triggers an NLTK warning. method1
# replaces zero counts with a small epsilon so lower-order overlap still counts.
BLEUscore = sentence_bleu(
    [reference],
    hypothesis,
    smoothing_function=SmoothingFunction().method1,
)
print(BLEUscore)


4.1297213512319855e-78


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
