# Import libraries

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import tensorflow as tf
import json

# Dataset

In [None]:
from datasets import Dataset

In [1]:
root = "/kaggle/input/"   # root path of dataset
def read_data(root, folder):
    """Load a parallel English/Vietnamese corpus from ``root/folder``.

    Every regular file in the folder is read line by line. Lines from files
    whose extension is ``.en`` go to the ``'en'`` list; lines from every other
    file go to the ``'vi'`` list. Each line is stripped of surrounding
    whitespace.

    Args:
        root: Base directory of the dataset.
        folder: Sub-directory of ``root`` that holds the corpus files.

    Returns:
        dict with the keys ``'en'`` and ``'vi'``, each mapping to a list of
        sentences.
    """
    data = {'en': [], 'vi': []}
    path = os.path.join(root, folder)
    # os.listdir order is arbitrary; sorting keeps the sentence order
    # reproducible (and aligned when several file pairs share a stem).
    for file_name in sorted(os.listdir(path)):
        file_path = os.path.join(path, file_name)
        if not os.path.isfile(file_path):
            continue
        # Only the extension of the file name decides the language, so dots
        # elsewhere in the path can no longer break the parsing.
        tail = os.path.splitext(file_name)[1].lstrip('.')
        language = 'en' if tail == 'en' else 'vi'
        # Explicit UTF-8: the platform default encoding may not decode
        # Vietnamese text.
        with open(file_path, 'r', encoding='utf-8') as f:
            data[language].extend(line.strip() for line in f)

    return data

test_data = read_data(root, 'datatest')

In [2]:
test_dataset = Dataset.from_dict(test_data)

In [3]:
test_dataset

Dataset({
    features: ['en', 'vi'],
    num_rows: 100
})

# Get predictions

## EncoderDecoder model

In [None]:
from transformers import AutoTokenizer, EncoderDecoderModel

In [None]:
def load_model_Bert_BARTPho(model_path):
    """Load both tokenizers and the encoder-decoder weights.

    Args:
        model_path: Location passed to ``EncoderDecoderModel.from_pretrained``.

    Returns:
        Tuple ``(source_tokenizer, target_tokenizer, model)``. The source
        tokenizer comes from ``bert-base-uncased`` and the target tokenizer
        from ``vinai/bartpho-word``.
    """
    source_checkpoint, target_checkpoint = "bert-base-uncased", "vinai/bartpho-word"

    source_tokenizer = AutoTokenizer.from_pretrained(source_checkpoint)
    target_tokenizer = AutoTokenizer.from_pretrained(target_checkpoint)
    # The fine-tuned weights are loaded last, after both tokenizers.
    return (
        source_tokenizer,
        target_tokenizer,
        EncoderDecoderModel.from_pretrained(model_path),
    )

In [None]:
def get_predict_EncoderDecoder(model_path, test_data, max_source_length=None):
    """Translate every test item with the BERT -> BARTpho encoder-decoder.

    Args:
        model_path: Location of the fine-tuned ``EncoderDecoderModel``.
        test_data: Iterable of items exposing an ``"en"`` source sentence and
            a ``"vi"`` reference sentence.
        max_source_length: Optional cap on the number of source tokens. With
            ``None`` the tokenizer falls back to its own maximum length.

    Returns:
        Tuple ``(texts, predictions, references)`` of three parallel lists.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder, decoder, model = load_model_Bert_BARTPho(model_path)
    model = model.to(device)

    texts = []
    predictions = []
    references = []

    for item in test_data:
        source = item["en"]
        target = item["vi"]

        # The source used to be truncated to the word count of the reference,
        # which leaks the reference into inference and breaks on an empty
        # reference. Truncation now depends on the source side only.
        inputs = encoder(source,
                         padding=True,
                         truncation=True,
                         max_length=max_source_length,
                         return_tensors="pt")
        inputs = {key: value.to(model.device) for key, value in inputs.items()}
        # Generate translation; the attention mask is passed explicitly so
        # generate() does not have to infer it.
        outputs = model.generate(inputs["input_ids"],
                                 attention_mask=inputs["attention_mask"],
                                 max_length=64,
                                 num_beams=4)
        prediction = decoder.decode(outputs[0], skip_special_tokens=True)

        texts.append(source)
        predictions.append(prediction)
        references.append(target)

    return texts, predictions, references

## T5

In [None]:
import sentencepiece as spm

In [None]:
def load_model_T5(model_path, vi_tokenizer_path='/kaggle/working/models--Sag1012--machine-translation/snapshots/164bfec8e7d09d77ab222a6055293e66934994ca/T5/vi_tokenizer_32128.model'):
    """Load the T5 source tokenizer, the Vietnamese SentencePiece model and the weights.

    Args:
        model_path: Location passed to
            ``T5ForConditionalGeneration.from_pretrained``.
        vi_tokenizer_path: Path of the SentencePiece ``.model`` file used to
            decode the generated ids. Defaults to the snapshot path that was
            previously hard-coded.

    Returns:
        Tuple ``(encoder_tokenizer, decoder_tokenizer, model)``.
    """
    # Imported here because no other cell of the notebook imports these two
    # names; without the import the function raises NameError.
    from transformers import T5ForConditionalGeneration, T5Tokenizer

    encoder = T5Tokenizer.from_pretrained("t5-small")
    decoder = spm.SentencePieceProcessor(model_file=vi_tokenizer_path)
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    return encoder, decoder, model

In [None]:
def get_predict_T5(model_path, test_data, max_source_length=None):
    """Translate every test item with the fine-tuned T5 model.

    Args:
        model_path: Location of the fine-tuned T5 weights.
        test_data: Iterable of items exposing an ``"en"`` source sentence and
            a ``"vi"`` reference sentence.
        max_source_length: Optional cap on the number of source tokens. With
            ``None`` the tokenizer falls back to its own maximum length.

    Returns:
        Tuple ``(texts, predictions, references)`` of three parallel lists.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder, decoder, model = load_model_T5(model_path)
    model = model.to(device)

    texts = []
    predictions = []
    references = []

    for item in test_data:
        source = item["en"]
        target = item["vi"]

        # Truncation must not depend on the reference translation: using its
        # word count leaked the reference into inference and failed on an
        # empty reference.
        inputs = encoder(source,
                         padding=True,
                         truncation=True,
                         max_length=max_source_length,
                         return_tensors="pt")
        inputs = {key: value.to(model.device) for key, value in inputs.items()}
        outputs = model.generate(inputs["input_ids"],
                                 attention_mask=inputs["attention_mask"],
                                 max_length=64,
                                 num_beams=4)
        # The ids are decoded with the separate Vietnamese SentencePiece model.
        prediction = decoder.decode(outputs[0].tolist())

        texts.append(source)
        predictions.append(prediction)
        references.append(target)

    return texts, predictions, references

## BiLSTM

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

In [None]:
def get_predict_BiLSTM(model_path,test_data):
    """Translate every test item with the saved Keras BiLSTM seq2seq model.

    The trained model is loaded from ``model_path`` and split into an encoder
    inference model and a single-step decoder inference model that reuse the
    trained layers (looked up by layer name). The English and Vietnamese
    tokenizers are unpickled from the directory that contains ``model_path``.

    Args:
        model_path: Path of the saved Keras model file.
        test_data: Iterable of items exposing ``"en"`` and ``"vi"`` entries.

    Returns:
        Tuple ``(texts, predictions, references)`` of three parallel lists.
    """
    import os
    directory_path = os.path.dirname(model_path)
    texts = []
    predictions = []
    references = []
    model = load_model(model_path)
    # First model input is used as the encoder input.
    encoder_input = model.input[0]  
    encoder_output = model.get_layer("bidirectional").output[0]
    encoder_state_h = model.get_layer("state_h_concat").output
    encoder_state_c = model.get_layer("state_c_concat").output
    
    # Encoder inference model
    encoder_model = Model(encoder_input, [encoder_output, encoder_state_h, encoder_state_c])
    decoder_embedding = model.get_layer("decoder_embedding")
    decoder_lstm = model.get_layer("decoder_lstm")
    decoder_dense = model.get_layer("decoder_dense")
    units = 128  # LSTM units
    # Decoder inference inputs
    # State width is units * 2 -- presumably because the forward and backward
    # encoder states are concatenated; TODO confirm against the training code.
    decoder_state_input_h = Input(shape=(units * 2,), name="decoder_state_input_h")  
    decoder_state_input_c = Input(shape=(units * 2,), name="decoder_state_input_c")
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    # Reuse the embedding and LSTM layers
    decoder_input = Input(shape=(1,), name="decoder_input")  # Decoder input for one time step
    decoder_embedding_inf = decoder_embedding(decoder_input)
    decoder_lstm_inf = decoder_lstm(decoder_embedding_inf, initial_state=decoder_states_inputs)
    decoder_output_inf, state_h_inf, state_c_inf = decoder_lstm_inf
    
    decoder_states_inf = [state_h_inf, state_c_inf]
    
    # Dense layer for probabilities
    decoder_output_inf = decoder_dense(decoder_output_inf)
    
    # Decoder inference model
    decoder_model = Model(
        [decoder_input] + decoder_states_inputs,  # Inputs
        [decoder_output_inf] + decoder_states_inf)  # Outputs
    def preprocess_sentence(sentence, tokenizer, max_length):
        """Convert one sentence to a post-padded id sequence of ``max_length``."""
        sequence = tokenizer.texts_to_sequences([sentence])
        return pad_sequences(sequence, maxlen=max_length, padding='post')
    
    def decode_sequence(input_seq):
        """Greedy-decode one encoded source sequence into a space-joined string."""
        # Encode the input sequence to get initial states
    
        encoder_output, state_h, state_c = encoder_model.predict(input_seq)
    
        target_seq = np.zeros((1, 1))  # Shape: (batch_size, 1)
        # NOTE(review): the start token is looked up as "<SOS>" while the stop
        # check below compares against lowercase '<eos>' -- confirm both match
        # the tokenizer vocabulary.
        target_seq[0, 0] = vi_loaded_tokenizer.texts_to_sequences(["<SOS>"])[0][0]
    
        states = [state_h, state_c]
    
        decoded_sentence = []
        # 232 is the hard cap on decoding steps -- presumably the maximum
        # target length used in training; TODO confirm.
        for _ in range(232):
            output_tokens, h, c = decoder_model.predict([target_seq] + states)
            # Greedy choice: most probable token at the last time step.
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_token = vi_loaded_tokenizer.index_word.get(sampled_token_index, '<unk>')
            if sampled_token == '<eos>':
                break
    
            decoded_sentence.append(sampled_token)
            # Feed the sampled token and the new states back into the decoder.
            target_seq[0, 0] = sampled_token_index
            states = [h, c]
    
        return ' '.join(decoded_sentence)
    # NOTE(review): pickle.load can execute arbitrary code; only load tokenizer
    # files from a trusted source.
    # NOTE(review): an empty directory_path (model_path without a directory
    # part) makes these paths resolve to the filesystem root.
    with open(directory_path + '/english_tokenizer.pkl', 'rb') as file:
        eng_loaded_tokenizer = pickle.load(file)
    with open(directory_path +'/vietnamese_tokenizer.pkl', 'rb') as file:
        vi_loaded_tokenizer = pickle.load(file)
    

    for item in test_data:
        source = item["en"]
        target = item["vi"]
        
        input_sentence = source
        # 193 is the padded source length -- presumably the encoder input
        # length used in training; TODO confirm.
        input_sequence = preprocess_sentence(input_sentence, eng_loaded_tokenizer, 193)
        
        prediction = decode_sequence(input_sequence)
            
        texts.append(source)
        predictions.append(prediction)
        references.append(target)
        
    
    return texts, predictions, references   


## Bi-GRU with attention

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import json
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModel, PreTrainedTokenizerFast

In [None]:
# Device shared by every Bi-GRU module below: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Token id fed to the decoder as its first input (see AttnDecoderRNN.forward).
SOS_token = 0
# Number of decoding steps AttnDecoderRNN.forward runs for every sequence.
MAX_LENGTH = 50

In [None]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded token ids.

    ``dropout_p`` is accepted for signature compatibility but is not used by
    this class. All submodules are placed on the module-level ``device``.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        # Attribute names are part of the checkpoint's state-dict keys.
        self.embedding = nn.Embedding(input_size, hidden_size).to(device)
        self.gru = nn.GRU(
            hidden_size,
            hidden_size,
            batch_first=True,
            bidirectional=True,
        ).to(device)
        # Projects the concatenated forward/backward outputs back to hidden_size.
        self.hidden_transform = nn.Linear(hidden_size * 2, hidden_size).to(device)

    def forward(self, input):
        """Return ``(projected_outputs, final_hidden)`` for a batch of token ids."""
        token_ids = input.to(device)
        gru_outputs, final_hidden = self.gru(self.embedding(token_ids))
        gru_outputs = gru_outputs.to(device)
        final_hidden = final_hidden.to(device)
        # Only the per-step outputs are projected; the hidden state is
        # returned as produced by the GRU.
        return self.hidden_transform(gru_outputs), final_hidden

In [None]:
class CrossAttention(nn.Module):
    """Scaled dot-product attention of a query over a set of keys.

    The keys are projected twice: once as keys (``Ua``) and once as values
    (``Va``). Inputs must be 3-D tensors, as required by ``torch.bmm``.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Query, key and value projections; attribute names are part of the
        # checkpoint's state-dict keys.
        self.Wa = nn.Linear(hidden_size, hidden_size).to(device)
        self.Ua = nn.Linear(hidden_size, hidden_size).to(device)
        self.Va = nn.Linear(hidden_size, hidden_size).to(device)
        self.softmax = nn.Softmax(dim=-1).to(device)

    def forward(self, query, keys):
        """Return ``(context, attention_weights)`` for ``query`` attending to ``keys``."""
        projected_query = self.Wa(query).to(device)
        projected_keys = self.Ua(keys).to(device)
        projected_values = self.Va(keys).to(device)

        # Raw similarity of every query position with every key position,
        # scaled by the square root of the projection width.
        raw_scores = torch.bmm(projected_query, projected_keys.transpose(1, 2)).to(device)
        scale = torch.sqrt(
            torch.tensor(projected_keys.size(-1), dtype=torch.float32, device=device)
        )
        scaled_scores = raw_scores / scale

        # Normalise over the key axis, then mix the values accordingly.
        attention_weights = self.softmax(scaled_scores).to(device)
        context = torch.bmm(attention_weights, projected_values).to(device)

        return context, attention_weights


In [None]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    The decoder always runs ``MAX_LENGTH`` steps; there is no early stop on an
    end-of-sequence token. ``self.dropout`` is created but is not applied in
    ``forward_step``.
    """

    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size).to(device)
        self.attention = CrossAttention(hidden_size).to(device)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout_p), batch_first=True).to(device)
        self.out = nn.Linear(hidden_size, output_size).to(device)
        self.dropout = nn.Dropout(dropout_p).to(device)
        # Maps concatenated forward/backward encoder states to the decoder hidden size.
        self.hidden_transform = nn.Linear(hidden_size * 2, hidden_size).to(device)
        # Maps the concatenated [embedding, context] vector to the GRU input size.
        self.hidden_input_transform = nn.Linear(hidden_size * 2, hidden_size).to(device)
        self.hidden_size = hidden_size

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode ``MAX_LENGTH`` steps.

        Args:
            encoder_outputs: Encoder outputs; the batch size is read from dim 0.
            encoder_hidden: Final encoder hidden state, converted by
                ``transform_bidirectional_hidden``.
            target_tensor: Optional target ids. When given, step ``i`` feeds
                ``target_tensor[:, i]`` as the next input (teacher forcing), so
                it needs at least ``MAX_LENGTH`` columns.

        Returns:
            Tuple ``(log_probs, decoder_hidden, attentions)`` where the first
            and last items are the per-step results concatenated along dim 1.
        """
        encoder_outputs = encoder_outputs.to(device)
        encoder_hidden = encoder_hidden.to(device)
        if target_tensor is not None:
            target_tensor = target_tensor.to(device)

        batch_size = encoder_outputs.size(0)
        # Every sequence starts from the SOS token.
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token).to(device)
        decoder_hidden = self.transform_bidirectional_hidden(encoder_hidden)
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1).to(device)  # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach().to(device)  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1).to(device)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1).to(device)
        attentions = torch.cat(attentions, dim=1).to(device)

        return decoder_outputs, decoder_hidden, attentions

    def transform_bidirectional_hidden(self, encoder_hidden):
        """Merge forward/backward encoder states into one decoder hidden state.

        Even indices along dim 0 are taken as forward states and odd indices as
        backward states; both are concatenated on the last axis and projected
        by ``hidden_transform``.
        """
        # NOTE(review): assuming encoder_hidden is the h_n of a bidirectional
        # nn.GRU, dim 0 is the layer/direction axis and dim 1 the batch axis,
        # i.e. the slices below are (num_layers, batch, hidden_size) -- confirm.
        forward_states = encoder_hidden[0::2, :, :].to(device)  # Forward-direction states
        backward_states = encoder_hidden[1::2, :, :].to(device)  # Backward-direction states
        combined_hidden = torch.cat((forward_states, backward_states), dim=2).to(device)  # Last axis becomes hidden_size * 2
        combined_hidden = self.hidden_transform(combined_hidden).to(device)
        return combined_hidden

    def forward_step(self, input, hidden, encoder_outputs):
        """Run a single decoding step and return ``(logits, hidden, attn_weights)``."""
        encoder_outputs = encoder_outputs.to(device)

        embedded = self.embedding(input).to(device)
        # Swap the first two axes of the hidden state so it can act as the
        # attention query (torch.bmm inside CrossAttention needs batch first).
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        # NOTE(review): concatenating on dim 2 needs embedded and context to
        # agree on dim 1, which looks like it only holds for n_layers == 1 -- confirm.
        input_gru = torch.cat((embedded, context), dim=2)
        input_gru = self.hidden_input_transform(input_gru)
        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights


In [None]:
class Translator(nn.Module):
    """Wraps an encoder and a decoder into a single translation module.

    Args:
        encoder: Module called as ``encoder(input_tensor)`` and returning
            ``(encoder_outputs, encoder_hidden)``.
        decoder: Module called as
            ``decoder(encoder_outputs, encoder_hidden, target_tensor)`` and
            returning a 3-tuple whose first item is the decoder output.
        device: Device the optional target tensor is moved to.
    """

    def __init__(self, encoder, decoder, device):
        super(Translator, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, input_tensor, target_tensor=None):
        """Return the decoder outputs for ``input_tensor``.

        ``target_tensor`` is optional and is forwarded to the decoder after
        being moved to ``self.device``.
        """
        if target_tensor is not None:
            target_tensor = target_tensor.to(self.device)

        encoder_outputs, encoder_hidden = self.encoder(input_tensor)
        decoder_outputs, _, _ = self.decoder(encoder_outputs, encoder_hidden, target_tensor)
        return decoder_outputs

    def eval(self):
        """Switch the wrapper and both sub-modules to evaluation mode.

        Delegates to ``nn.Module.eval`` so that ``self.training`` is updated
        and ``self`` is returned. The previous override returned ``None`` and
        left the wrapper's own ``training`` flag untouched.
        """
        return super().eval()


In [None]:
def load_model_GRU(model, path):
    """Load a saved state dict into ``model`` and switch it to eval mode.

    Args:
        model: Module whose parameters match the checkpoint.
        path: Path of a file written with ``torch.save(model.state_dict(), ...)``.

    Returns:
        The same ``model`` instance, for convenience.
    """
    # map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only
    # host; load_state_dict then copies the values onto the device the
    # model's parameters already live on.
    state_dict = torch.load(path, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()
    return model

In [None]:
def clean_decoded_sentence(sentence):
    """Strip tokenizer marker tokens from a decoded sentence.

    Each of ``<s>``, ``</s>``, ``<pad>`` and ``<unk>`` is removed in turn, and
    the text is trimmed of leading/trailing whitespace after every removal.
    Whitespace left in the middle of the sentence is not collapsed.
    """
    cleaned = sentence
    for marker in ("<s>", "</s>", "<pad>", "<unk>"):
        without_marker = cleaned.replace(marker, "")
        cleaned = without_marker.strip()
    return cleaned

In [None]:
def translate_GRU(input_text, encoder_pth=None, decoder_pth=None):
    """Translate one English sentence with the Bi-GRU attention model.

    Args:
        input_text: English source sentence.
        encoder_pth: Optional path of the encoder state dict. Defaults to the
            snapshot path that was previously hard-coded.
        decoder_pth: Optional path of the decoder state dict. Defaults to the
            snapshot path that was previously hard-coded.

    Returns:
        The decoded Vietnamese sentence with marker tokens removed.

    Note:
        The model and both tokenizers are re-loaded on every call; use
        ``get_predict_GRU`` to translate many sentences.
    """
    default_dir = "/kaggle/working/model/models--Sag1012--machine-translation/snapshots/c5f85377fb64307c86c935bae0edf64b764d8db8/GRU_with_attention ver3"
    if encoder_pth is None:
        encoder_pth = default_dir + "/encoder.pth"
    if decoder_pth is None:
        decoder_pth = default_dir + "/decoder.pth"

    # Sizes must match the checkpoints being loaded.
    VOCAB_SIZE = 64000
    hidden_size = 256
    encoder = EncoderRNN(VOCAB_SIZE, hidden_size)
    decoder = AttnDecoderRNN(hidden_size, VOCAB_SIZE)
    load_model_GRU(encoder, encoder_pth)
    load_model_GRU(decoder, decoder_pth)
    translator = Translator(encoder, decoder, device)

    english_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    vietnamese_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

    english_tokens = english_tokenizer.encode(input_text)
    english_tensor = torch.tensor(english_tokens).unsqueeze(0).to(device)
    # (The leftover debug print of the device was removed.)
    with torch.no_grad():
        output_tensor = translator(english_tensor)
    # Greedy choice per decoding step.
    predicted_token_ids = torch.argmax(output_tensor, dim=-1).squeeze(0).tolist()
    vietnamese_sentence = vietnamese_tokenizer.decode(predicted_token_ids)
    vietnamese_sentence_cleaned = clean_decoded_sentence(vietnamese_sentence)

    return vietnamese_sentence_cleaned
    

In [None]:
def get_predict_GRU(model_path,test_data):
    """Translate every test item with the Bi-GRU attention model.

    Args:
        model_path: Any path inside the checkpoint directory; only its
            directory part is used, and ``encoder.pth`` / ``decoder.pth`` are
            read from that directory.
        test_data: Iterable of items exposing ``"en"`` and ``"vi"`` entries.

    Returns:
        Tuple ``(texts, predictions, references)`` of three parallel lists.
    """
    import os
    directory_path = os.path.dirname(model_path)
    # os.path.join keeps the paths relative when model_path has no directory
    # part; plain string concatenation produced "/encoder.pth" at the
    # filesystem root in that case.
    encoder_pth = os.path.join(directory_path, "encoder.pth")
    decoder_pth = os.path.join(directory_path, "decoder.pth")

    texts = []
    predictions = []
    references = []

    # Sizes must match the checkpoints being loaded.
    VOCAB_SIZE = 64000
    hidden_size = 256
    encoder = EncoderRNN(VOCAB_SIZE, hidden_size)
    decoder = AttnDecoderRNN(hidden_size, VOCAB_SIZE)
    load_model_GRU(encoder, encoder_pth)
    load_model_GRU(decoder, decoder_pth)
    translator = Translator(encoder,decoder,device)
    english_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    vietnamese_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

    for item in test_data:
        source = item["en"]
        target = item["vi"]

        english_tokens = english_tokenizer.encode(source)
        english_tensor = torch.tensor(english_tokens).unsqueeze(0).to(device)
        with torch.no_grad():
            output_tensor = translator(english_tensor)
        # Greedy choice per decoding step.
        predicted_token_ids = torch.argmax(output_tensor, dim=-1).squeeze(0).tolist()
        vietnamese_sentence = vietnamese_tokenizer.decode(predicted_token_ids)
        vietnamese_sentence_cleaned = clean_decoded_sentence(vietnamese_sentence)

        texts.append(source)
        predictions.append(vietnamese_sentence_cleaned)
        references.append(target)
    return texts, predictions, references

## Marian MT

In [None]:
!pip install sacremoses

In [18]:
from transformers import MarianMTModel, MarianTokenizer

def load_model_MarianMT(model_path):
    """Load the MarianMT tokenizer and the fine-tuned weights.

    The tokenizer always comes from the ``Helsinki-NLP/opus-mt-en-vi``
    checkpoint; only the model weights are read from ``model_path``.

    Returns:
        Tuple ``(tokenizer, model)``.
    """
    base_checkpoint = 'Helsinki-NLP/opus-mt-en-vi'
    marian_tokenizer = MarianTokenizer.from_pretrained(base_checkpoint)
    marian_model = MarianMTModel.from_pretrained(model_path)
    return marian_tokenizer, marian_model

In [19]:
def get_predict_MarianMT(model_path, test_data, max_source_length=None):
    """Translate every test item with the fine-tuned MarianMT model.

    Args:
        model_path: Location of the fine-tuned MarianMT weights.
        test_data: Iterable of items exposing ``"en"`` and ``"vi"`` entries.
        max_source_length: Optional cap on the number of source tokens. With
            ``None`` the model config's ``max_position_embeddings`` is used
            when it is available.

    Returns:
        Tuple ``(texts, predictions, references)`` of three parallel lists.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, model = load_model_MarianMT(model_path)
    model = model.to(device)

    # truncation=True is a no-op without a maximum length (the tokenizer warns
    # "Default to no truncation"), so take the limit from the model config.
    if max_source_length is None:
        max_source_length = getattr(model.config, "max_position_embeddings", None)

    texts = []
    predictions = []
    references = []

    for item in test_data:
        source = item["en"]
        target = item["vi"]

        # Tokenize input
        inputs = tokenizer(source,
                           return_tensors="pt",
                           padding=True,
                           truncation=True,
                           max_length=max_source_length)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Generate translation
        outputs = model.generate(**inputs, max_length=64, num_beams=4)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

        texts.append(source)
        predictions.append(prediction)
        references.append(target)

    return texts, predictions, references

## LSTM with Attention

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model

from transformers import AutoTokenizer
from tokenizers import Tokenizer

In [None]:
# Module-level tokenizers for the LSTM-with-attention section:
# English source side (BERT) and Vietnamese target side (PhoBERT).
tokenizer_en = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer_vi = AutoTokenizer.from_pretrained("vinai/phobert-base") 

In [None]:
def greedy_decode(input_sequence, model, tokenizer_target, max_length=50):
    """Greedy-decode one source sequence with a two-input seq2seq model.

    Args:
        input_sequence: List of source token ids.
        model: Object exposing ``predict([encoder_input, decoder_input],
            verbose=0)`` that returns an array indexed as
            ``[batch, step, vocab]``.
        tokenizer_target: Target-side tokenizer providing ``cls_token_id``
            (start token), ``sep_token_id`` (end token) and ``decode``.
        max_length: Maximum number of tokens to generate.

    Returns:
        The decoded target sentence.
    """
    encoder_input = np.array([input_sequence], dtype=np.int64)

    # Start/end ids come from the tokenizer passed in, not from the global
    # ``tokenizer_vi`` the function used to read regardless of its argument.
    start_token = tokenizer_target.cls_token_id
    end_token = tokenizer_target.sep_token_id

    target_sequence = [start_token]

    for _ in range(max_length):
        decoder_input = np.array([target_sequence], dtype=np.int64)
        predictions = model.predict([encoder_input, decoder_input], verbose=0)
        # Most probable token at the last decoded position; cast to a plain
        # int so the tokenizer receives Python integers.
        next_token = int(np.argmax(predictions[0, -1, :]))
        target_sequence.append(next_token)

        if next_token == end_token:
            break

    # Drop the start token; remaining special tokens are skipped by decode().
    translated_sentence = tokenizer_target.decode(target_sequence[1:], skip_special_tokens=True)
    return translated_sentence

In [1]:
def get_predict_lstm_attention(model_path, test_data):
    """Translate every test item with the saved LSTM-with-attention model.

    Args:
        model_path: Path of the saved Keras model.
        test_data: Iterable of items exposing ``"en"`` and ``"vi"`` entries.

    Returns:
        Tuple ``(texts, predictions, references)`` of three parallel lists.
    """
    model = load_model(model_path)
    # Local tokenizers shadow the module-level ones of the same name.
    tokenizer_en = AutoTokenizer.from_pretrained("bert-base-uncased")
    tokenizer_vi = AutoTokenizer.from_pretrained("vinai/phobert-base")

    texts, predictions, references = [], [], []

    for item in test_data:
        source, target = item["en"], item["vi"]
        source_ids = tokenizer_en.encode(source, add_special_tokens=True)

        texts.append(source)
        predictions.append(greedy_decode(source_ids, model, tokenizer_vi))
        references.append(target)

    return texts, predictions, references

# Evaluation

## Cosine similarity

In [20]:
def cos_sim(predictions, references):
    """Cosine similarity between each prediction and its reference.

    Both sentences are embedded with ``vinai/bartpho-word`` by averaging the
    last hidden state over the token axis; the similarity of the two averaged
    vectors is returned as a Python float.

    Args:
        predictions: Sequence of predicted sentences.
        references: Sequence of reference sentences, paired by position.

    Returns:
        List of floats, one per (prediction, reference) pair.
    """
    from torch.nn.functional import cosine_similarity as torch_cosine_similarity
    from transformers import  AutoTokenizer, AutoModel

    checkpoint = "vinai/bartpho-word"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    embedder = AutoModel.from_pretrained(checkpoint)

    # Run on the GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embedder = embedder.to(device)

    def sentence_embedding(text):
        """Mean-pooled last hidden state of one sentence."""
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
        # token_type_ids is dropped when the tokenizer happens to produce it.
        encoded.pop("token_type_ids", None)
        with torch.no_grad():
            return embedder(**encoded).last_hidden_state.mean(dim=1)

    return [
        torch_cosine_similarity(sentence_embedding(pred), sentence_embedding(ref)).item()
        for pred, ref in zip(predictions, references)
    ]


## BLEU scores

In [21]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_bleu_scores(predictions, references, weights = (1,0,0,0)):
    """Sentence-level BLEU of every prediction against its single reference.

    Sentences are tokenised by whitespace and scored with NLTK's
    ``sentence_bleu`` using smoothing method 1.

    Args:
        predictions: Sequence of predicted sentences.
        references: Sequence of reference sentences, paired by position.
        weights: n-gram weights forwarded to ``sentence_bleu``.

    Returns:
        List of BLEU scores, one per pair.
    """
    smoother = SmoothingFunction().method1
    return [
        sentence_bleu(
            [reference.split()],
            prediction.split(),
            weights=weights,
            smoothing_function=smoother,
        )
        for prediction, reference in zip(predictions, references)
    ]

## Evaluate model

In [23]:
def evaluate_model(model_name, model_path, test_data):
    """Run one model over the test set and collect per-sentence metrics.

    Args:
        model_name: Suffix of the module-level ``get_predict_<model_name>``
            function to call (looked up in ``globals()``; an unknown name
            raises ``KeyError``).
        model_path: Forwarded to the prediction function.
        test_data: Forwarded to the prediction function.

    Returns:
        ``pandas.DataFrame`` with the source texts, predictions, references,
        four BLEU columns and the cosine-similarity column.
    """
    predict_fn = globals()[f"get_predict_{model_name}"]
    texts, predictions, references = predict_fn(model_path, test_data)
    cosine_scores = cos_sim(predictions, references)

    # NOTE(review): the BLEU_3 weights are not the usual uniform
    # (1/3, 1/3, 1/3, 0) -- confirm this is intended.
    bleu_weights = {
        'BLEU_1': (1,0,0,0),
        'BLEU_2': (0.5,0.5,0,0),
        'BLEU_3': (0.5,0.25,0.25,0),
        'BLEU_4': (0.25,0.25,0.25,0.25),
    }

    report = {
        'texts': texts,
        'predictions': predictions,
        'references': references,
    }
    for column, weights in bleu_weights.items():
        report[column] = calculate_bleu_scores(predictions, references, weights = weights)
    report["cosin similarity"] = cosine_scores

    return pd.DataFrame(report)

In [24]:
# Suffix of the get_predict_<model_name> function that evaluate_model will call.
model_name = 'MarianMT'  # Model name of model
# Checkpoint location forwarded to that prediction function.
model_path= '/kaggle/working/model/models--Sag1012--machine-translation/snapshots/dff7854613e72ee87d975bc13c01813f05dd3dc5/MarianMT_ver2'   # Model Path of model

# Per-sentence predictions and metrics for the chosen model.
df = evaluate_model(model_name, model_path, test_dataset)

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [25]:
df

Unnamed: 0,texts,predictions,references,BLEU_1,BLEU_2,BLEU_3,BLEU_4,cosin similarity
0,Murillo has served as the Nicaraguan governmen...,Murillo đã từng là người phát ngôn chính của c...,Murillo đã từng là phát ngôn viên chính của ch...,0.817283,0.716367,0.668703,0.556204,0.969939
1,Work : People may regret not following a diffe...,Công việc : Người ta có thể hối tiếc vì không ...,Công việc : Con người có thể cảm thấy hối tiếc...,0.641245,0.494335,0.436449,0.297307,0.921119
2,M- my wife barely lets me see her naked .,Vợ tôi hầu như không cho tôi thấy cô ấy trần t...,Vợ ... vợ tôi còn hiếm khi để tôi nhìn cô ấy k...,0.395725,0.168151,0.096638,0.037076,0.717559
3,"It 's full of panic and fear , and I 'd heard ...","Nó đầy hoảng loạn và sợ hãi , và tôi nghe thấy...",Nó chứa đầy sợ hãi và hoảng loạn và tôi nghe t...,0.680000,0.504975,0.387803,0.110886,0.898981
4,This will make it much easier for you to expos...,Điều này sẽ giúp bạn dễ dàng vạch trần quả bón...,Bước này sẽ giúp bạn dễ dàng làm lộ rễ cây để ...,0.545455,0.412861,0.366260,0.257534,0.841695
...,...,...,...,...,...,...,...,...
95,You can click on any text box and start typing...,Bạn có thể nhấp vào hộp văn bản và bắt đầu gõ ...,Bạn có thể nhấp chuột vào bất kỳ hộp thoại nào...,0.619198,0.433439,0.403613,0.280517,0.850169
96,"Yeah , they 're rocket scientists , remember ?","Phải , họ là những nhà khoa học tên lửa , nhớ ...","Ừ , họ là các nhà khoa học tên lửa , nhớ không ?",0.857143,0.811998,0.783462,0.699752,0.915851
97,Fidgeting is a sign that you lack confidence .,Tán tỉnh là dấu hiệu cho thấy bạn thiếu tự tin .,Đó là dấu hiệu cho thấy sự thiếu tự tin của bạn .,0.766704,0.620294,0.574014,0.412491,0.807538
98,Agreeing with the angry person might help diff...,Đồng ý với người có tính nóng giận có thể giúp...,Việc bày tỏ sự đồng tình với người đang tức gi...,0.378733,0.259808,0.221808,0.134033,0.766936


In [26]:
# Print average scores of BLEU 1

df['BLEU_1'].sum()/len(df)

0.6070899981850879

In [27]:
# Print average scores of BLEU 2

df['BLEU_2'].sum()/len(df)

0.4849884020116053

In [28]:
# Print average scores of BLEU 3

df['BLEU_3'].sum()/len(df)

0.43604474040861013

In [29]:
# Print average scores of BLEU 4

df['BLEU_4'].sum()/len(df)

0.33090962475722635

In [30]:
# Print average scores of cosin similarity

df['cosin similarity'].sum()/len(df)

0.8441271716356278