In [1]:
import tensorflow as tf
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.10.1


In [31]:
# Saved Keras model to load from ./models-100k-2k/.
file = "./models-100k-2k/modelrnn-2_layer-10.keras"
model_2_layer = tf.keras.models.load_model(file)

# Print the layer-by-layer summary of the loaded model.
model_2_layer.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 60, 64)            1999680   
                                                                 
 simple_rnn (SimpleRNN)      (None, 60, 64)            8256      
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 60, 128)           24704     
                                                                 
 dense (Dense)               (None, 60, 35276)         4550604   
                                                                 
Total params: 6,583,244
Trainable params: 6,583,244
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Saved Keras model to load from ./models-50k/.
file = "./models-50k/modelrnn-10.keras"
model_50k = tf.keras.models.load_model(file)

# Print the layer-by-layer summary of the loaded model.
model_50k.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 60, 64)            1358592   
                                                                 
 simple_rnn (SimpleRNN)      (None, 60, 128)           24704     
                                                                 
 dense (Dense)               (None, 60, 24066)         3104514   
                                                                 
Total params: 4,487,810
Trainable params: 4,487,810
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Saved Keras model to load from ./models-100k/.
file = "./models-100k/modelrnn-10.keras"
model_1_layer = tf.keras.models.load_model(file)

# Print the layer-by-layer summary of the loaded model.
model_1_layer.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 60, 64)            1999680   
                                                                 
 simple_rnn (SimpleRNN)      (None, 60, 128)           24704     
                                                                 
 dense (Dense)               (None, 60, 35276)         4550604   
                                                                 
Total params: 6,574,988
Trainable params: 6,574,988
Non-trainable params: 0
_________________________________________________________________


In [39]:
from tensorflow.keras.preprocessing.text import tokenizer_from_json

def _load_tokenizer(json_path):
    """Rebuild a Keras tokenizer from the JSON text stored at ``json_path``.

    The file is opened with an explicit UTF-8 encoding so the result does not
    depend on the platform's default locale encoding.

    Args:
        json_path: Path of the tokenizer JSON file to read.

    Returns:
        The object returned by ``tokenizer_from_json`` for the file's contents.
    """
    with open(json_path, encoding="utf-8") as f:
        return tokenizer_from_json(f.read())


# One tokenizer per language for each of the two tokenizer sets
# (the "_100k" files and the un-suffixed files bound to the "_50k" names).
en_tokenizer_100k = _load_tokenizer('english_tokenizer_100k.json')
id_tokenizer_100k = _load_tokenizer('indonesian_tokenizer_100k.json')
en_tokenizer_50k = _load_tokenizer('english_tokenizer.json')
id_tokenizer_50k = _load_tokenizer('indonesian_tokenizer.json')

# word_index holds one entry per known word, so its length is the vocabulary size.
print("English 100k vocabulary size:", len(en_tokenizer_100k.word_index))
print("Indonesian 100k vocabulary size:", len(id_tokenizer_100k.word_index))
print("English 50k vocabulary size:", len(en_tokenizer_50k.word_index))
print("Indonesian 50k vocabulary size:", len(id_tokenizer_50k.word_index))

English 100k vocabulary size: 31244
Indonesian 100k vocabulary size: 35275
English 50k vocabulary size: 21227
Indonesian 50k vocabulary size: 24065


In [24]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def encode_seq(tokenizer, maxlen, lines):
    """Turn one piece of text into a padded batch of token ids.

    Args:
        tokenizer: Object exposing ``texts_to_sequences``.
        maxlen: Length passed to ``pad_sequences`` as ``maxlen``.
        lines: The text to encode; it is wrapped in a one-element list, so the
            result holds a single row.

    Returns:
        The output of ``pad_sequences`` with padding added after the tokens
        (``padding='post'``).
    """
    token_ids = tokenizer.texts_to_sequences([lines])
    return pad_sequences(token_ids, maxlen=maxlen, padding='post')

In [25]:
import numpy as np

def decode_seq(tokenizer, prediction):
    """Convert a model prediction into a space-separated string of words.

    Only the first entry of ``prediction`` is decoded. For every position the
    highest-scoring index along the last axis is taken, index 0 is dropped,
    and the remaining indices are looked up in ``tokenizer.index_word``.
    Indices that have no entry in ``index_word`` are skipped so they cannot
    leave empty tokens (stray double, leading or trailing spaces) in the text.

    Args:
        tokenizer: Object exposing an ``index_word`` mapping from id to word.
        prediction: Array-like whose first entry holds per-position scores
            over the vocabulary on its last axis.

    Returns:
        The decoded words joined by single spaces; an empty string when no
        position yields a known, non-zero index.
    """
    token_ids = np.argmax(prediction[0], axis=-1)
    # int() turns NumPy integers into plain ints before the dictionary lookup.
    words = (tokenizer.index_word.get(int(idx)) for idx in token_ids if idx != 0)
    return ' '.join(word for word in words if word)

In [26]:
# Sample English sentence used to try out the encode -> predict -> decode path.
sentence = "can we play now?"

# Encode with the English 100k tokenizer, passing 60 as the padded length.
seq = encode_seq(en_tokenizer_100k, 60, sentence)

print(seq.shape)

(1, 60)


In [27]:
# Run the 2-layer model on the encoded sample sentence.
prediction = model_2_layer.predict(seq)

# Show the output shape and the first entry of the prediction.
print(prediction.shape)
print(prediction[0])

(1, 60, 35276)
[[1.7392549e-04 8.5469671e-03 2.5979618e-03 ... 9.0715639e-09
  9.9564206e-09 2.1686098e-08]
 [1.4632411e-03 2.0925878e-03 2.0414265e-03 ... 6.6285994e-09
  1.1571341e-09 3.0722617e-08]
 [6.8823554e-02 1.6513841e-03 1.8651620e-03 ... 5.1829102e-07
  1.6992825e-08 4.3987814e-08]
 ...
 [9.9944955e-01 1.1587454e-05 1.7401433e-05 ... 1.5480112e-10
  2.2717441e-11 1.7099500e-10]
 [9.9944955e-01 1.1585531e-05 1.7399823e-05 ... 1.5478606e-10
  2.2717916e-11 1.7103087e-10]
 [9.9944955e-01 1.1586339e-05 1.7404320e-05 ... 1.5475743e-10
  2.2711721e-11 1.7106480e-10]]


In [28]:
# Map the prediction back to words with the Indonesian 100k tokenizer.
text_prediction = decode_seq(id_tokenizer_100k, prediction)

print(text_prediction)

bisa kita bermain sekarang


In [30]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def avg_bleu_score(input_data, output_data, input_tokenizer, output_tokenizer, max_sequence_length, model):
    """Translate each input sentence with ``model`` and score it against its reference.

    Every sentence is encoded with ``encode_seq``, passed to ``model.predict``,
    decoded with ``decode_seq`` and compared with the reference at the same
    position using NLTK's smoothed sentence-level BLEU (smoothing ``method1``).

    Args:
        input_data: Indexable collection of source sentences.
        output_data: Indexable collection of reference translations; it must
            have an entry for every index of ``input_data``.
        input_tokenizer: Tokenizer handed to ``encode_seq`` for the source text.
        output_tokenizer: Tokenizer handed to ``decode_seq`` for the prediction.
        max_sequence_length: Length each encoded input is padded to.
        model: Object exposing ``predict``.

    Returns:
        A tuple ``(average, results)``. ``average`` is the mean of the
        per-sentence BLEU scores (NaN when ``input_data`` is empty).
        ``results`` is a list with one dict per sentence holding the keys
        ``"input"``, ``"reference"``, ``"prediction"`` and ``"bleu_score"``.
        The average is also printed with four decimals.
    """
    smoothing_function = SmoothingFunction().method1  # Apply smoothing
    bleu_scores = []
    results = []

    for i in range(len(input_data)):
        # Tokenize and pad the single sentence to the caller-supplied length
        # (this used to be hard-coded to 60, ignoring max_sequence_length).
        input_seq = encode_seq(input_tokenizer, max_sequence_length, input_data[i])

        # Predict for this single input and decode the prediction to text.
        prediction = model.predict(input_seq)
        predicted_text = decode_seq(output_tokenizer, prediction)

        reference_text = output_data[i]

        # NOTE(review): the reference is split on whitespace as-is, so its
        # capitalisation and punctuation take part in the token comparison.
        # If decoded predictions are lower-cased and punctuation-free, this
        # lowers the score -- consider normalising both sides the same way.
        reference_tokens = [reference_text.split()]
        predicted_tokens = predicted_text.split()

        # Calculate the BLEU score for this sentence.
        bleu_score_value = sentence_bleu(reference_tokens, predicted_tokens, smoothing_function=smoothing_function)
        bleu_scores.append(bleu_score_value)

        # Keep the input, reference, prediction and score for later inspection.
        results.append({
            "input": input_data[i],
            "reference": reference_text,
            "prediction": predicted_text,
            "bleu_score": bleu_score_value
        })

    # Average over all sentences; avoid np.mean([]) and its RuntimeWarning when
    # nothing was scored. The local name no longer shadows the function name.
    mean_bleu = np.mean(bleu_scores) if bleu_scores else float("nan")
    print(f"Average BLEU Score: {mean_bleu:.4f}")

    return mean_bleu, results
    

In [34]:
import pandas as pd
# Read the en-id test parquet file of the Helsinki-NLP/opus-100 dataset via the hf:// path.
df_test = pd.read_parquet("hf://datasets/Helsinki-NLP/opus-100/en-id/test-00000-of-00001.parquet")

# Show column names, dtypes and non-null counts.
df_test.info()

  from .autonotebook import tqdm as notebook_tqdm


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   translation  2000 non-null   object
dtypes: object(1)
memory usage: 15.8+ KB


In [35]:
# Preview the first five rows of the test set.
df_test.head(5)

Unnamed: 0,translation
0,"{'en': 'He was on the run.', 'id': 'Dia sedang..."
1,"{'en': 'How d'ya like that, Spidey?', 'id': 'B..."
2,"{'en': '- Staff Sergeant, sir!', 'id': '- Staf..."
3,"{'en': 'I'll be with her.', 'id': 'Aku akan be..."
4,"{'en': 'Because of us, they will be ruined.', ..."


In [38]:
# Each 'translation' value is a mapping with an 'en' and an 'id' entry;
# pull the two sides out into parallel Python lists.
english_test = [pair['en'] for pair in df_test['translation']]
indonesian_test = [pair['id'] for pair in df_test['translation']]

print(english_test[:5])
print(indonesian_test[:5])

['He was on the run.', "How d'ya like that, Spidey?", '- Staff Sergeant, sir!', "I'll be with her.", 'Because of us, they will be ruined.']
['Dia sedang dalam pelarian.', 'Bagaimana sekarang, Spidey?', '- Staf Sersan, Pak!', 'Aku akan bersamanya.', 'Karena kita, mereka akan hancur.']


In [42]:
# Score the 2-layer model on the test sentences with the 100k tokenizers (60 is passed as max_sequence_length).
bleu_2_layer, results_2_layer = avg_bleu_score(english_test, indonesian_test, en_tokenizer_100k, id_tokenizer_100k, 60, model_2_layer)

Average BLEU Score: 0.0273


In [43]:
# Score the 1-layer model on the test sentences with the 100k tokenizers (60 is passed as max_sequence_length).
bleu_1_layer, results_1_layer = avg_bleu_score(english_test, indonesian_test, en_tokenizer_100k, id_tokenizer_100k, 60, model_1_layer)

Average BLEU Score: 0.0295


In [44]:
# Score the 50k model on the test sentences with the 50k tokenizers (60 is passed as max_sequence_length).
bleu_50k, results_50k = avg_bleu_score(english_test, indonesian_test, en_tokenizer_50k, id_tokenizer_50k, 60, model_50k)

Average BLEU Score: 0.0292


In [61]:
# Print the three average BLEU scores at full precision, one line per model.
print("Avg BLEU Score for 2 layers model:", bleu_2_layer)
print("Avg BLEU Score for 1 layers model:", bleu_1_layer)
print("Avg BLEU Score for 50k train data model:", bleu_50k)

Avg BLEU Score for 2 layers model: 0.02725818329571638
Avg BLEU Score for 1 layers model: 0.02949756890353104
Avg BLEU Score for 50k train data model: 0.02923292450757474


In [47]:
# One row per test sentence: input, reference, prediction and BLEU score of the 2-layer model.
df_2_layer = pd.DataFrame(results_2_layer)

df_2_layer.head(10)

Unnamed: 0,input,reference,prediction,bleu_score
0,He was on the run.,Dia sedang dalam pelarian.,dia adalah di,0.0
1,"How d'ya like that, Spidey?","Bagaimana sekarang, Spidey?",bagaimana dia suka,0.0
2,"- Staff Sergeant, sir!","- Staf Sersan, Pak!",lanjutkan bermain tuan,0.0
3,I'll be with her.,Aku akan bersamanya.,aku akan dengan,0.113622
4,"Because of us, they will be ruined.","Karena kita, mereka akan hancur.",karena tentu kita mereka menjadi,0.053728
5,- Now sit you down.,- Sekarang coba duduk.,sekarang duduklah kau,0.0
6,"Okay, listen to me, Cole.","Oke, dengarkan aku, Cole.",oke dengar aku,0.0
7,- You're going to hell!,Kau akan masuk Neraka!,kau akan ke,0.081414
8,Then perhaps I can be of assistance in a diffe...,Mungkin aku bisa menolongamu dengan cara lain.,lalu mungkin aku bisa bisa,0.076163
9,For the little kids.,Untuk anak-anak kecil.,untuk saat,0.0


In [48]:
# One row per test sentence: input, reference, prediction and BLEU score of the 1-layer model.
df_1_layer = pd.DataFrame(results_1_layer)

df_1_layer.head(10)

Unnamed: 0,input,reference,prediction,bleu_score
0,He was on the run.,Dia sedang dalam pelarian.,dia adalah di,0.0
1,"How d'ya like that, Spidey?","Bagaimana sekarang, Spidey?",bagaimana dia seperti,0.0
2,"- Staff Sergeant, sir!","- Staf Sersan, Pak!",kau sersan tuan,0.0
3,I'll be with her.,Aku akan bersamanya.,aku akan dengan dia,0.080343
4,"Because of us, they will be ruined.","Karena kita, mereka akan hancur.",karena itu kita mereka akan,0.113622
5,- Now sit you down.,- Sekarang coba duduk.,sekarang duduk,0.0
6,"Okay, listen to me, Cole.","Oke, dengarkan aku, Cole.",oke dengar,0.0
7,- You're going to hell!,Kau akan masuk Neraka!,kau akan pergi,0.081414
8,Then perhaps I can be of assistance in a diffe...,Mungkin aku bisa menolongamu dengan cara lain.,lalu mungkin aku bisa bisa,0.076163
9,For the little kids.,Untuk anak-anak kecil.,untuk satu kecil anak,0.0


In [49]:
# One row per test sentence: input, reference, prediction and BLEU score of the 50k model.
df_50k = pd.DataFrame(results_50k)

df_50k.head(10)

Unnamed: 0,input,reference,prediction,bleu_score
0,He was on the run.,Dia sedang dalam pelarian.,dia adalah di yang,0.0
1,"How d'ya like that, Spidey?","Bagaimana sekarang, Spidey?",bagaimana dia suka itu,0.0
2,"- Staff Sergeant, sir!","- Staf Sersan, Pak!",dan sersan tuan,0.0
3,I'll be with her.,Aku akan bersamanya.,aku akan dengan,0.113622
4,"Because of us, they will be ruined.","Karena kita, mereka akan hancur.",karena orang kita mereka akan,0.113622
5,- Now sit you down.,- Sekarang coba duduk.,sekarang duduk,0.0
6,"Okay, listen to me, Cole.","Oke, dengarkan aku, Cole.",oke dengar,0.0
7,- You're going to hell!,Kau akan masuk Neraka!,kau akan ke,0.081414
8,Then perhaps I can be of assistance in a diffe...,Mungkin aku bisa menolongamu dengan cara lain.,kemudian mungkin aku bisa bisa dari,0.07308
9,For the little kids.,Untuk anak-anak kecil.,untuk kepuasanku yang anak,0.0


In [53]:
# Target directory prefix for the exported models; it ends with '/' because
# the file names below are appended by plain string concatenation.
path = "./h5-models/"

# Save each loaded model again under a .h5 file name.
model_2_layer.save(path + "model-rnn_2_layer.h5")
model_1_layer.save(path + "model-rnn_1_layer.h5")
model_50k.save(path + "model-rnn_50k.h5")

In [56]:
# Give each model's prediction column a distinct name so the three can sit side by side.
df_2_layer = df_2_layer.rename(columns={'prediction': 'prediction_2_layer'})
df_1_layer = df_1_layer.rename(columns={'prediction': 'prediction_1_layer'})
df_50k = df_50k.rename(columns={'prediction': 'prediction_50k'})

# The three result frames hold one row per test sentence in the same order,
# so they are combined by row index. Joining on ('input', 'reference') would
# multiply rows whenever the same sentence pair occurs more than once in the
# test set. Verify the alignment instead of assuming it.
assert df_2_layer[['input', 'reference']].equals(df_1_layer[['input', 'reference']]), \
    "df_1_layer rows are not aligned with df_2_layer"
assert df_2_layer[['input', 'reference']].equals(df_50k[['input', 'reference']]), \
    "df_50k rows are not aligned with df_2_layer"

merged_df = df_2_layer[['input', 'reference', 'prediction_2_layer']].assign(
    prediction_1_layer=df_1_layer['prediction_1_layer'],
    prediction_50k=df_50k['prediction_50k'],
)

merged_df.head(10)

Unnamed: 0,input,reference,prediction_2_layer,prediction_1_layer,prediction_50k
0,He was on the run.,Dia sedang dalam pelarian.,dia adalah di,dia adalah di,dia adalah di yang
1,"How d'ya like that, Spidey?","Bagaimana sekarang, Spidey?",bagaimana dia suka,bagaimana dia seperti,bagaimana dia suka itu
2,"- Staff Sergeant, sir!","- Staf Sersan, Pak!",lanjutkan bermain tuan,kau sersan tuan,dan sersan tuan
3,I'll be with her.,Aku akan bersamanya.,aku akan dengan,aku akan dengan dia,aku akan dengan
4,"Because of us, they will be ruined.","Karena kita, mereka akan hancur.",karena tentu kita mereka menjadi,karena itu kita mereka akan,karena orang kita mereka akan
5,- Now sit you down.,- Sekarang coba duduk.,sekarang duduklah kau,sekarang duduk,sekarang duduk
6,"Okay, listen to me, Cole.","Oke, dengarkan aku, Cole.",oke dengar aku,oke dengar,oke dengar
7,- You're going to hell!,Kau akan masuk Neraka!,kau akan ke,kau akan pergi,kau akan ke
8,Then perhaps I can be of assistance in a diffe...,Mungkin aku bisa menolongamu dengan cara lain.,lalu mungkin aku bisa bisa,lalu mungkin aku bisa bisa,kemudian mungkin aku bisa bisa dari
9,For the little kids.,Untuk anak-anak kecil.,untuk saat,untuk satu kecil anak,untuk kepuasanku yang anak


In [59]:
# Write the side-by-side comparison to an Excel file, leaving out the row index column.
merged_df.to_excel('test_data_comparison.xlsx', index=False)