# Data Acquisition

In [1]:
!git clone https://github.com/gunnxx/indonesian-mt-data.git

Cloning into 'indonesian-mt-data'...
remote: Enumerating objects: 44, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 44 (delta 0), reused 3 (delta 0), pack-reused 31 (from 1)[K
Receiving objects: 100% (44/44), 243.82 MiB | 11.05 MiB/s, done.
Updating files: 100% (27/27), done.


In [2]:
import pandas as pd

# (English file, Indonesian file) path pair for every split of the News corpus
datasets = dict(
    train=('indonesian-mt-data/news/train.en', 'indonesian-mt-data/news/train.id'),
    val=('indonesian-mt-data/news/valid.en', 'indonesian-mt-data/news/valid.id'),
    test=('indonesian-mt-data/news/test.en', 'indonesian-mt-data/news/test.id'),
)

# Function to load datasets into a DataFrame
def load_dataset(en_path, id_path):
    """Load a line-aligned English/Indonesian corpus into a DataFrame.

    Parameters
    ----------
    en_path : str or path-like
        Text file holding one English sentence per line.
    id_path : str or path-like
        Text file holding one Indonesian sentence per line.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Indonesian'`` and ``'English'``; row *i* holds line *i* of
        each file with leading/trailing whitespace stripped.

    Raises
    ------
    ValueError
        If the two files do not contain the same number of lines, since the
        sentences could then no longer be paired row by row.
    """
    def read_lines(path):
        # The context manager closes the handle deterministically. UTF-8 is
        # requested explicitly so the result does not depend on the platform's
        # default encoding (assumes the corpus files are UTF-8).
        with open(path, 'r', encoding='utf-8') as handle:
            return [line.strip() for line in handle]

    en_sentences = read_lines(en_path)
    id_sentences = read_lines(id_path)

    if len(en_sentences) != len(id_sentences):
        raise ValueError(
            f"Line count mismatch: {en_path} has {len(en_sentences)} lines, "
            f"{id_path} has {len(id_sentences)} lines"
        )

    return pd.DataFrame({'Indonesian': id_sentences, 'English': en_sentences})

# Read the three splits in a fixed order: train, validation, test
df_train, df_val, df_test = (
    load_dataset(*datasets[split]) for split in ('train', 'val', 'test')
)

We use the **News En-Id** dataset, a parallel Indonesian-English corpus from the paper *Benchmarking Multidomain English-Indonesian Machine Translation*. This dataset was chosen for its complex sentence structures and formal language, making it a good fit for sentence simplification tasks. We cloned the dataset from GitHub and loaded the training, validation, and test sets into Pandas DataFrames for processing.

# Data Preprocessing

In [None]:
# Check for missing values in each column
missing_values = df_train.isnull().sum()
print(missing_values)

Indonesian    0
English       0
dtype: int64


In [3]:
import pandas as pd

# Function to calculate sentence length (word count) for both languages
def sentence_length_analysis(df, indonesian_col='Indonesian', english_col='English'):
    """Add whitespace-token word counts for both language columns.

    For each of ``indonesian_col`` and ``english_col`` a new column named
    ``<column>_length`` is written onto ``df`` itself. Values are cast to
    ``str`` before being split on whitespace.

    Returns the same ``df`` object that was passed in.
    """
    def count_words(text):
        # str.split() with no argument splits on any run of whitespace
        return len(text.split())

    for column in (indonesian_col, english_col):
        df[column + '_length'] = df[column].astype(str).apply(count_words)

    return df

#Apply sentence length
df_train = sentence_length_analysis(df_train)
df_val = sentence_length_analysis(df_val)
df_test = sentence_length_analysis(df_test)

In [None]:
#Summary statistics for sentence length
df_train.describe()

Unnamed: 0,Indonesian_length,English_length
count,38469.0,38469.0
mean,19.169487,20.882243
std,10.173374,10.909587
min,1.0,0.0
25%,12.0,13.0
50%,18.0,20.0
75%,26.0,28.0
max,188.0,200.0


In [None]:
# Collect the rows whose sentence is blank once surrounding whitespace is removed
empty_indonesian = df_train.loc[df_train['Indonesian'].str.strip().eq('')]
empty_english = df_train.loc[df_train['English'].str.strip().eq('')]

# Report how many blank sentences each language has
print("Empty Indonesian sentences count:", len(empty_indonesian))
print("Empty English sentences count:", len(empty_english))

Empty Indonesian sentences count: 0
Empty English sentences count: 7


In [4]:
# Keep sentence pairs of 10 to 100 words (inclusive) by default
def filter_sentence_length(df, min_length=10, max_length=100, indonesian_col='Indonesian_length', english_col='English_length'):
    """Keep rows whose word counts lie within ``[min_length, max_length]``.

    A row is kept only when BOTH the English and the Indonesian length
    columns fall inside the inclusive range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the two length columns.
    min_length, max_length : int
        Inclusive lower / upper bound on the word count.
    indonesian_col, english_col : str
        Names of the columns holding the word counts.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows with the original index
        labels, so that adding columns to the result later neither touches
        ``df`` nor triggers a chained-assignment warning.
    """
    within_bounds = (
        df[english_col].between(min_length, max_length)
        & df[indonesian_col].between(min_length, max_length)
    )
    return df.loc[within_bounds].copy()

df_train = filter_sentence_length(df_train)

The threshold was set to a minimum of 10 words for both Indonesian and English sentences. For Indonesian, this ensures the sentence is complex enough, while for English, it helps address the empty sentences that were detected. The maximum length is limited to 100 words to avoid extremely lengthy sentences.

In [None]:
df_train.describe()

Unnamed: 0,Indonesian_length,English_length
count,31050.0,31050.0
mean,22.122705,23.983156
std,8.843952,9.444574
min,10.0,10.0
25%,15.0,17.0
50%,21.0,23.0
75%,27.0,30.0
max,96.0,96.0


In [None]:
# Total data in validation set and test set before applying filters
print("Total data in the validation set:", df_val.shape)
print("Total data in the test set:", df_test.shape)

Total data in the validation set: (1953, 4)
Total data in the test set: (1954, 4)


In [None]:
df_val = filter_sentence_length(df_val)
df_test = filter_sentence_length(df_test)

In [None]:
# Total data in validation set and test set after applying filters
print("Total data in the validation set:", df_val.shape)
print("Total data in the test set:", df_test.shape)

Total data in the validation set: (1489, 4)
Total data in the test set: (1463, 4)


After keeping only the sentence pairs in which both sentences have between 10 and 100 words, we retained **31,050** of 38,469 sentences in the **training set**, **1,489** of 1,953 in the **validation set**, and **1,463** of 1,954 in the **test set**.

# Translation

In [5]:
# Randomly sample 25% of the dataframe
df_train = df_train.sample(frac=0.25, random_state=42)


We randomly sample 25% of the training dataset because of limited computational resources.

In [6]:
# Total data in train set
print("Total data in the training set:", df_train.shape)

Total data in the training set: (7762, 4)


In [None]:
# Install necessary libraries
!pip install transformers sentencepiece pandas tqdm

# Import libraries
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd
from tqdm import tqdm
import math
import torch



In [None]:
# Load MarianMT English to Indonesian translation model and tokenizer
model_name = 'Helsinki-NLP/opus-mt-en-id'  # MarianMT for English to Indonesian
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Function to translate a sentence
def translate(sentence):
    """Translate one sentence with the module-level MarianMT model.

    Uses the globals ``tokenizer``, ``model`` and ``device``. The input is
    tokenized (truncated to at most 512 tokens), moved to ``device``, passed
    to ``model.generate`` and the first generated sequence is decoded with
    special tokens removed.

    Returns the decoded string, or ``None`` if any step raises; in that case
    the error message is printed instead of propagated.
    """
    try:
        encoded = tokenizer(
            [sentence],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        encoded = encoded.to(device)
        generated_ids = model.generate(**encoded)
        # A single sentence was submitted, so only the first output is needed
        return tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error translating: {e}")
        return None

In [None]:
# Function to translate columns in a DataFrame
def translate_dataframe(df, source_col='English', target_col='Indonesian_Translation'):
    """Translate every value of ``source_col`` and store it in ``target_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the sentences to translate; it is modified in place.
    source_col : str
        Column whose values are passed one by one to ``translate``.
    target_col : str
        Column that receives the translations (created or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now containing ``target_col``. Entries are
        whatever ``translate`` returned for that row, which is ``None`` when
        the translation failed.
    """
    # Iterate over the column values instead of looking rows up by index
    # label: ``df.at[label, col]`` yields a Series rather than a string when
    # labels repeat, and one column assignment replaces a write per row.
    translations = [translate(sentence) for sentence in tqdm(df[source_col])]
    df[target_col] = translations

    return df

In [None]:
# Apply function to dataframes
df_train = translate_dataframe(df_train)

100%|██████████| 7762/7762 [35:15<00:00,  3.67it/s]


In [None]:
df_train.head()

Unnamed: 0,Indonesian,English,Indonesian_length,English_length,Indonesian_Translation
26100,"Para pendukung Chelsea, yang terakhir kali mel...","The Chelsea fans, who last saw their team lose...",39,48,"Para penggemar Chelsea, yang terakhir melihat ..."
22860,Jurubicara BASF menyatakan karyawan di tempat ...,A BASF spokesman said workers on the site were...,33,41,Juru bicara BASF mengatakan para pekerja di lo...
20374,Padahal sebenarnya saat ini merupakan momentum...,Meanwhile now is actually the time for the com...,18,12,Sementara itu sekarang adalah waktu bagi perus...
8060,"""Meski tak berbahaya, karakter-karakter lucu d...","""Although harmless, cute images are adopted to...",24,25,"""Meskipun tidak berbahaya, gambar lucu diadops..."
36835,Saya merasa kami dirampok dan Mr Duhamel meram...,I feel we've been robbed and Mr Duhamel robbed...,20,20,Aku merasa kita dirampok dan Tn. Duhamel meram...


In [None]:
# Apply function to dataframes
df_val = translate_dataframe(df_val)

100%|██████████| 1489/1489 [06:15<00:00,  3.96it/s]


In [None]:
# Apply function to dataframes
df_test = translate_dataframe(df_test)

100%|██████████| 1463/1463 [06:46<00:00,  3.60it/s]


In [None]:
# Remove the specified columns from the dataframe
def remove_columns(df, columns_to_remove=('English', 'Indonesian_length', 'English_length')):
    """Return a copy of ``df`` without the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns_to_remove : str or iterable of str, optional
        Column name(s) to drop. Defaults to the English source text and the
        two word-count columns.

    Returns
    -------
    pandas.DataFrame
        New frame containing the remaining columns.

    Raises
    ------
    KeyError
        If any requested column is missing from ``df``.
    """
    if isinstance(columns_to_remove, str):
        columns_to_remove = [columns_to_remove]
    # drop() interprets a tuple as one single label, so always hand it a list
    return df.drop(columns=list(columns_to_remove))

# Apply the column removal to each dataframe
df_train = remove_columns(df_train)
df_val = remove_columns(df_val)
df_test = remove_columns(df_test)

In [None]:
df_train.head()

Unnamed: 0,Indonesian,Indonesian_Translation
26100,"Para pendukung Chelsea, yang terakhir kali mel...","Para penggemar Chelsea, yang terakhir melihat ..."
22860,Jurubicara BASF menyatakan karyawan di tempat ...,Juru bicara BASF mengatakan para pekerja di lo...
20374,Padahal sebenarnya saat ini merupakan momentum...,Sementara itu sekarang adalah waktu bagi perus...
8060,"""Meski tak berbahaya, karakter-karakter lucu d...","""Meskipun tidak berbahaya, gambar lucu diadops..."
36835,Saya merasa kami dirampok dan Mr Duhamel meram...,Aku merasa kita dirampok dan Tn. Duhamel meram...


# Selector

In [None]:
!pip install textstat sacrebleu
import nltk
import textstat
import sacrebleu
import pandas as pd
nltk.download('punkt')

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.16.0-py3-none-any.whl.metadata (3.2 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-non

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Rename columns in the dataframe
def rename_columns(df):
    """Return a copy of ``df`` using the simplification-dataset column names.

    ``'Indonesian'`` becomes ``'Id_Complex'`` and ``'Indonesian_Translation'``
    becomes ``'Id_Simple'``; every other column keeps its name.
    """
    column_map = {'Indonesian': 'Id_Complex', 'Indonesian_Translation': 'Id_Simple'}
    return df.rename(columns=column_map)

# Apply to each dataframe
df_train = rename_columns(df_train)
df_val = rename_columns(df_val)
df_test = rename_columns(df_test)

In [None]:
df_train.head()

Unnamed: 0,Id_Complex,Id_Simple
26100,"Para pendukung Chelsea, yang terakhir kali mel...","Para penggemar Chelsea, yang terakhir melihat ..."
22860,Jurubicara BASF menyatakan karyawan di tempat ...,Juru bicara BASF mengatakan para pekerja di lo...
20374,Padahal sebenarnya saat ini merupakan momentum...,Sementara itu sekarang adalah waktu bagi perus...
8060,"""Meski tak berbahaya, karakter-karakter lucu d...","""Meskipun tidak berbahaya, gambar lucu diadops..."
36835,Saya merasa kami dirampok dan Mr Duhamel meram...,Aku merasa kita dirampok dan Tn. Duhamel meram...


In [None]:
# Function to calculate BLEU score using SACREBLEU
def calculate_bleu(reference_sentence, translated_sentence):
    """Return the sentence-level sacreBLEU score of a translation.

    ``translated_sentence`` is scored as the hypothesis against
    ``reference_sentence`` as its only reference; the ``score`` attribute of
    the sacreBLEU result is returned.
    """
    references = [reference_sentence]
    bleu = sacrebleu.sentence_bleu(translated_sentence, references)
    return bleu.score

# Function to calculate FRES (Flesch Reading Ease Score)
def calculate_fres(sentence):
    """Return textstat's Flesch Reading Ease score for ``sentence``.

    NOTE(review): no textstat language is configured anywhere in this
    notebook, so the library's default settings are presumably used —
    confirm that those are appropriate for Indonesian text.
    """
    score = textstat.flesch_reading_ease(sentence)
    return score

# Function to apply BLEU and FRES scores to a dataframe
def apply_bleu_fres(df):
    """Add BLEU and readability (FRES) columns to a complex/simple frame.

    Expects the columns ``'Id_Complex'`` and ``'Id_Simple'`` and writes four
    new columns onto ``df`` itself:

    * ``bleu_score``      - ``calculate_bleu(Id_Complex, Id_Simple)`` per row
    * ``fres_complex``    - ``calculate_fres`` of ``Id_Complex``
    * ``fres_simple``     - ``calculate_fres`` of ``Id_Simple``
    * ``fres_difference`` - ``fres_simple - fres_complex``

    Returns the same ``df`` object that was passed in.
    """
    # Walk the two text columns in lockstep rather than using a row-wise
    # DataFrame.apply(axis=1): this avoids building a Series per row and
    # also yields a plain (possibly empty) list when the frame has no rows.
    df['bleu_score'] = [
        calculate_bleu(complex_sentence, simple_sentence)
        for complex_sentence, simple_sentence in zip(df['Id_Complex'], df['Id_Simple'])
    ]

    # Readability of each side, scored independently
    df['fres_complex'] = df['Id_Complex'].apply(calculate_fres)
    df['fres_simple'] = df['Id_Simple'].apply(calculate_fres)

    # Positive difference: the simple side scored higher than the complex side
    df['fres_difference'] = df['fres_simple'] - df['fres_complex']

    return df

In [None]:
# Apply to each dataframe
df_train = apply_bleu_fres(df_train)
df_val = apply_bleu_fres(df_val)
df_test = apply_bleu_fres(df_test)

To select sentence pairs with a larger difference in complexity, we use two selectors: BLEU and FRES.

* BLEU is used to assess the quality of paraphrasing by comparing the similarity
between the original and simplified sentences.
* FRES measures the readability of a text, with higher scores indicating easier readability.

In [None]:
df_train.head()

Unnamed: 0,Id_Complex,Id_Simple,bleu_score,fres_complex,fres_simple,fres_difference
26100,"Para pendukung Chelsea, yang terakhir kali mel...","Para penggemar Chelsea, yang terakhir melihat ...",29.36466,23.43,19.37,-4.06
22860,Jurubicara BASF menyatakan karyawan di tempat ...,Juru bicara BASF mengatakan para pekerja di lo...,22.539266,29.52,30.88,1.36
20374,Padahal sebenarnya saat ini merupakan momentum...,Sementara itu sekarang adalah waktu bagi perus...,2.031577,44.75,36.96,-7.79
8060,"""Meski tak berbahaya, karakter-karakter lucu d...","""Meskipun tidak berbahaya, gambar lucu diadops...",71.765326,30.2,38.66,8.46
36835,Saya merasa kami dirampok dan Mr Duhamel meram...,Aku merasa kita dirampok dan Tn. Duhamel meram...,17.426656,42.72,27.49,-15.23


In [None]:
#Summary statistics
# NOTE(review): each statistic below is taken from a different split - BLEU
# from df_train, FRES (complex) from df_val and FRES difference from df_test -
# and the printed labels do not say which one. Confirm this mix is intended.
print("Bleu Score Stats:")
print(df_train['bleu_score'].describe())

print("\nFres complex Stats:")
print(df_val['fres_complex'].describe())

print("\nFres difference Stats:")
print(df_test['fres_difference'].describe())

Bleu Score Stats:
count    7762.000000
mean       23.449636
std        16.375643
min         0.000000
25%        10.435265
50%        19.476283
75%        33.189057
max       100.000000
Name: bleu_score, dtype: float64

Fres complex Stats:
count    1489.000000
mean       38.755695
std        18.428094
min       -31.740000
25%        26.810000
50%        38.660000
75%        51.850000
max        98.720000
Name: fres_complex, dtype: float64

Fres difference Stats:
count    1463.000000
mean        0.538339
std        16.194426
min       -70.730000
25%        -9.480000
50%         0.000000
75%        10.170000
max        60.910000
Name: fres_difference, dtype: float64


In [None]:
# Default selection thresholds
threshold_bleu = 15
threshold_fres_diff = 10  # FRES difference threshold

# Function to filter DataFrame based on BLEU score and FRES difference
def filter_dataframe(df, bleu_threshold=None, fres_diff_threshold=None):
    """Keep rows that exceed both the BLEU and the FRES-difference threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``'bleu_score'`` and ``'fres_difference'``.
    bleu_threshold : float, optional
        Rows need ``bleu_score`` strictly greater than this value. ``None``
        (default) uses the module-level ``threshold_bleu`` at call time.
    fres_diff_threshold : float, optional
        Rows need ``fres_difference`` strictly greater than this value.
        ``None`` (default) uses the module-level ``threshold_fres_diff`` at
        call time.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows with their original index
        labels.
    """
    if bleu_threshold is None:
        bleu_threshold = threshold_bleu
    if fres_diff_threshold is None:
        fres_diff_threshold = threshold_fres_diff

    selected = (df['bleu_score'] > bleu_threshold) & (df['fres_difference'] > fres_diff_threshold)
    return df.loc[selected].copy()

# Apply the filtering to each dataframe
SS_train= filter_dataframe(df_train)
SS_val = filter_dataframe(df_val)
SS_test = filter_dataframe(df_test)

We set the thresholds for BLEU and FRES score differences as follows:
*   A BLEU score threshold of 15 ensures that the translated sentences remain similar to the original sentences.
*   A FRES score difference threshold of 10 indicates that the simplified version is approximately one grade level simpler in readability compared to its unsimplified counterpart.

In [None]:
# Remove the specified columns from the dataframe
def remove_columns(df, columns_to_remove=('bleu_score', 'fres_complex', 'fres_simple', 'fres_difference')):
    """Return a copy of ``df`` without the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns_to_remove : str or iterable of str, optional
        Column name(s) to drop. Defaults to the four BLEU/FRES score columns.

    Returns
    -------
    pandas.DataFrame
        New frame containing the remaining columns.

    Raises
    ------
    KeyError
        If any requested column is missing from ``df``.
    """
    if isinstance(columns_to_remove, str):
        columns_to_remove = [columns_to_remove]
    # drop() interprets a tuple as one single label, so always hand it a list
    return df.drop(columns=list(columns_to_remove))

# Apply the column removal to each dataframe
SS_train = remove_columns(SS_train)
SS_val = remove_columns(SS_val)
SS_test = remove_columns(SS_test)

In [None]:
SS_train.head()

Unnamed: 0,Id_Complex,Id_Simple
10121,Penggunaan Tor di Rusia Melonjak sebagai Tangg...,Penggunaan Tor di Rusia Mencicipi Upaya Penyen...
28045,Peluang pembiayaan di sektor ini terbilang cer...,Prospek pembiayaan di sektor ini secara katego...
4144,"""Beberapa hari pertama peluncuran sistem bus d...","""Beberapa hari pertama peluncuran bus rusak ol..."
36690,Saya memperhitungkan set berikutnya akan lebih...,Aku berharap itu menjadi sulit dan itu adalah ...
23108,Selain itu undisbursed loan UL/pinjaman yang b...,Pinjaman yang tak terbantahkan di kuartal kedu...


In [None]:
# Total data after applying filters
print("Total data in the training set:", SS_train.shape)
print("Total data in the validation set:", SS_val.shape)
print("Total data in the test set:", SS_test.shape)

Total data in the training set: (1306, 2)
Total data in the validation set: (199, 2)
Total data in the test set: (194, 2)


After filtering the sentences, we are left with **1,306** sentence pairs in the training set, **199** in the validation set, and **194** in the test set for the sentence simplification dataset.