# Dataset Preparation

In this notebook, we will:
1. Load the sentence-aligned files from the corpus (ADV→ELE, ADV→INT, INT→ELE)
2. Parse them into clean sentence pairs
3. Add readability scores for both source and target sentences using Flesch-Kincaid Grade Level and Flesch Reading Ease
4. Save a CSV ready for training or evaluation

In [1]:

!pip install textstat pandas

import pandas as pd
import textstat

from google.colab import drive
drive.mount('/content/drive')


Collecting textstat
  Downloading textstat-0.7.10-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.10-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.2/239.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.2 textstat-0.7.10
Mounted at /content/drive


## Parse function

In [2]:
def parse_aligned_file(file_path, source_level, target_level, swap=False):
    """Parse a sentence-aligned text file into a list of sentence-pair records.

    The file is consumed as consecutive three-line records: two sentence
    lines followed by one line that is skipped without being inspected.
    Blank lines found where a record would start are ignored.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8; a leading
            byte-order mark, if present, is dropped).
        source_level: Value stored under "source_level" in every record.
        target_level: Value stored under "target_level" in every record.
        swap: If False (default), the first sentence of each record is the
            source and the second is the target. If True, the roles are
            reversed.

    Returns:
        A list of dicts with keys "source_text", "target_text",
        "source_level" and "target_level". Sentence text is stripped of
        leading/trailing whitespace.
    """
    # 'utf-8-sig' reads plain UTF-8 identically but removes a leading BOM,
    # which str.strip() would otherwise leave attached to the first sentence.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        lines = f.read().splitlines()

    data = []
    i = 0
    while i < len(lines):
        # Skip blank lines sitting between records.
        if lines[i].strip() == '':
            i += 1
            continue

        # A lone trailing line has no partner sentence: stop instead of
        # indexing past the end of the file.
        if i + 1 >= len(lines):
            break

        first_sentence = lines[i].strip()
        second_sentence = lines[i + 1].strip()

        if swap:
            source, target = second_sentence, first_sentence
        else:
            source, target = first_sentence, second_sentence

        data.append({
            "source_text": source,
            "target_text": target,
            "source_level": source_level,
            "target_level": target_level
        })

        # Advance past both sentences and the line that follows them.
        i += 3
    return data


## Load all sentence-alignment files

In [3]:
base_path = "/content/drive/MyDrive/Sentence-Aligned/"

# One parsed list per alignment file. ELE-INT.txt is read with swap=True so
# that its records are labelled INT (source) -> ELE (target).
adv_ele = parse_aligned_file(
    f"{base_path}ADV-ELE.txt", source_level="ADV", target_level="ELE"
)
adv_int = parse_aligned_file(
    f"{base_path}ADV-INT.txt", source_level="ADV", target_level="INT"
)
int_ele = parse_aligned_file(
    f"{base_path}ELE-INT.txt", source_level="INT", target_level="ELE", swap=True
)

# Concatenate the three lists (in this order) and build a single DataFrame.
all_data = [*adv_ele, *adv_int, *int_ele]
df = pd.DataFrame(all_data)
print(f"Total sentence pairs: {len(df)}")
df.head()

Total sentence pairs: 6006


Unnamed: 0,source_text,target_text,source_level,target_level
0,The Seattle-based company has applied for its ...,Amazon has asked for its company name to be a ...,ADV,ELE
1,"Until now, the differences between commercial,...","Until now, the differences between commercial,...",ADV,ELE
2,"Amazon has applied for dozens of new domains, ...","Amazon has applied for many new domains, inclu...",ADV,ELE
3,Allowing private companies to register geograp...,Allowing private companies to register geograp...,ADV,ELE
4,Brazil said its views were endorsed last month...,Brazil said other members of the Amazon Cooper...,ADV,ELE


## Add Readability Scores


In [4]:
# Score every source and target sentence with both readability metrics.
# Iterating over the column values directly (instead of df.loc[i, ...] with a
# positional counter) avoids a label lookup per row and does not depend on df
# having a default 0..len(df)-1 index.
fk_source = [textstat.flesch_kincaid_grade(text) for text in df['source_text']]
fk_target = [textstat.flesch_kincaid_grade(text) for text in df['target_text']]
fre_source = [textstat.flesch_reading_ease(text) for text in df['source_text']]
fre_target = [textstat.flesch_reading_ease(text) for text in df['target_text']]

# Lists are assigned positionally, one score per row, as new columns.
df['fk_source'] = fk_source
df['fk_target'] = fk_target
df['fre_source'] = fre_source
df['fre_target'] = fre_target


In [6]:
# Destination of the final dataset on the mounted Drive.
output_path = "/content/drive/MyDrive/all_levels_with_readability.csv"

# index=False leaves the DataFrame's row index out of the written file.
df.to_csv(path_or_buf=output_path, index=False)
print(f"Saved dataset to {output_path}")


Saved dataset to /content/drive/MyDrive/all_levels_with_readability.csv
