# Dataset Preparation


In [1]:
!pip install textstat pandas

import pandas as pd
import textstat


Collecting textstat
  Downloading textstat-0.7.10-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.10-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.2/239.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.2 textstat-0.7.10
Mounted at /content/drive


## Parsing Sentence-Aligned Text Files

The original files contain aligned sentence pairs: each pair has the source sentence on one line and its simplified (target) sentence on the next line, and consecutive pairs are separated by one or more blank lines.  

We used a parser function to:

- Read the file line by line.  
- Skip empty lines.  
- Pair each source sentence with its corresponding target sentence.  
- Store these pairs as dictionaries in a list.  

This transforms the raw text files into a structured dataset suitable for further processing and model training.


In [2]:
def parse_simplification_txt_1(file_path):
    """Read a sentence-aligned text file into a list of source/target pairs.

    The file is read as UTF-8 and split into lines. Blank (whitespace-only)
    lines between pairs are ignored. The first non-blank line found is taken
    as the source sentence and the line immediately after it as the target
    sentence, both stripped of surrounding whitespace.

    Note that the target is taken unconditionally: if the line right after a
    source is blank, the pair is still recorded with an empty ``target_text``.
    A source on the very last line of the file (no following line) is dropped.

    Args:
        file_path: Path of the text file to parse.

    Returns:
        A list of dicts, each with the keys ``'source_text'`` and
        ``'target_text'``, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_lines = handle.read().splitlines()

    pairs = []
    total = len(raw_lines)
    pos = 0
    while pos < total:
        # Blank separator lines carry no content; step over them one by one.
        if not raw_lines[pos].strip():
            pos += 1
            continue

        # A source with no line after it cannot form a pair, so stop here.
        if pos + 1 >= total:
            break

        pairs.append({
            'source_text': raw_lines[pos].strip(),
            'target_text': raw_lines[pos + 1].strip(),
        })
        # Move past both lines of the pair just consumed.
        pos += 2

    return pairs


In [6]:
# Parse the raw aligned file (path is relative to the working directory)
# into a list of {'source_text', 'target_text'} dicts.
file_path = "complex_simple.txt"
parsed_data = parse_simplification_txt_1(file_path)
# One row per sentence pair; columns come from the dict keys.
df = pd.DataFrame(parsed_data)

# Persist the parsed pairs as UTF-8 CSV, without writing the row index.
df.to_csv("simplification_dataset.csv", index=False, encoding="utf-8")

## Data Cleaning

In [19]:

# Reload the parsed pairs from disk so this cell works from the saved CSV
# rather than from whatever `df` happens to be in kernel memory.
df_loaded = pd.read_csv("simplification_dataset.csv")

# A row is kept only if BOTH texts are present and non-blank. Empty fields
# come back from read_csv as NaN by default, so the NaN check and the
# blank-string check are both needed. astype(str) keeps the .str accessor
# usable even if a column was parsed with a non-string dtype.
source_ok = df_loaded["source_text"].notna() & (
    df_loaded["source_text"].astype(str).str.strip() != ""
)
target_ok = df_loaded["target_text"].notna() & (
    df_loaded["target_text"].astype(str).str.strip() != ""
)

# Apply all conditions together (previously each filter restarted from `df`,
# so only the last one took effect) and renumber the rows 0..n-1 so that
# positional loops over the cleaned frame line up with its index labels.
df_clean = df_loaded[source_ok & target_ok].reset_index(drop=True)

df_clean.to_csv("simplification_dataset_clean.csv", index=False)

## Add Readability Scores for Analysis


In [20]:
# Compute Flesch-Kincaid grade (fk_*) and Flesch reading ease (fre_*) for
# every source and target sentence.
#
# The scores are built by iterating over the column VALUES, not by looking up
# df_clean.loc[i] for i in range(len(df_clean)): .loc is label-based, so on a
# filtered frame whose index has gaps it raises KeyError or reads the wrong
# rows. Iterating the column is correct for any index.
fk_source = [textstat.flesch_kincaid_grade(text) for text in df_clean['source_text']]
fk_target = [textstat.flesch_kincaid_grade(text) for text in df_clean['target_text']]
fre_source = [textstat.flesch_reading_ease(text) for text in df_clean['source_text']]
fre_target = [textstat.flesch_reading_ease(text) for text in df_clean['target_text']]

# assign() returns a new frame with the score lists attached positionally,
# which avoids writing into a filtered slice (SettingWithCopyWarning) and
# makes the cell safe to re-run.
df_clean = df_clean.assign(
    fk_source=fk_source,
    fk_target=fk_target,
    fre_source=fre_source,
    fre_target=fre_target,
)

In [21]:
# Export the cleaned sentence pairs together with their readability score
# columns; the row index is not written to the file.
df_clean.to_csv("with_readability.csv", index=False)