# Newsela 

Elaborations: https://github.com/nehasrikn/elaborative-simplification/tree/main/data

- original article: version 0 
- simplest: version 4, 5

# Load data

In [43]:
import os
import pandas as pd

# Root folder of the local Newsela corpus (relative path).
data_path = "../data/newsela_article_corpus_2016-01-29"

# Sub-folders of the corpus root that later cells read from.
articles_path = os.path.join(data_path, "articles")
elaborations_path = os.path.join(data_path, "elaborations")

# Article metadata table, read from the CSV in the corpus root.
metadata_df = pd.read_csv(os.path.join(data_path, "articles_metadata.csv"))

In [None]:
# Preview the first rows of the article metadata table.
metadata_df.head()

# Utils

## Text-files

In [40]:
import re

def preprocess(text):
    """Collapse raw article text into a single tidy line.

    The clean-up runs in a fixed order: runs of newlines become one space,
    backslash-escaped single and double quotes are unescaped, any backslash
    that is still left is dropped, and finally every run of whitespace is
    squeezed to one space with the ends trimmed.

    Args:
        text: Raw string to clean.

    Returns:
        The cleaned string.
    """
    # Newline runs are flattened first.
    cleaned = re.sub(r'\n+', ' ', text)

    # Unescape quotes: single quotes first, then double quotes.
    for escaped, plain in ((r"\'", "'"), (r'\"', '"')):
        cleaned = cleaned.replace(escaped, plain)

    # Whatever backslashes survive the unescaping are removed outright.
    cleaned = re.sub(r'\\', '', cleaned)

    # The steps above can leave doubled spaces; normalise and trim.
    return re.sub(r'\s+', ' ', cleaned).strip()

In [64]:
import os

def read_in_article(file_name, articles_dir=None):
    """Read one article file and return its text.

    Args:
        file_name: Name of the file inside the articles directory.
        articles_dir: Directory that holds the file. When None (the
            default) the notebook-level ``articles_path`` is used, which
            matches the behaviour before this parameter existed.

    Returns:
        The file contents decoded as UTF-8. This function does not raise on
        failure; it returns a message string instead:
        ``"Error: File not found."`` when the file is missing, or
        ``"An error occurred: <details>"`` for any other exception. Callers
        that need to tell content from failure must compare against those
        messages.
    """
    try:
        # Resolve the default inside the try so a missing global is reported
        # through the same message path as before.
        if articles_dir is None:
            articles_dir = articles_path
        with open(os.path.join(articles_dir, file_name), 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        return "Error: File not found."
    except Exception as e:
        return f"An error occurred: {e}"

In [None]:
# Draw a single metadata row; the fixed seed keeps the pick repeatable.
sample = metadata_df.sample(n=1, random_state=42)

# The file name is assembled as "<slug>.<language>.<version>.txt".
slug, lang, ver = (
    str(sample[column].iloc[0]) for column in ('slug', 'language', 'version')
)
file_name = f"{slug}.{lang}.{ver}.txt"

# Load the article and print its cleaned text.
article = read_in_article(file_name)
print(preprocess(article))

# Create elaboration dataset

In [61]:
import os 
import json
import pandas as pd

# Folder holding the 2015-03-02 data-share files.
newsela_doc_path = os.path.join(data_path, "newsela_data_share-20150302")

# Read the three elaboration split files, in train / val / test order.
train_metadata_df, valid_metadata_df, test_metadata_df = (
    pd.read_json(os.path.join(elaborations_path, split_file))
    for split_file in ("train.json", "val.json", "test.json")
)

# Expand each split's "elaborations" column into its own flat table.
train_elab_metadata_df, valid_elab_metadata_df, test_elab_metadata_df = (
    pd.json_normalize(split_df["elaborations"])
    for split_df in (train_metadata_df, valid_metadata_df, test_metadata_df)
)

In [None]:
# Preview the first rows of the normalised training elaborations table.
train_elab_metadata_df.head()

In [62]:
# Show the data-share folder path that the following cells read from.
print(newsela_doc_path)

../data/newsela_article_corpus_2016-01-29/newsela_data_share-20150302


In [None]:
# Parse the "5versions.sents" JSON from the data-share folder. The resulting
# DataFrame is only displayed as the cell output; it is not bound to a name.
pd.read_json(os.path.join(newsela_doc_path, "newsela_articles_20150302.5versions.sents.json"))

In [67]:
# Read the entire "aligned.sents" text file into `content` as one string.
with open(
    os.path.join(newsela_doc_path, "newsela_articles_20150302.aligned.sents.txt"),
    mode='r',
    encoding='utf-8',
) as file:
    content = file.read()