### Get a list of URLs from the downloaded files

- data/raw/html/wikipedia_*.html.meta.json (url field)
- data/raw/json/nasa_apod_*.json (url and hdurl fields)
- data/raw/xml/arxiv_*.xml (arXiv entry id fields)

In [1]:
import os
import pandas as pd
import json
import glob
import xml.etree.ElementTree as ET

In [2]:
# Raw downloads live under one root, split into one sub-directory per format.
_RAW_ROOT = "data/raw"
RAW_HTML_DIR = f"{_RAW_ROOT}/html"
RAW_JSON_DIR = f"{_RAW_ROOT}/json"
RAW_XML_DIR = f"{_RAW_ROOT}/xml"

# CSV that receives the collected (source, url) rows.
OUTPUT_FILE = "data/eval/url_sources.csv"

In [3]:
def get_wikipedia_urls():
    """Collect the page URL recorded in each Wikipedia metadata file.

    Reads every ``wikipedia_*.html.meta.json`` file in ``RAW_HTML_DIR`` and
    picks up its ``url`` field; files without that field are skipped.

    Returns:
        A list of ``{"source": "wikipedia", "url": ...}`` dicts, ordered by
        metadata file path.
    """
    pattern = os.path.join(RAW_HTML_DIR, "wikipedia_*.html.meta.json")
    urls = []
    # glob returns paths in arbitrary order; sort so the output is reproducible.
    for meta_path in sorted(glob.glob(pattern)):
        # Decode as UTF-8 explicitly instead of relying on the platform
        # default encoding.
        with open(meta_path, "r", encoding="utf-8") as f:
            meta = json.load(f)
        if "url" in meta:
            urls.append({"source": "wikipedia", "url": meta["url"]})
    return urls

In [4]:
def get_nasa_apod_urls():
    """Collect the image URLs recorded in each NASA APOD JSON file.

    Reads every ``nasa_apod_*.json`` file in ``RAW_JSON_DIR`` and picks up its
    ``url`` and ``hdurl`` fields, whichever are present.

    Returns:
        A list of ``{"source": "nasa", "url": ...}`` dicts, ordered by file
        path, with ``url`` before ``hdurl`` for each file.
    """
    pattern = os.path.join(RAW_JSON_DIR, "nasa_apod_*.json")
    urls = []
    # glob returns paths in arbitrary order; sort so the output is reproducible.
    for json_path in sorted(glob.glob(pattern)):
        # Decode as UTF-8 explicitly instead of relying on the platform
        # default encoding.
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        for field in ("url", "hdurl"):
            if field in data:
                urls.append({"source": "nasa", "url": data[field]})
    return urls

In [5]:
def get_arxiv_urls():
    """Collect the entry ids found in each arXiv Atom XML file.

    Parses every ``arxiv_*.xml`` file in ``RAW_XML_DIR`` and reads the
    ``<id>`` child of each Atom ``<entry>`` directly under the root element.
    Surrounding whitespace is stripped, and entries whose id is missing or
    empty are skipped so that no blank URL reaches the output.

    Returns:
        A list of ``{"source": "arxiv", "url": ...}`` dicts, ordered by file
        path and then by entry order within each file.
    """
    # Namespace map is the same for every file, so build it once.
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    pattern = os.path.join(RAW_XML_DIR, "arxiv_*.xml")
    urls = []
    # glob returns paths in arbitrary order; sort so the output is reproducible.
    for xml_path in sorted(glob.glob(pattern)):
        root = ET.parse(xml_path).getroot()
        for entry in root.findall("atom:entry", ns):
            # findtext yields the default for a missing <id>; an empty <id/>
            # has no text either, so both cases collapse to "".
            entry_id = (entry.findtext("atom:id", default="", namespaces=ns) or "").strip()
            if entry_id:
                urls.append({"source": "arxiv", "url": entry_id})
    return urls

In [6]:
# Gather the rows from every source, in the order: Wikipedia, NASA APOD, arXiv.
all_url_dicts = [
    *get_wikipedia_urls(),
    *get_nasa_apod_urls(),
    *get_arxiv_urls(),
]

In [7]:
# Pin the column set so the CSV always has a "source,url" header, even when
# no URLs were collected.
df = pd.DataFrame(all_url_dicts, columns=["source", "url"])

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df.to_csv(OUTPUT_FILE, index=False)
df.head()

Unnamed: 0,source,url
0,wikipedia,https://en.wikipedia.org/wiki/Perseverance_(ro...
1,wikipedia,https://en.wikipedia.org/wiki/Transformer
2,wikipedia,https://en.wikipedia.org/wiki/Astronomy
3,wikipedia,https://en.wikipedia.org/wiki/Hubble_Space_Tel...
4,wikipedia,https://en.wikipedia.org/wiki/NASA
