# File 01/00

# DESCRIPTION:
### This script downloads the primary source files (Old East Slavic and (Old) Church Slavonic) from the TOROT Treebank GitHub repository,
### along with the Modern Russian and English translations from the PROIEL Syntacticus dictionaries.
### If files already exist, they will be overwritten! 


# INPUT_FILES:
- ../source_data/treebank-releases-20180919/ # XML files (afnik.xml etc.)
- ../source_data/translations/orv.xml # Translations for the Old East Slavic words
- ../source_data/translations/chu.xml # Translations for the (Old) Church Slavonic words
# OUTPUT_FILE:
- ../source_data/treebank-releases-20180919/*xml
- ../source_data/translations/orv.xml
- ../source_data/translations/chu.xml

In [1]:
# library imports 
import os, pathlib, requests
import glob
import shutil
from pathlib import Path
import zipfile

## Download primary source files (i.e. treebank files, VERSION: 20180919)  

In [2]:
# Download the TOROT treebank release 20180919 as a ZIP from GitHub, unpack it,
# and move its contents into `target_dir`, overwriting anything already there.
# Temporary artefacts (the ZIP and the extraction directory) are created in the
# current working directory and are always removed again, even on failure.

# --- Config ---
url = "https://github.com/torottreebank/treebank-releases/archive/refs/tags/20180919.zip"
zip_name = "treebank_20180919.zip"
tmp_dir = "__tmp_treebank_20180919"

target_dir = "../source_data/treebank-releases-20180919"
os.makedirs(target_dir, exist_ok=True)

# A temp directory left over from an earlier, interrupted run would make the
# "single top-level folder" lookup below ambiguous, so start from a clean slate.
if os.path.isdir(tmp_dir):
    shutil.rmtree(tmp_dir)

try:
    # --- 1) Download ZIP ---
    # Without a timeout, requests waits forever on a stalled connection.
    # The value bounds the connect / per-read wait, not the total download time.
    r = requests.get(url, timeout=60)
    r.raise_for_status()

    with open(zip_name, "wb") as f:
        f.write(r.content)

    # --- 2) Unzip into temp directory ---
    with zipfile.ZipFile(zip_name, "r") as z:
        z.extractall(tmp_dir)

    # --- 3) Move extracted files to target directory ---
    # The archive is expected to contain exactly one top-level folder;
    # fail loudly instead of silently picking an arbitrary entry.
    top_level = os.listdir(tmp_dir)
    if len(top_level) != 1:
        raise RuntimeError(
            f"Expected exactly one top-level entry in {zip_name}, found: {top_level}"
        )
    extracted_root = os.path.join(tmp_dir, top_level[0])

    for item in os.listdir(extracted_root):
        src = os.path.join(extracted_root, item)
        dst = os.path.join(target_dir, item)
        # Existing files/directories of the same name are replaced.
        if os.path.isdir(dst):
            shutil.rmtree(dst)
        elif os.path.exists(dst):
            os.remove(dst)
        shutil.move(src, dst)
finally:
    # --- 4) Cleanup ---
    if os.path.exists(zip_name):
        os.remove(zip_name)
    shutil.rmtree(tmp_dir, ignore_errors=True)

# Drop the top-level *.conll and *.md files from the target directory.
for pattern in ("*.conll", "*.md"):
    for path in glob.glob(os.path.join(target_dir, pattern)):
        os.remove(path)

# Sanity checks: temp directory is gone, target exists and contains XML files.
assert not os.path.exists(tmp_dir), f"Temporary directory was not removed: {tmp_dir}"
assert os.path.isdir(target_dir), f"Target directory missing: {target_dir}"
assert glob.glob(os.path.join(target_dir, "*.xml")), "No XML files found"

## Translation files:
### orv.xml (Old Russian)
### chu.xml (Old Church Slavonic)

In [3]:
# Download the translation dictionaries and store them as
# <target_dir>/<key>.xml, overwriting any existing file of the same name.
urls = {
    "orv": "https://raw.githubusercontent.com/proiel/syntacticus-dictionaries/master/orv.xml",
    "chu": "https://raw.githubusercontent.com/proiel/syntacticus-dictionaries/master/chu.xml",
}

# Target directory
target_dir = "../source_data/translations"
os.makedirs(target_dir, exist_ok=True)

# Download and save
for key, url in urls.items():
    target_path = os.path.join(target_dir, f"{key}.xml")

    # Without a timeout, requests waits forever on a stalled connection.
    r = requests.get(url, timeout=60)
    r.raise_for_status()

    with open(target_path, "wb") as f:
        f.write(r.content)

# Sanity checks: every expected file must be present, not just "some XML file".
assert os.path.isdir(target_dir), f"Target directory missing: {target_dir}"
for key in urls:
    assert os.path.isfile(os.path.join(target_dir, f"{key}.xml")), (
        f"Missing translation file: {key}.xml"
    )