### I. Извлечение архивов

In [None]:
import os
import subprocess
from urllib.parse import urlparse

# Root directory; archive_site() creates one sub-folder per domain inside it.
base_dir = "/Users/vasilii/Documents/Python"

# Start URLs of the sites to archive.
sites = [
    "https://nginx.org/en/docs/",
    "https://curl.se/docs/",
    "https://alpinejs.dev/start-here",
    "https://www.lua.org/manual/5.4/",
    "https://doc.rust-lang.org/book/",
]

def archive_site(url):
    """Crawl *url* with wpull and record the crawl as a WARC archive.

    The WARC file, the wpull log and the wpull database are written to a
    folder named after the site's domain inside ``base_dir``; the folder is
    created if it does not exist yet.

    Args:
        url: Start URL of the crawl.

    Returns:
        True if wpull exited with status 0, False otherwise (including the
        case where wpull could not be started at all).
    """
    parsed = urlparse(url)
    domain = parsed.netloc
    # Drop only a leading "www."; str.replace() would also mangle hosts that
    # merely contain that substring (e.g. "awww.example.com").
    if domain.startswith("www."):
        domain = domain[len("www."):]
    domain_path = os.path.join(base_dir, domain)

    os.makedirs(domain_path, exist_ok=True)

    cmd = [
        "wpull", url,
        "--strip-session-id",
        "--no-check-certificate",
        "--no-robots",
        "--page-requisites",
        "--inet4-only",
        "--timeout", "20",
        "--tries", "3",
        "--waitretry", "2",
        "--recursive",
        "--no-parent",
        "--level", "2",
        "--domains", domain,
        "--span-hosts-allow", "page-requisites",
        "--reject-regex", ".*(download|fossil).*",
        "--retry-connrefused",
        "--retry-dns-error",
        "--delete-after",
        "--warc-append",
        "--warc-cdx",
        "-U", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:101.0) Gecko/20100101 Firefox/101.0",
        "-d", "-a", f"{domain_path}/{domain}.log",
        "--database", f"{domain_path}/sitearchive-{domain}.db",
        "--warc-file", f"{domain_path}/{domain}",
        "--warc-header", "operator: No Name",
        "--warc-header", "downloaded-by: Student",
        "--concurrent", "6",
        "--wait", "0.1",
    ]

    print(f"\n{'='*60}")
    print(f"url: {url}")

    try:
        result = subprocess.run(cmd)
    except OSError as exc:
        # wpull is missing or not executable: report this site as failed
        # instead of aborting the archiving of the remaining sites.
        print(f"failed: could not start wpull ({exc})")
        return False

    succeeded = result.returncode == 0
    print("success" if succeeded else "failed")
    return succeeded

# Archive the sites one after another; maps each start URL to True/False
# depending on whether its crawl succeeded.
results = dict(zip(sites, map(archive_site, sites)))



url: https://nginx.org/en/docs/
success

url: https://doc.rust-lang.org/book/
success


### II. Оценка ArchiveReady

In [None]:
import requests
import json

from urllib.parse import quote

# ArchiveReady API endpoint; the page to evaluate is appended as the value
# of the `url` query parameter.
url = "https://archiveready.com/api?url="
websites = [
    'https://alpinejs.dev/start-here',
    'https://curl.se/docs/',
    'https://doc.rust-lang.org/book/',
    'https://nginx.org/en/docs/',
    'https://www.lua.org/manual/5.4/'
]

# Fetch the ArchiveReady evaluation of every site and save each response as
# archive_ready_<site>.json in the current working directory.
for site in websites:
    try:
        # ":" and "/" stay literal (so plain URLs are sent exactly as before),
        # while characters such as "?", "&" and "#" are escaped so they cannot
        # be mistaken for part of the API's own query string.
        response = requests.get(url + quote(site, safe=":/"), timeout=120)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # One unreachable site or malformed response must not prevent the
        # remaining sites from being evaluated.
        print(f"failed: {site} ({exc})")
        continue

    filename = 'archive_ready_' + site.replace('https://', '').replace('/', '_').rstrip('_') + '.json'

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)

    print(f"saved: {filename}")

Saved: archive_ready_alpinejs.dev_start-here.json
Saved: archive_ready_curl.se_docs.json
Saved: archive_ready_doc.rust-lang.org_book.json
Saved: archive_ready_nginx.org_en_docs.json
Saved: archive_ready_www.lua.org_manual_5.4.json


### III. Метаданные metawarc

In [48]:
import json
import subprocess
from pathlib import Path

# Directory whose immediate sub-folders (one per domain) are scanned below
# for *.warc.gz archives and archive_ready_*.json files.
base_path = Path("/Users/vasilii/Documents/study/webarchive/HW3/wpull_archives")

def get_archiveready_data(domain_folder):
    """Load the ArchiveReady JSON report stored in *domain_folder*.

    Args:
        domain_folder: ``pathlib.Path`` of the folder to search for files
            named ``archive_ready_*.json``.

    Returns:
        The parsed JSON content of the matching file, or None when the
        folder contains no such file. If several files match, the one whose
        path sorts first is used.
    """
    # glob() yields matches in arbitrary order; sorting makes the choice of
    # file reproducible when more than one report is present.
    json_files = sorted(domain_folder.glob("archive_ready_*.json"))
    if not json_files:
        return None
    with open(json_files[0], 'r', encoding='utf-8') as f:
        return json.load(f)

def calculate_mean(test_data):
    """Return the average of the four ArchiveReady category scores.

    Args:
        test_data: Mapping that may contain the keys ``Accessibility``,
            ``Cohesion``, ``Metadata`` and ``Standards_Compliance``.

    Returns:
        The mean of the four scores rounded to two decimals. A category
        missing from *test_data* is counted as 0.
    """
    categories = ('Accessibility', 'Cohesion', 'Metadata', 'Standards_Compliance')
    total = 0
    for category in categories:
        total += test_data.get(category, 0)
    return round(total / len(categories), 2)

def run_metawarc(warc_path, timeout=30):
    """Run ``metawarc analyze``, ``metadata`` and ``index`` on a WARC file.

    Args:
        warc_path: ``pathlib.Path`` of the WARC file to process.
        timeout: Maximum number of seconds each metawarc command may run.

    Returns:
        A dict keyed by command name (``analyze``, ``metadata``, ``index``).
        Each value is a dict with:

        * ``status`` - ``success`` (exit code 0), ``failed`` (non-zero exit
          code) or ``error`` (the command could not be run or timed out);
        * ``output`` - the command's stdout, followed by its stderr when the
          command failed, or the exception text for ``error``;
        * ``file`` (``metadata`` only) - name of the metadata output file,
          ``not created`` if it does not exist, or ``error``.
    """
    results = {}
    metadata_output_path = warc_path.parent / f"{warc_path.stem}_metadata.jsonl"

    for cmd in ["analyze", "metadata", "index"]:
        cmd_args = ["metawarc", cmd, str(warc_path)]
        if cmd == "metadata":
            cmd_args.extend(["-o", str(metadata_output_path)])

        try:
            result = subprocess.run(
                cmd_args,
                capture_output=True,
                text=True,
                timeout=timeout
            )
        except (OSError, subprocess.SubprocessError, ValueError) as e:
            # OSError: metawarc missing or not executable; SubprocessError:
            # timeout expired; ValueError: output could not be decoded.
            results[cmd] = {'status': 'error', 'output': str(e)}
            if cmd == "metadata":
                results[cmd]['file'] = 'error'
            continue

        output = result.stdout
        if result.returncode != 0 and result.stderr:
            # Keep the diagnostics: without stderr a failed command would
            # show up in the report with an empty output block.
            output = f"{output}\n{result.stderr}" if output else result.stderr

        results[cmd] = {
            'status': 'success' if result.returncode == 0 else 'failed',
            'output': output
        }

        if cmd == "metadata":
            results[cmd]['file'] = metadata_output_path.name if metadata_output_path.exists() else 'not created'

    return results

# Build one Markdown section per domain folder (ArchiveReady scores plus the
# output of the metawarc commands) and write them all to
# warc_archive_report.md in the current working directory.
markdown_output = []

# iterdir() yields entries in arbitrary order; sorting keeps the order of the
# report sections stable between runs.
domain_folders = sorted(f for f in base_path.iterdir() if f.is_dir())

for domain_folder in domain_folders:
    domain = domain_folder.name
    # glob() order is arbitrary as well; sorting makes "the first WARC file"
    # a reproducible choice when a folder holds several archives.
    warc_files = sorted(domain_folder.glob("*.warc.gz"))

    # Folders without a WARC archive do not get a report section.
    if not warc_files:
        continue

    warc_path = warc_files[0]
    size_mb = round(warc_path.stat().st_size / (1024**2), 2)

    sections = [
        f"# Архив сайта {domain}\n\n",
        "=== Комментарий о сайте ===\n\n",
        "## ArchiveReady результаты\n\n",
    ]

    ar_data = get_archiveready_data(domain_folder)
    if ar_data and 'test' in ar_data:
        test_data = ar_data['test']
        sections += [
            f"- **Accessibility**: {test_data.get('Accessibility', 'N/A')}\n",
            f"- **Cohesion**: {test_data.get('Cohesion', 'N/A')}\n",
            f"- **Metadata**: {test_data.get('Metadata', 'N/A')}\n",
            f"- **Standards Compliance**: {test_data.get('Standards_Compliance', 'N/A')}\n",
            f"- **Overall Rating**: {calculate_mean(test_data)}\n\n",
        ]
    else:
        sections.append("*Данные не найдены*\n\n")

    sections += [
        "## Metawarc анализ\n\n",
        f"**Файл**: {warc_path.name} ({size_mb} MB)\n\n",
    ]

    metawarc_results = run_metawarc(warc_path)

    sections += [
        "### ANALYZE\n\n",
        f"**Статус**: {metawarc_results['analyze']['status']}\n\n",
        f"```\n{metawarc_results['analyze']['output']}\n```\n\n",
        "### METADATA\n\n",
        f"**Статус**: {metawarc_results['metadata']['status']}\n",
        f"**Файл**: {metawarc_results['metadata'].get('file', 'N/A')}\n\n",
        f"```\n{metawarc_results['metadata']['output']}\n```\n\n",
        "### INDEX\n\n",
        f"**Статус**: {metawarc_results['index']['status']}\n\n",
        f"```\n{metawarc_results['index']['output']}\n```\n\n",
        "В результате работы команды на основе метаданных была создана база данных (файл metawarc.db), которая может использоваться для реализации других команд, например, stats или dump.\n\n",
        "---\n\n",
    ]

    md = "".join(sections)
    markdown_output.append(md)

with open("warc_archive_report.md", "w", encoding="utf-8") as f:
    f.write("\n".join(markdown_output))

# Number of domain sections written to the report.
print(len(markdown_output))

5
