In [13]:
from pathlib import Path

# Only gaps strictly above this page id are reported.
# NOTE(review): presumably ids up to this value were already checked in an
# earlier run -- confirm before changing it.
GAP_THRESHOLD = 10471


def find_gaps(sorted_ids):
    """Return the integers missing from a sorted sequence of ids.

    Args:
        sorted_ids: Integers in ascending order (may be empty).

    Returns:
        Every integer between the first and last element (inclusive) that
        does not occur in ``sorted_ids``, in ascending order. An empty
        input yields an empty list.
    """
    if not sorted_ids:
        # Nothing on disk yet: there is no range to scan.
        return []
    # Set membership keeps the scan linear instead of quadratic.
    present = set(sorted_ids)
    return [n for n in range(sorted_ids[0], sorted_ids[-1] + 1) if n not in present]


files = list(Path('../dump').glob('*.wiki'))

# Each dump file is named "<number>.wiki"; collect those numbers in order.
numbers = sorted(int(f.stem) for f in files)

# Numbers absent from the otherwise contiguous range of file names.
gaps = find_gaps(numbers)

# Keep only the gaps above the threshold, then show them.
new_gaps = [g for g in gaps if g > GAP_THRESHOLD]

print(new_gaps)


[10851, 10852, 10853, 10854, 10855, 10856, 10857, 10858, 10859, 10860, 10861, 10862, 10863, 10864, 10865, 10866, 10867, 10868, 10869, 10870, 10871, 10872, 10873, 10874, 10875, 10876, 10877, 10878, 10879, 10880, 10881, 10882, 10883, 10884, 10885, 10886, 10887, 10888, 10889, 10890, 10891, 10892, 10893, 10894, 10895, 10896, 10897, 10898, 10899, 10900, 10901, 10902, 10903, 10904, 10905, 10906, 10907, 10908, 10909, 10910, 10911, 10912, 10913, 10914, 10915, 10916, 10917, 10918, 10919, 10920, 10921, 10922, 10923, 10924, 10925, 10926, 10927, 10928, 10929, 10930, 10931, 10932, 10933, 10934, 10935, 10936, 10937, 10938, 10939, 10940, 10941, 10942, 10943, 10944, 10945, 10946, 11260, 11845, 12156, 12904, 13212, 13282, 13348, 13573, 14780, 16014]


In [11]:
import time
import math
from pathlib import Path
from atwiki import AtWikiAPI, AtWikiURI
from urllib.error import HTTPError

REQUEST_INTERVAL = 8  # passed to AtWikiAPI as its `sleep` argument (presumably seconds between requests)
MAX_RETRIES = 5       # maximum download attempts per page
RETRY_WAIT = 480      # seconds to pause before retrying after a transient HTTP error
# HTTP statuses worth retrying; anything else (e.g. 404) is skipped at once
# so a permanently failing page does not stall the run for MAX_RETRIES waits.
RETRYABLE_STATUS = frozenset({429, 500, 502, 503, 504})
OUTPUT_DIR = Path("../dump")
OUTPUT_DIR.mkdir(exist_ok=True)

api = AtWikiAPI(
    AtWikiURI('https://w.atwiki.jp/hmiku/'),
    sleep=REQUEST_INTERVAL
)


def download_page(page_id, page_name, filename):
    """Fetch one page's source and store it at ``filename``.

    The source is written to a sibling ``*.part`` file first and then moved
    into place, so an interrupted or failed write never leaves a truncated
    ``<id>.wiki`` that later runs would skip as "already downloaded".

    Args:
        page_id: Page identifier passed to ``api.get_source``.
        page_name: Page name, used only in the progress message.
        filename: ``Path`` of the final output file.

    Returns:
        True if the page was saved, False if it was skipped after an error.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            source = api.get_source(page_id)
            partial = filename.with_name(filename.name + ".part")
            partial.write_text(source, encoding='utf-8')
            partial.replace(filename)
            print(f"Downloaded page {page_id}: {page_name}")
            return True
        except HTTPError as e:
            print(f"Failed to download page {page_id} (attempt {attempt}): {e}")
            if e.code not in RETRYABLE_STATUS:
                # Not a transient failure: retrying would only waste time.
                return False
            if attempt < MAX_RETRIES:
                print(f"Waiting {RETRY_WAIT} seconds before retrying...")
                time.sleep(RETRY_WAIT)
        except Exception as e:
            # Best effort: report and move on to the next page.
            print("Unknown error", e)
            return False

    print(f"Failed to download page {page_id} after {MAX_RETRIES} attempts. Skipping.")
    return False


# `new_gaps` comes from the gap-detection cell, which must have been run first.
if not new_gaps:
    raise ValueError("new_gaps is empty: there is no gap to start downloading from.")

start_at = new_gaps[0]
# NOTE(review): this presumably maps a page id to a listing offset of about
# 100 entries per step, backing up by one -- confirm against the atwiki
# `get_list` implementation. The recorded run started listing at page 5933
# for a first gap of 10851, so the mapping may not hold.
start_from = math.floor(start_at / 100) - 1


print("Fetching the list of pages...")
page_list = api.get_list(_start=start_from)

for page in page_list:
    page_id = page['id']
    page_name = page['name']
    filename = OUTPUT_DIR / f"{page_id}.wiki"

    if filename.exists():
        print(f"Page {page_id} already downloaded. Skipping.")
        continue

    download_page(page_id, page_name, filename)


Fetching the list of pages...
Page 5933 already downloaded. Skipping.
Page 5934 already downloaded. Skipping.
Page 5935 already downloaded. Skipping.
Page 5936 already downloaded. Skipping.
Page 5937 already downloaded. Skipping.
Page 5938 already downloaded. Skipping.
Page 5939 already downloaded. Skipping.
Page 5940 already downloaded. Skipping.
Page 5941 already downloaded. Skipping.
Page 5942 already downloaded. Skipping.
Page 5943 already downloaded. Skipping.
Page 5944 already downloaded. Skipping.
Page 5945 already downloaded. Skipping.
Page 5946 already downloaded. Skipping.
Page 5947 already downloaded. Skipping.
Page 5948 already downloaded. Skipping.
Page 5949 already downloaded. Skipping.
Page 5950 already downloaded. Skipping.
Page 5951 already downloaded. Skipping.
Page 5952 already downloaded. Skipping.
Page 5953 already downloaded. Skipping.
Page 5954 already downloaded. Skipping.
Page 5955 already downloaded. Skipping.
Page 5956 already downloaded. Skipping.
Page 5957 

KeyboardInterrupt: 