In [1]:
from pathlib import Path
import re
import zipfile
import io

In [2]:
# Root directory that get_cards() scans for card sub-directories and that
# list_zips()/process_zip() read the "<card>/files/*.zip" archives from.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
cards_dir = "/Users/ec2-user/Data/aozorabunko/cards"
# Root directory under which mkdir()/savefile() create "<card>/<zip stem>/<name>.txt".
dest_dir = "/Users/ec2-user/Data/aozorabunko-raw/"

In [10]:
def get_cards(cards_dir):
    """Return the names of the card directories directly under ``cards_dir``.

    A card directory is a sub-directory whose name consists of exactly six
    ASCII digits (e.g. ``"001655"``). Regular files and directories with any
    other name are ignored.

    Args:
        cards_dir: Path (``str`` or ``Path``) of the directory to scan.

    Returns:
        A list of directory names (not full paths), in the order the
        filesystem yields them.

    Raises:
        OSError: If ``cards_dir`` does not exist or cannot be listed.
    """
    return [
        sub_dir.name
        for sub_dir in Path(cards_dir).resolve().iterdir()
        # fullmatch, not match: match() only anchors at the start, so names
        # such as "0016550" or "001655_bak" would also be accepted.
        if sub_dir.is_dir() and re.fullmatch("[0-9]{6}", sub_dir.name)
    ]
# Names of every card directory found under cards_dir; later cells index into
# this list and pass it to process_all_cards().
aozora_cards = get_cards(cards_dir)

In [37]:
def list_zips(card):
    """List the ``.zip`` file names stored in ``<cards_dir>/<card>/files``.

    Args:
        card: Name of the card directory below the module-level ``cards_dir``.

    Returns:
        A list with the file name (not the full path) of every regular file
        whose suffix is ``.zip``, or ``None`` when the directory cannot be
        listed. In the failure case the exception is printed, not raised.
    """
    zip_names = []
    try:
        files_dir = Path(cards_dir).joinpath(card, "files").resolve()
        for entry in files_dir.iterdir():
            if entry.is_file() and entry.suffix == ".zip":
                zip_names.append(entry.name)
    except Exception as err:
        # Best effort: report the problem and let the caller skip this card.
        print(err)
        return None
    return zip_names

# Smoke test on the first discovered card. The list is named `aozora_cards`;
# the bare name `cards` is never assigned at module level in this notebook,
# so using it raises NameError on a fresh kernel.
zips = list_zips(aozora_cards[0])

In [45]:
def read_zip(file, encoding="shift_jis"):
    """Read every ``.txt`` member of a zip archive and decode it to ``str``.

    Args:
        file: Path of the zip archive on disk.
        encoding: Codec used to decode each member. Bytes that cannot be
            decoded are dropped (``errors="ignore"``).

    Returns:
        A list of ``(member_name, decoded_text)`` tuples, one per member whose
        suffix is ``.txt``, in archive order. The list is empty when the
        archive holds no such member. ``None`` is returned when the data is
        not a readable zip archive, so callers can skip it.

    Raises:
        OSError: If ``file`` itself cannot be opened or read.
    """
    with open(file, 'rb') as f:
        buf = f.read()
    try:
        # The context manager closes the archive even if a member fails to read.
        with zipfile.ZipFile(io.BytesIO(buf)) as z:
            # One pass over the members. An archive with no entries yields []
            # instead of leaving the result unbound.
            return [
                (info.filename, z.read(info).decode(encoding, errors="ignore"))
                for info in z.infolist()
                if Path(info.filename).suffix == ".txt"
            ]
    except Exception as e:
        # Best effort: a corrupt/unsupported archive is reported and skipped.
        print(f"failed to read zip {file}: {e}")
        return None

# Read the first archive of the first card as a sample. Indexes `aozora_cards`
# because the bare name `cards` is never assigned at module level in this
# notebook and would raise NameError on a fresh kernel.
files = read_zip(Path(cards_dir).joinpath(aozora_cards[0], "files", zips[0]))

In [46]:
def strip_text(text):
    """Reduce a decoded source text to its plain body.

    Steps, in order:
      1. Normalise ``\\r\\n`` line endings to ``\\n`` and replace the
         ideographic space (U+3000) with an ASCII space.
      2. If the text contains at least two runs of five or more hyphens,
         drop everything up to and including the second run
         (presumably the title/author header and the notation legend —
         TODO confirm against the source format).
      3. Drop everything from the first ``底本：`` onwards.
      4. Remove ``《…》`` and ``［＃…］`` spans (non-greedy, within one line).
      5. Strip leading/trailing whitespace.

    Args:
        text: The decoded text of one file.

    Returns:
        The cleaned text.
    """
    text = text.replace("\r\n", "\n")
    text = text.replace("\u3000", " ")
    # maxsplit=2 keeps everything after the second rule in one piece. Without
    # it, a further hyphen rule inside the body would cut the body short.
    splitted = re.split(r'\-{5,}', text, maxsplit=2)
    text = splitted[2] if len(splitted) > 2 else text
    text = re.split(r'底本：', text)[0]
    text = re.sub(r'《.+?》', '', text)
    text = re.sub(r'［＃.+?］', '', text)
    text = text.strip()
    return text

# Display the cleaned text of the first extracted file as a visual check
# (files[0] is a (filename, text) tuple returned by read_zip).
strip_text(files[0][1])

'三月の午後\n雪解けの土堤っ原で\n子供らが蕗のとうを摘んでいる\nやせこけたくびすじ\n血の気のない頬の色\n\nざるの中を覗き込んで\n淋しそうに微笑んだ少女の横顔のいたいたしさ\n\nおお、飢えと寒さの中に\n今も凶作地の子供達は\n熱心に蕗のとうを摘んでいる\n\n子供等よ！\nお前らの兄んちゃんは\n何をして警官に縛られたのか\n何の為に満洲へ送られて行ったのか\n姉さん達はどうして都会から帰って来たのか\nお前らは知ってるね\n何十年の間、お前らの父ちゃんから税金を捲きあげていた地主は\nお前らの生活を保証してくれたか？\nおまんまのかわりに\n苦がい蕗のとうを喰うお前らの小さな胸にも\n今は強い敵意が燃えている\n天災だと云って\n\nしらを切ったのはど奴だ！\n「困るのは小作だけでない」\nそう云った代議士（地主）の言葉にウソがなかったか\n子供等よ！ いつ地主の子供が\nお前等と一緒に蕗のとうを摘みに行ったか\nいつ、地主のお膳に\nぬか団子が転っていたか\n修身講話が次から次へとウソになって現れて来たいま\nおお お前らのあたまも「学校」から離れる\n\n北風の吹く夕暮れ\n母親は馬カゴのもち草を\n河っぷちで洗ってる\n子供らはざるを抱えて家路へ急ぐ\n背中の児は空腹を訴えて泣き\n背負った子供は寒さに震える\nだが、見るがよい\n水涕をたらした男の児等の面がまえを！\n児を背負った少女の瞳を！\nおお、凶作地の子供等よ！\nその顔に現れた反抗と憎悪をもって\n兄んちゃんのような強つい人間に成れ！\n苦がい蕗のとうのざるをほうり出して\n父ちゃんから税金を捲きあげた奴等に向って\nあったかい米のご飯を要求するんだ！\n\n（『プロレタリア文学』一九三二年二月号に発表）'

In [47]:
def mkdir(card, zip_file):
    """Create the output directory ``<dest_dir>/<card>/<zip stem>``.

    Missing parent directories are created as well, and an already existing
    directory is not an error.

    Args:
        card: Name of the card directory.
        zip_file: File name of the archive; its stem names the leaf directory.
    """
    target_dir = Path(dest_dir).joinpath(card, Path(zip_file).stem)
    target_dir.mkdir(parents=True, exist_ok=True)

def savefile(card, zip_file, filename, text):
    """Write ``text`` as UTF-8 to ``<dest_dir>/<card>/<zip stem>/<name>.txt``.

    Only the stem of ``filename`` is used, so any directory part or original
    suffix of the archive member name is discarded. The target directory must
    already exist (see ``mkdir``). A "saved ..." line is printed on success.

    Args:
        card: Name of the card directory.
        zip_file: File name of the archive the text came from.
        filename: Member name inside the archive.
        text: The text to write.
    """
    out_dir = Path(dest_dir).joinpath(card, Path(zip_file).stem)
    filepath = out_dir.joinpath(Path(filename).stem + ".txt").resolve()
    with filepath.open("w", encoding="utf8") as out:
        out.write(text)
        print(f"saved {filename} to {filepath}")

# Save the first extracted file of the first card as a sample. Indexes
# `aozora_cards` because the bare name `cards` is never assigned at module
# level in this notebook and would raise NameError on a fresh kernel.
mkdir(aozora_cards[0], zips[0])
savefile(aozora_cards[0], zips[0], files[0][0], strip_text(files[0][1]))

saved fukino_too_tsumu_kodomora.txt to /Users/ec2-user/Data/aozorabunko-raw/001655/54268_ruby_57883/fukino_too_tsumu_kodomora.txt


In [48]:
def process_zip(card, zip_file):
    """Extract, clean and save every text file of one archive.

    The output directory is created first. If the archive cannot be read
    (``read_zip`` returns ``None``) a "skipping" line is printed and nothing
    is saved. Otherwise every text is cleaned with ``strip_text`` before any
    of them is written with ``savefile``.

    Args:
        card: Name of the card directory.
        zip_file: File name of the archive inside ``<card>/files``.
    """
    mkdir(card, zip_file)
    zip_path = Path(cards_dir).joinpath(card, "files", zip_file)
    extracted = read_zip(zip_path)
    if extracted is None:
        print(f"skipping zip_file {card}/{zip_file}")
        return
    # Clean everything up front, then write.
    cleaned = []
    for member_name, raw_text in extracted:
        cleaned.append((member_name, strip_text(raw_text)))
    for member_name, body in cleaned:
        savefile(card, zip_file, member_name, body)

# Run the whole per-archive pipeline on the first archive of the first card.
# Indexes `aozora_cards` because the bare name `cards` is never assigned at
# module level in this notebook and would raise NameError on a fresh kernel.
process_zip(aozora_cards[0], zips[0])

saved fukino_too_tsumu_kodomora.txt to /Users/ec2-user/Data/aozorabunko-raw/001655/54268_ruby_57883/fukino_too_tsumu_kodomora.txt


In [49]:
def process_card(card):
    """Process every zip archive that belongs to one card.

    When the card's archive list cannot be obtained (``list_zips`` returns
    ``None``) a "skipping" line is printed and the card is left untouched.

    Args:
        card: Name of the card directory.
    """
    archive_names = list_zips(card)
    if archive_names is not None:
        for archive_name in archive_names:
            process_zip(card, archive_name)
        return
    print(f"skipping card {card}")

# Try the per-card pipeline on the first discovered card only.
process_card(aozora_cards[0])

saved fukino_too_tsumu_kodomora.txt to /Users/ec2-user/Data/aozorabunko-raw/001655/54268_ruby_57883/fukino_too_tsumu_kodomora.txt
saved oyajino_kotoba.txt to /Users/ec2-user/Data/aozorabunko-raw/001655/54264_ruby_57982/oyajino_kotoba.txt
saved reporter.txt to /Users/ec2-user/Data/aozorabunko-raw/001655/54269_ruby_57884/reporter.txt
saved hahae.txt to /Users/ec2-user/Data/aozorabunko-raw/001655/54266_ruby_57981/hahae.txt
saved shiroi_mano_te.txt to /Users/ec2-user/Data/aozorabunko-raw/001655/54265_ruby_57867/shiroi_mano_te.txt
saved hinnono_utaeru_shi.txt to /Users/ec2-user/Data/aozorabunko-raw/001655/54267_ruby_57866/hinnono_utaeru_shi.txt


In [50]:
def process_all_cards(cards):
    """Run ``process_card`` on each card, printing a banner per card.

    A closing "finished" line is printed after the last card.

    Args:
        cards: Iterable of card directory names.
    """
    for card in cards:
        header = f"#### processing card:{card}"
        print(header)
        process_card(card)
    print("#### finished #####")

# Dry run on the first two cards before starting the full conversion.
process_all_cards(aozora_cards[0:2])

#### processing card:001655
saved fukino_too_tsumu_kodomora.txt to /Users/ec2-user/Data/aozorabunko-raw/001655/54268_ruby_57883/fukino_too_tsumu_kodomora.txt
saved oyajino_kotoba.txt to /Users/ec2-user/Data/aozorabunko-raw/001655/54264_ruby_57982/oyajino_kotoba.txt
saved reporter.txt to /Users/ec2-user/Data/aozorabunko-raw/001655/54269_ruby_57884/reporter.txt
saved hahae.txt to /Users/ec2-user/Data/aozorabunko-raw/001655/54266_ruby_57981/hahae.txt
saved shiroi_mano_te.txt to /Users/ec2-user/Data/aozorabunko-raw/001655/54265_ruby_57867/shiroi_mano_te.txt
saved hinnono_utaeru_shi.txt to /Users/ec2-user/Data/aozorabunko-raw/001655/54267_ruby_57866/hinnono_utaeru_shi.txt
#### processing card:001231
saved sekaini_okeru_indo.txt to /Users/ec2-user/Data/aozorabunko-raw/001231/46297_txt_37292/sekaini_okeru_indo.txt
saved indono_seijin.txt to /Users/ec2-user/Data/aozorabunko-raw/001231/46296_txt_35652/indono_seijin.txt


In [51]:
# Full run: convert every card directory discovered under cards_dir.
process_all_cards(aozora_cards)

txt
saved watashino_sukina_hito.txt to /Users/ec2-user/Data/aozorabunko-raw/001288/47119_txt_28894/watashino_sukina_hito.txt
saved kokorono_shirabe.txt to /Users/ec2-user/Data/aozorabunko-raw/001288/47112_txt_27517/kokorono_shirabe.txt
saved otono_sekaini_ikiru.txt to /Users/ec2-user/Data/aozorabunko-raw/001288/47107_ruby_28769/otono_sekaini_ikiru.txt
saved watashino_wakaikoro.txt to /Users/ec2-user/Data/aozorabunko-raw/001288/47120_txt_27519/watashino_wakaikoro.txt
saved junsuino_koe.txt to /Users/ec2-user/Data/aozorabunko-raw/001288/47114_txt_28891/junsuino_koe.txt
saved koeto_shokumotsu.txt to /Users/ec2-user/Data/aozorabunko-raw/001288/47109_ruby_28888/koeto_shokumotsu.txt
#### processing card:000168
saved nanboku.txt to /Users/ec2-user/Data/aozorabunko-raw/000168/50759_ruby_45904/nanboku.txt
saved hino_tsuita_tabako.txt to /Users/ec2-user/Data/aozorabunko-raw/000168/59147_ruby_66502/hino_tsuita_tabako.txt
saved onmi_ruby.txt to /Users/ec2-user/Data/aozorabunko-raw/000168/908_ruby/