In [None]:
import csv
import functools
import os
from collections import namedtuple
from pprint import pprint

import regex as re


In [None]:
def resolve_filename(filename):
    """Expand a leading ``~`` in ``filename`` to the user's home directory.

    Names that do not start with ``~`` are handed back unchanged.
    """
    if not filename.startswith("~"):
        return filename
    return os.path.expanduser(filename)
    
def with_resolution(function):
    """Decorator that resolves the filename argument before calling ``function``.

    The first argument of the decorated callable is passed through
    ``resolve_filename``; every other positional and keyword argument is
    forwarded untouched.

    Args:
        function: Callable whose first parameter is a filename.

    Returns:
        A wrapper that accepts ``(filename, *args, **kwargs)`` and returns
        whatever ``function`` returns. The wrapper carries the wrapped
        function's ``__name__``, ``__doc__`` and ``__wrapped__`` attributes.
    """
    # functools.wraps keeps the decorated function's metadata, so help(),
    # tracebacks and introspection show the real function rather than
    # "resolution_function".
    @functools.wraps(function)
    def resolution_function(filename, *args, **kwargs):
        resolved_name = resolve_filename(filename)
        return function(resolved_name, *args, **kwargs)
    return resolution_function

@with_resolution
def read_all_lines(filename, delete_trailing_carriage_returns=True):
    """Read a text file and return its lines as a list of strings.

    Args:
        filename: Path of the file to read. A leading ``~`` is expanded by
            the ``with_resolution`` decorator before the file is opened.
        delete_trailing_carriage_returns: When true (the default), the
            trailing newline is removed from each line. When false, the
            lines are returned exactly as ``readlines()`` produced them.

    Returns:
        list[str]: The lines of the file, in order.
    """
    with open(filename, mode="rt") as fp:
        lines = fp.readlines()

    if not delete_trailing_carriage_returns:
        return lines

    # The last line of a file may have no terminating newline; only strip
    # the final character when it really is a newline so that no content
    # character is lost.
    return [line[:-1] if line.endswith("\n") else line for line in lines]
        
def remove_empty_and_whitespace_lines(lines):
    """Return the entries of ``lines`` that contain a non-whitespace character.

    Order is preserved and the kept entries are returned unmodified (they
    are not stripped).
    """
    kept = []
    for text in lines:
        if text.strip():
            kept.append(text)
    return kept

def write_csv(filename, fieldnames, rows):
    """Write ``rows`` to ``filename`` as CSV with a header line.

    Args:
        filename: Path of the file to create or overwrite.
        fieldnames: Column names, in output order; also written as the header.
        rows: Iterable of dicts keyed by the names in ``fieldnames``.
    """
    # newline="" lets the csv module control the line terminators itself.
    with open(filename, mode="w", newline="") as csv_file:
        dict_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        for record in rows:
            dict_writer.writerow(record)
        
@with_resolution
def change_ext(filename, new_ext):
    """Return ``filename`` with its extension replaced by ``new_ext``.

    Args:
        filename: Path whose extension should be replaced. A leading ``~``
            is expanded by the ``with_resolution`` decorator.
        new_ext: The new extension, with or without a leading dot
            (``"csv"`` and ``".csv"`` give the same result). An empty
            string removes the extension.

    Returns:
        str: The path with the new extension applied.
    """
    filename_wo_ext, _ = os.path.splitext(filename)
    if not new_ext:
        # An empty extension has no first character to inspect; treat it as
        # a request to drop the extension rather than raising IndexError.
        return filename_wo_ext
    if new_ext.startswith("."):
        return f"{filename_wo_ext}{new_ext}"
    return f"{filename_wo_ext}.{new_ext}"

In [None]:
# Input file to parse. The leading "~" is expanded by the functions decorated
# with @with_resolution (read_all_lines, change_ext) when the path is used.
TO_BE_PARSED = "~/temp/routledge-handbook-to-music-under-german-occuptation.txt"

# Section heading line: "Section", one whitespace character, the section
# number (digits), one whitespace character, then the section title.
SECTION_RE = re.compile(r"^Section\s(?P<section_number>\d+)\s" 
                        r"(?P<section_title>.+)")
# Chapter title line: matches the text that follows a leading run of digits
# and whitespace. NOTE: the look-behind is variable-width, so this pattern
# relies on the third-party `regex` package imported here as `re`; the
# standard-library `re` module only accepts fixed-width look-behinds.
CHAPTER_TITLE_RE = re.compile(r"(?<=^\d+\s+).+")

# One parsed row; the field names are also used as the CSV header.
ParsedFile = namedtuple("ParsedFile", "author title section_number section_title")

def match_section(line):
    """Search ``line`` with ``SECTION_RE``.

    Returns the match object (with ``section_number`` and ``section_title``
    groups) or ``None`` when the line is not a section heading.
    """
    return SECTION_RE.search(line)

def match_chapter_title(line):
    """Search ``line`` with ``CHAPTER_TITLE_RE``.

    Returns the match object, whose group 0 is the text after the leading
    digits and whitespace, or ``None`` when the pattern does not match.
    """
    return CHAPTER_TITLE_RE.search(line)

def parse_file(TO_BE_PARSED):
    """Yield ``ParsedFile`` rows parsed from a text file.

    Blank and whitespace-only lines are discarded first. Of the remaining
    lines, the first is used as the title and the second as the author of
    the initial row, which has no section information. Every later line is
    classified as one of:

    * a section heading (``match_section``), which updates the current
      section number and title;
    * a chapter title (``match_chapter_title``), which updates the current
      chapter title;
    * anything else, which is taken as an author line and yields a row
      combining that author with the current chapter title and section.

    Args:
        TO_BE_PARSED: Path of the file to parse. The parameter shadows the
            module-level constant of the same name inside this function.

    Yields:
        ParsedFile: One row for the file header, then one row per author
        line. Chapter and section fields are ``None`` until the
        corresponding heading has been seen.

    Raises:
        IndexError: If the file has fewer than two non-blank lines (raised
            when the generator is first advanced).
    """
    lines = remove_empty_and_whitespace_lines(read_all_lines(TO_BE_PARSED))
    yield ParsedFile(lines[1], lines[0], None, None)

    # Start with no section/chapter so that an author line appearing before
    # any heading yields None fields instead of raising UnboundLocalError.
    current_section_number = None
    current_section_title = None
    current_chapter_title = None

    for line in lines[2:]:
        section_match = match_section(line)
        if section_match:
            current_section_number = section_match["section_number"]
            current_section_title = section_match["section_title"]
            continue

        chapter_title_match = match_chapter_title(line)
        if chapter_title_match:
            current_chapter_title = chapter_title_match[0]
            continue

        # Neither a section heading nor a chapter title: an author line.
        yield ParsedFile(line,
                         current_chapter_title,
                         current_section_number,
                         current_section_title)
        
# parse_file is a generator function, so this only creates the generator;
# the file is not read until the generator is consumed below.
result = parse_file(TO_BE_PARSED)

# Materialise every row so the list can be displayed here and reused later.
rows = list(result)

pprint(rows)

In [None]:
# Write the parsed rows to a CSV file whose path is the input path with its
# extension replaced by "csv"; the namedtuple field names form the header.
write_csv(change_ext(TO_BE_PARSED, "csv"), ParsedFile._fields, [row._asdict() for row in rows])