### Download data

- Download the dataset from https://www.kaggle.com/datasets/michaelarman/poemsdataset/code
- Unpack the downloaded zip into the `./data/poems` folder, so that the poems read by this notebook end up under `./data/poems/forms/abc`


In [None]:
import typing as tp
import os
from pathlib import Path

import eng_to_ipa as ipa

In [None]:
# Folder the source poems are read from.
poems_path = './data/poems/forms/abc'
# Folder the converted poems are written to.
output_path = './data/poems/phonetic'

### Read

In [None]:
def read_abc_poems(_filepath: str, _limit: tp.Optional[int] = 3) -> tp.Dict[str, tp.List[str]]:
    """Read poem files from a folder into a dict of line lists.

    Args:
        _filepath: Folder whose entries are opened as text files.
        _limit: Maximum number of files to read, taken from the start of the
            alphabetically sorted listing. Pass ``None`` to read every file.
            Expected to be non-negative. Defaults to 3.

    Returns:
        A dict mapping each file name to the list of lines of that file
        (line endings removed), in sorted file-name order.
    """
    _poems = {}
    # os.listdir() returns names in arbitrary order; sorting makes the
    # selection of the first `_limit` files reproducible between runs.
    for filename in sorted(os.listdir(_filepath))[:_limit]:
        _full_path = os.path.join(_filepath, filename)
        # 'utf-8-sig' drops a leading byte-order mark if the file has one.
        with open(_full_path, 'r', encoding='utf-8-sig') as f:
            _poems[filename] = f.read().splitlines()
    return _poems

def print_poem(_poems: dict, _poem_number: int):
    """Print one poem, one line of text per output line.

    Args:
        _poems: Dict whose values are poems, each a list of text lines.
        _poem_number: Position of the poem among the dict's values.
    """
    _selected = list(_poems.values())[_poem_number]
    _text = '\n'.join(_selected)
    print(_text)

# Load the poems from the input folder and print the first one as a quick check.
poems_raw = read_abc_poems(poems_path)
print_poem(poems_raw, 0)

### Prepare


In [None]:
def validate_line(_line: str) -> bool:
    """Tell whether a poem line should be kept.

    Args:
        _line: A single line of poem text.

    Returns:
        False when the line begins with '-' or '*', True for any other line.
    """
    # TODO: only the two leading markers are checked so far; the original
    # flags this validation as unfinished.
    return not _line.startswith(('-', '*'))

def preprocess(_poem: tp.List[str]) -> tp.List[str]:
    """Filter a poem's lines and lower-case the ones that remain.

    Args:
        _poem: The poem as a list of text lines.

    Returns:
        A new list holding the lower-cased version of every line accepted by
        ``validate_line``, in the original order.
    """
    return [_text.lower() for _text in _poem if validate_line(_text)]

# Preprocess every loaded poem (keys stay the same), then print the first result.
poems_processed = {title: preprocess(poem) for title, poem in poems_raw.items()}
print_poem(poems_processed, 0)

### Convert into phonetic transcription

In [None]:
def to_ipa(_poem: tp.List[str]) -> tp.List[str]:
    """Run every line of a poem through ``ipa.convert``.

    Args:
        _poem: The poem as a list of text lines.

    Returns:
        A new list with one converted entry per input line, in the same order.
    """
    _converted = []
    for _text in _poem:
        # `ipa` is the eng_to_ipa module imported at the top of the notebook.
        _converted.append(ipa.convert(_text))
    return _converted

# Transcribe the preprocessed poems rather than the raw ones; otherwise the
# filtering and lower-casing done in the Prepare step would have no effect on
# the output.
poems_phonetically = {title: to_ipa(poem) for title, poem in poems_processed.items()}
print_poem(poems_phonetically, 0)

### Save

In [None]:
def save_poems(_poems: tp.Dict[str, tp.List[str]], _output_path: str):
    """Write each poem to its own file inside ``_output_path``.

    The output folder (including missing parents) is created if needed. Each
    file is named after the poem's dict key and holds the poem's lines joined
    with newlines, without a trailing newline, encoded as 'utf-8-sig'.

    Args:
        _poems: Dict mapping a file name to the poem's list of text lines.
        _output_path: Folder to write the files into.
    """
    # Creating the folder does not depend on the poem, so do it once up front.
    _target_dir = Path(_output_path)
    _target_dir.mkdir(parents=True, exist_ok=True)
    for _poem_title, _poem_text in _poems.items():
        with open(_target_dir / _poem_title, 'w', encoding='utf-8-sig') as f:
            # write(), not writelines(): the joined poem is one string, and
            # writelines() would iterate over it character by character.
            f.write('\n'.join(_poem_text))
    print("Poems saved")

# Write the converted poems to the output folder, one file per poem.
save_poems(poems_phonetically, output_path)
