### Download data

- Download the dataset from https://www.kaggle.com/datasets/michaelarman/poemsdataset/code
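- Unpack the downloaded zip into the `./data/poems` folder, so that the ABC poems are available under `./data/poems/forms/abc` (the path this notebook reads from)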


In [1]:
import typing as tp
import os
from pathlib import Path

import eng_to_ipa as ipa

In [2]:
# Directory that read_abc_poems lists and reads poem files from.
poems_path = './data/poems/forms/abc'
# Directory that save_poems writes the converted poems into.
output_path = './data/poems/phonetic'

### Read

In [3]:
def read_abc_poems(_filepath: str, _limit: tp.Optional[int] = 3) -> tp.Dict[str, tp.List[str]]:
    """Read poem files from a directory into a ``{filename: lines}`` mapping.

    Args:
        _filepath: Directory containing one text file per poem.
        _limit: Maximum number of poem files to read. Defaults to 3 (the
            sample size this notebook previews); pass ``None`` to read
            every file in the directory.

    Returns:
        A dict keyed by file name whose values are the file's lines with
        line terminators removed.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
    """
    # os.listdir returns entries in arbitrary order, so sort them to make the
    # selected subset reproducible between runs and machines.
    _filenames = [
        _name
        for _name in sorted(os.listdir(_filepath))
        # Skip sub-directories and other non-file entries: open() would fail on them.
        if os.path.isfile(os.path.join(_filepath, _name))
    ]
    if _limit is not None:
        _filenames = _filenames[:_limit]

    _poems = {}
    for filename in _filenames:
        _full_path = os.path.join(_filepath, filename)
        # 'utf-8-sig' transparently drops a leading byte-order mark if present.
        with open(_full_path, 'r', encoding='utf-8-sig') as f:
            _poems[filename] = f.read().splitlines()
    return _poems

def print_poem(_poems: dict, _poem_number: int):
    """Print the poem found at position ``_poem_number`` of the mapping.

    Poems are addressed by their position in the dict's value order; the
    chosen poem's lines are printed joined by newlines.
    """
    _all_texts = list(_poems.values())
    _chosen = _all_texts[_poem_number]
    _rendered = '\n'.join(_chosen)
    print(_rendered)

# Load the poems from poems_path and print the first one as a quick check.
poems_raw = read_abc_poems(poems_path)
print_poem(poems_raw, 0)

2 ABC of H.k. and China revised vision.
Barrels tears are wines and salts.
With a whisk on goody tails!
Wiggle maces to fix the heads.
Heads in jack on boxes are ceased.
Cry to paranoid truly bosses.
Bosses are jokers take your boys.
Studs are bogs with fire apples.
True predicates worth cases.’
Descents wash in badly bands.
Wholly sales are smart with cats.
Who got tenth honors in China?
Homage grand to play and plays!
Trim the times of hearts then cry.
Tanks in steels but voice wail.
Bossy dragged by tails that whisked.
Go very timid and love the wise.
Hands are lent but laws are ends.
Cases on courts are borrowed lands.
Length long with treads to retch!
Straps on times and watch here.
Arrays tanks but all are men.
Cross all suctions steal the ends.
Cave on minds are cages on objects.
Rouser rockets powers holes.
Confine curses to stop our wounds.
Whirl your bodies and jump on grounds.
Crouch of soldiers after kicks with flings.
Block one leg and hit the middle.
Cauchy3 know the tric

### Prepare


In [4]:
def validate_line(_line: str) -> bool:
    """Decide whether a poem line should be kept.

    A line is rejected when it begins with '-' or '*'; any other line
    (including an empty one) is accepted.
    """
    # TODO: further validation rules are still to be added.
    return not _line.startswith(('-', '*'))

def preprocess(_poem: tp.List[str]) -> tp.List[str]:
    """Return the lower-cased lines of a poem that pass ``validate_line``.

    Lines rejected by ``validate_line`` are dropped; the remaining lines
    keep their original order.
    """
    return [_row.lower() for _row in _poem if validate_line(_row)]

# Run preprocess over every poem, keeping the same keys, then print the first result.
poems_processed = {title: preprocess(poem) for title, poem in poems_raw.items()}
print_poem(poems_processed, 0)

2 abc of h.k. and china revised vision.
barrels tears are wines and salts.
with a whisk on goody tails!
wiggle maces to fix the heads.
heads in jack on boxes are ceased.
cry to paranoid truly bosses.
bosses are jokers take your boys.
studs are bogs with fire apples.
true predicates worth cases.’
descents wash in badly bands.
wholly sales are smart with cats.
who got tenth honors in china?
homage grand to play and plays!
trim the times of hearts then cry.
tanks in steels but voice wail.
bossy dragged by tails that whisked.
go very timid and love the wise.
hands are lent but laws are ends.
cases on courts are borrowed lands.
length long with treads to retch!
straps on times and watch here.
arrays tanks but all are men.
cross all suctions steal the ends.
cave on minds are cages on objects.
rouser rockets powers holes.
confine curses to stop our wounds.
whirl your bodies and jump on grounds.
crouch of soldiers after kicks with flings.
block one leg and hit the middle.
cauchy3 know the tric

### Convert to phonetic transcription (IPA)

In [5]:
def to_ipa(_poem: tp.List[str]) -> tp.List[str]:
    """Run ``ipa.convert`` on every line of a poem, preserving line order."""
    _converted = []
    for _text in _poem:
        _converted.append(ipa.convert(_text))
    return _converted

# Convert the preprocessed poems rather than the raw ones, so the line
# filtering and lower-casing done by `preprocess` reach the phonetic output.
poems_phonetically = {title: to_ipa(poem) for title, poem in poems_processed.items()}
print_poem(poems_phonetically, 0)

2 ˈeɪˌbiˌsi əv h.k*. ənd ˈʧaɪnə rɪˈvaɪzd ˈvɪʒən.
ˈbɛrəlz tɪrz ər waɪnz ənd sɔlts.
wɪθ ə wɪsk ɔn ˈgʊdi teɪlz!
ˈwɪgəl maces* tɪ fɪks ðə hɛdz.
hɛdz ɪn ʤæk ɔn ˈbɑksɪz ər sist.
kraɪ tɪ ˈpɛrəˌnɔɪd ˈtruli ˈbɔsɪz.
ˈbɔsɪz ər ˈʤoʊkərz teɪk jʊr bɔɪz.
stədz ər bɔgz wɪθ faɪər ˈæpəlz.
tru ˈprɛdɪkəts wərθ cases.’*.’
dɪˈsɛnts wɑʃ ɪn ˈbædli bændz.
ˈhoʊli seɪlz ər smɑrt wɪθ kæts.
hu gɑt tɛnθ ˈɑnərz ɪn ˈʧaɪnə?
ˈɑməʤ grænd tɪ pleɪ ənd pleɪz!
trɪm ðə taɪmz əv hɑrts ðɛn kraɪ.
tæŋks ɪn stilz bət vɔɪs weɪl.
ˈbɔsi drægd baɪ teɪlz ðət wɪskt.
goʊ ˈvɛri ˈtɪmɪd ənd ləv ðə waɪz.
hænz ər lɛnt bət lɔz ər ɛndz.
ˈkeɪsɪz ɔn kɔrts ər ˈbɑˌroʊd lændz.
lɛŋθ lɔŋ wɪθ trɛdz tɪ retch*!
stræps ɔn taɪmz ənd wɔʧ hir.
əreɪz tæŋks bət ɔl ər mɛn.
krɔs ɔl suctions* stil ðə ɛndz.
keɪv ɔn maɪndz ər ˈkeɪʤɪz ɔn ˈɑbʤɛkts.
ˈraʊzər ˈrɑkəts paʊərz hoʊlz.
kənˈfaɪn ˈkərsɪz tɪ stɑp ɑr wundz.
wərl jʊr ˈbɑdiz ənd ʤəmp ɔn graʊnz.
kraʊʧ əv ˈsoʊlʤərz ˈæftər kɪks wɪθ flɪŋz.
blɑk wən lɛg ənd hɪt ðə ˈmɪdəl.
cauchy3* noʊ ðə trɪks tɪ kɪl.
θˈrɛtən wik əˈpr

### Save

In [6]:
def save_poems(_poems: tp.Dict[str, tp.List[str]], _output_path: str):
    """Write each poem to its own file inside ``_output_path``.

    Args:
        _poems: Mapping of file name -> poem lines. Each poem is written
            with its lines joined by newlines (no trailing newline).
        _output_path: Target directory; created, including any missing
            parents, if it does not exist yet.

    Raises:
        OSError: If the directory cannot be created or a file cannot be written.
    """
    _output_dir = Path(_output_path)
    # Create the target directory once up front instead of on every iteration.
    _output_dir.mkdir(parents=True, exist_ok=True)
    for _poem_title, _poem_text in _poems.items():
        with open(_output_dir / _poem_title, 'w', encoding='utf-8-sig') as f:
            # write() emits the joined text in one call; writelines() on a str
            # would iterate over it and write one character at a time.
            f.write('\n'.join(_poem_text))
    print("Poems saved")

# Write the converted poems into output_path, one file per poem.
save_poems(poems_phonetically, output_path)


Poems saved
