In [2]:
import os

DATA_DIR = os.path.join("english")

for dir in os.listdir(DATA_DIR):
    print(dir)

(I_Can%27t_Get_No)_Satisfaction
0.999..%2e
1%3a64_scale
1755_Lisbon_earthquake
1900_Galveston_hurricane
1910_London_to_Manchester_air_race
1937_Fox_vault_fire
1942_(video_game)
1943%3a_The_Battle_of_Midway
1958_Tybee_Island_mid-air_collision
1968_Illinois_earthquake
1973_National_500
1973_world_oil_market_chronology
1994_San_Marino_Grand_Prix
1995_Japanese_Grand_Prix
1995_Pacific_Grand_Prix
2005_Atlantic_hurricane_season
2006_Atlantic_hurricane_season
2011_Wuxi_Classic
2012_Buenos_Aires_rail_disaster
2012_phenomenon
300_(film)
4chan
500_euro_note
90377_Sedna
A._E._J._Collins
AAA_400_Drive_for_Autism
Abbot_Augustus_Low
Abiogenesis
Accessibility
Accurate_News_and_Information_Act
Acharya_Jagadish_Chandra_Bose_College
Acid2
Action_potential
Adam_Curry
Adam_Elsheimer
Adewale_Akinnuoye-Agbaje
Adrianne_Wadewitz
Adrian_Lowe
Aegaeon_(moon)
African_Union
Age_of_Empires
Aggie_Bonfire
Air-tractor_sledge
Alan_Harper_(Two_and_a_Half_Men)
Alexandra_of_Denmark
Alexi_Salamone
Alex_Harris_(swimmer)
Alfr

In [3]:
next(os.walk(os.path.join(DATA_DIR, '4chan')))

('english/4chan',
 [],
 ['aligned.swc',
  'audio.ogg',
  'audiometa.txt',
  'info.json',
  'wiki.html',
  'wiki.txt',
  'wiki.xml'])

In [5]:
from typing import List
import xml.etree.ElementTree as ET
import numpy as np


class PronunciationTiming(object):

    def __init__(self, word: str, start: int, end: int) -> None:
        self.word = word
        self.start = start
        self.end = end

class SWC(object):

    def __init__(self, swc_path: str) -> None:
        self.tree = ET.parse(swc_path)
        self.timings = []

    def get_timings(self) -> List[PronunciationTiming]:
        if len(self.timings) > 0:
            return self.timings
        # https://www.geeksforgeeks.org/xml-parsing-python/
        root = self.tree.getroot()
        doc = root.find('d')
        
        for p in doc.findall('p'):
            for s in p.findall('s'):
                for t in s.findall('t'):
                    for n in t.findall('n'):
                        attribs = n.attrib
                        if 'pronunciation' not in attribs or 'start' not in attribs or 'end' not in attribs:
                            continue
                        self.timings.append(PronunciationTiming(
                            attribs['pronunciation'], 
                            int(attribs['start']), 
                            int(attribs['end'])))
        return self.timings


len(SWC("4chan-aligned.swc").get_timings())

173

In [11]:
from typing import Tuple
from pydub import AudioSegment

def split_clip(audio: AudioSegment, swc: SWC) -> Tuple[List[str], List[np.array]]:
    words = []
    pcms = []
    for p in swc.get_timings():
        words.append(p.word)
        # https://github.com/jiaaro/pydub/blob/master/API.markdown#audiosegmentget_array_of_samples
        clip = audio[p.start:p.end].get_array_of_samples()
        fp_arr = np.array(clip).T.astype(np.float32)
        fp_arr /= np.iinfo(clip.typecode).max
        pcms.append(fp_arr)
    return words, pcms

audio = AudioSegment.from_ogg('english/Durian/audio.ogg')
# print(audio.channels, audio.frame_rate)
swc = SWC('english/Durian/aligned.swc')
split_clip(audio, swc)

(['trees',
  'from',
  'the',
  'genus',
  'Durio',
  'belonging',
  'to',
  'the',
  'Durionaceae',
  'family',
  'Widely',
  'known',
  'revered',
  'in',
  'Southeast',
  'Asia',
  'as',
  'the',
  'King',
  'of',
  'Fruits',
  'the',
  'fruit',
  'is',
  'distinctive',
  'large',
  'size',
  'unique',
  'odour',
  'and',
  'formidable',
  'thorn',
  'covered',
  'husk',
  'fruit',
  'can',
  'grow',
  'up',
  'to',
  'thirty',
  'centimetres',
  'twelve',
  'in',
  'long',
  'and',
  'centimetres',
  'six',
  'in',
  'in',
  'diameter',
  'and',
  'typically',
  'weighs',
  'one',
  'three',
  'kilograms',
  'to',
  'seven',
  'l',
  'b',
  'Its',
  'shape',
  'ranges',
  'from',
  'oblong',
  'to',
  'round',
  'the',
  'colour',
  'of',
  'its',
  'husk',
  'green',
  'to',
  'brown',
  'and',
  'its',
  'flesh',
  'pale',
  'yellow',
  'to',
  'red',
  'depending',
  'on',
  'the',
  'species',
  'edible',
  'flesh',
  'emits',
  'distinctive',
  'odour',
  'strong',
  'and',
  

In [21]:
class PronunciationClip(object):
    def __init__(self, word: str, pcm: np.array) -> None:
        self.word = word
        self.pcm = pcm

clips = []

for dir in os.listdir(DATA_DIR):
    path = os.path.join(DATA_DIR, dir)
    if not os.path.isdir(path):
        continue
    files = os.listdir(path)
    if 'audio.ogg' not in files or 'aligned.swc' not in files:
        continue
    audio = AudioSegment.from_ogg(os.path.join(path, 'audio.ogg'))
    swc = SWC(os.path.join(path, 'aligned.swc'))
    for w, c in zip(split_clip(audio, swc)):
        clips.append(PronunciationClip(w, c))

    # arbitrary limit
    if len(clips) > 1000:
        break

KeyboardInterrupt: 