In [11]:
import re
import string
import os

from html.parser import HTMLParser

def get_words(sentence):
    """Split ``sentence`` into words, treating punctuation as separators.

    Each character matched by a regex character class built from
    ``string.punctuation`` plus the Spanish opening marks ``¿`` and ``¡``
    is replaced by a space, and the result is split on whitespace.

    NOTE: because ``string.punctuation`` is inserted unescaped, the
    sequence ``\\]`` inside it is read by the regex engine as an escaped
    ``]``; a literal backslash is therefore not treated as a separator.

    Returns:
        list[str]: the non-empty tokens, in their original order and case.
    """
    separators = '[' + string.punctuation + '¿' + '¡' + ']'
    spaced = re.sub(separators, ' ', sentence)
    return spaced.split()

def get_syllables(sentence, sentence_fragments):
    """Flatten text fragments into ``(piece, flag)`` tuples.

    Each fragment is indexed as ``fragment[0]`` (its text) and
    ``fragment[1]`` (a flag that is copied onto every piece produced from
    that text). The text is tokenised the same way as ``get_words``:
    punctuation, ``¿`` and ``¡`` become spaces and the result is split on
    whitespace. A fragment that contains only separators contributes
    nothing.

    Args:
        sentence: unused; kept so existing callers do not have to change.
        sentence_fragments: iterable of ``(text, flag)`` pairs.

    Returns:
        list[tuple[str, Any]]: pieces in document order with their flag.
    """
    separators = '[' + string.punctuation + '¿' + '¡' + ']'
    return [
        (piece, fragment[1])
        for fragment in sentence_fragments
        for piece in re.sub(separators, ' ', fragment[0]).split()
    ]

def get_sentence_word_dict(sentence, sentence_fragments):
    """Map each word of ``sentence`` to the syllable pieces that spell it.

    The words come from ``get_words(sentence)`` and the pieces from
    ``get_syllables(sentence, sentence_fragments)``. Pieces are consumed
    left to right and concatenated until they spell the current word
    exactly; that run of pieces is then stored under the lower-cased word.

    Args:
        sentence: full text of the sentence.
        sentence_fragments: ``(text, flag)`` pairs whose texts make up the
            sentence.

    Returns:
        dict[str, list[tuple]]: lower-cased word -> list of ``(piece, flag)``
        tuples. If a word occurs more than once, the last occurrence wins.
        If the pieces run out before a word is completed, that word and
        every word after it are left out.
    """
    words = get_words(sentence)
    syllables = get_syllables(sentence, sentence_fragments)

    word_dict = {}
    index = 0
    buffer = ""
    accentuated_word = []

    for w in words:
        # Bound the scan by the real number of pieces: a fixed cap would
        # silently truncate long sentences, and no cap would index past the
        # end of the list when the pieces do not line up with the words.
        while index < len(syllables):
            piece = syllables[index]
            buffer += piece[0]
            accentuated_word.append(piece)
            index += 1
            if buffer == w:
                word_dict[w.lower()] = accentuated_word
                buffer = ""
                accentuated_word = []
                break
    return word_dict

class MyHTMLParser(HTMLParser):
    """Collect the text of every ``<p>`` element and which parts are bold.

    After ``analyze_lesson`` has run:

    * ``sentences`` holds one string per closed ``<p>`` element;
    * ``sentences_fragments`` holds, for the same paragraphs, a list of
      ``(text, is_bold)`` tuples, one per text chunk reported by the
      parser, where ``is_bold`` is true while inside a ``<b>`` element.

    Text that appears outside any ``<p>`` element is ignored.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._start_lesson()

    def _start_lesson(self):
        """Reset every piece of per-lesson state to its empty value."""
        self.sentences = []
        self.sentences_fragments = []
        self.bold_data = False
        self._in_paragraph = False
        self.current_sentence = ""
        self.current_sentence_fragments = []

    def analyze_lesson(self, lesson):
        """Parse the HTML string ``lesson``, replacing any earlier results."""
        # reset() discards input still buffered from a previous feed, so
        # one parser instance can be reused for several lessons.
        self.reset()
        self._start_lesson()
        self.feed(lesson)
        # close() forces the parser to process any text it was holding back.
        self.close()

    def get_lesson_word_dict(self):
        """Merge the per-sentence word dictionaries of the parsed lesson.

        Returns:
            dict: lower-cased word -> syllable list, as produced by
            ``get_sentence_word_dict``. When a word occurs in several
            sentences the first sentence's entry is kept. Empty if no
            lesson has been analysed.
        """
        word_dict = {}
        for sentence, fragments in zip(self.sentences, self.sentences_fragments):
            sentence_words = get_sentence_word_dict(sentence, fragments)
            for word, syllables in sentence_words.items():
                word_dict.setdefault(word, syllables)
        return word_dict

    def handle_starttag(self, tag, attrs):
        if tag == "b":
            self.bold_data = True
        elif tag == "p":
            # Start a fresh paragraph with its own fragment list.
            self._in_paragraph = True
            self.current_sentence = ""
            self.current_sentence_fragments = []

    def handle_endtag(self, tag):
        if tag == "b":
            self.bold_data = False
        elif tag == "p":
            if not self._in_paragraph:
                # Stray </p> without a matching <p>: nothing to store.
                return
            self.sentences_fragments.append(self.current_sentence_fragments)
            self.sentences.append(self.current_sentence)
            self._in_paragraph = False

    def handle_data(self, data):
        if not self._in_paragraph:
            # Text between paragraphs (e.g. newlines) must not be appended
            # to the fragment list already stored for the previous one.
            return
        self.current_sentence_fragments.append((data, self.bold_data))
        self.current_sentence += data


In [12]:
# Read the whole lesson file; the context manager closes the handle
# as soon as the content has been read.
with open('./pronunciation/lessons/L004.html') as lesson_file:
    lesson = lesson_file.read()

In [13]:
parser = MyHTMLParser()

In [14]:
parser.analyze_lesson(lesson)

In [15]:
# Pick one parsed paragraph to inspect: `s` is its full text and `sf` the
# matching list of (text, is_bold) fragments collected by the parser.
sentence_number = 6
s = parser.sentences[sentence_number]
sf = parser.sentences_fragments[sentence_number]
s

'S05-¡Vale!\xa0'

In [16]:
sf

[('S05-¡', False), ('Va', True), ('le!', False), ('\xa0', False)]

In [17]:
word_dict = get_sentence_word_dict(s, sf)
word_dict

{'s05': [('S05', False)], 'vale': [('Va', True), ('le', False)]}

In [19]:
# Build a word dictionary from the first five parsed paragraphs only.
# dict.update means a later paragraph overwrites an earlier entry for the
# same word.
word_dict = {}
for k in range(5):
    s = parser.sentences[k]
    sf = parser.sentences_fragments[k]
    wd = get_sentence_word_dict(s, sf)
    word_dict.update(wd)
    

In [20]:
word_dict

{'n4': [('N4', False)],
 'lección': [('Lec', False), ('ción', True)],
 'cuarta': [('cua', True), ('rta', False)],
 's00': [('S00', False)],
 'title': [('TITLE', False)],
 'presentaciones': [('Presenta', False), ('cio', True), ('nes', False)],
 's01': [('S01', False)],
 'él': [('él', False)],
 'es': [('es', False)],
 'rafa': [('Ra', True), ('fa', False)],
 'mi': [('mi', False)],
 'hermano': [('her', False), ('ma', True), ('no', False)],
 'y': [('Y', False)],
 'ella': [('e', True), ('lla', False)],
 'luz': [('Luz', False)],
 'hermana': [('her', False), ('ma', True), ('na', False)],
 's02': [('S02', False)],
 'padre': [('pa', True), ('dre', False)],
 'miguel': [('Mi', False), ('guel', True)],
 's03': [('S03', False)],
 'mucho': [('Mu', True), ('cho', False)],
 'gusto': [('gus', True), ('to', False)],
 'cómo': [('Có', True), ('mo', False)],
 'está': [('est', False), ('á', True)],
 'usted': [('us', False), ('ted', True)]}

In [22]:
parser.get_lesson_word_dict()

{'n4': [('N4', False)],
 'lección': [('Lec', False), ('ción', True)],
 'cuarta': [('cua', True), ('rta', False)],
 's00': [('S00', False)],
 'title': [('TITLE', False)],
 'presentaciones': [('Presenta', False), ('cio', True), ('nes', False)],
 's01': [('S01', False)],
 'él': [('Él', False)],
 'es': [('es', False)],
 'rafa': [('Ra', True), ('fa', False)],
 'mi': [('mi', False)],
 'hermano': [('her', False), ('ma', True), ('no', False)],
 'y': [('y', False)],
 'ella': [('e', True), ('lla', False)],
 'luz': [('Luz', False)],
 'hermana': [('her', False), ('ma', True), ('na', False)],
 's02': [('S02', False)],
 'padre': [('pa', True), ('dre', False)],
 'miguel': [('Mi', False), ('guel', True)],
 's03': [('S03', False)],
 'mucho': [('Mu', True), ('cho', False)],
 'gusto': [('gus', True), ('to', False)],
 'cómo': [('Có', True), ('mo', False)],
 'está': [('est', False), ('á', True)],
 'usted': [('us', False), ('ted', True)],
 's04': [('S04', False)],
 'muy': [('Muy', False)],
 'bien': [('bien'

In [19]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [20]:
# Scratch check of the tokenising regex on the current sentence `s`:
# punctuation and '¿' are replaced by spaces, then the text is split on
# whitespace. Note that '¡' is not part of this character class.
import re
words = re.sub('['+string.punctuation+'¿'+']', ' ', s).split()
words

['S00', 'TITLE', 'Presentaciones']

In [21]:
def get_words(sentence):
    """Split ``sentence`` into words, treating punctuation as separators.

    Characters matched by a regex character class built from
    ``string.punctuation`` plus ``¿`` are replaced by spaces and the result
    is split on whitespace.

    NOTE(review): this variant does not treat ``¡`` as a separator, and when
    the notebook is run top to bottom it replaces the ``get_words`` defined
    in the first cell.
    """
    separators = '[' + string.punctuation + '¿' + ']'
    spaced = re.sub(separators, ' ', sentence)
    return spaced.split()

words = get_words(s)
words

['S00', 'TITLE', 'Presentaciones']

In [22]:
def get_syllables(sentence, sentence_fragments):
    """Debug variant: flatten fragments into ``(piece, flag)`` tuples.

    Prints the sentence, the words obtained from it, and the pieces
    obtained from each fragment, then returns the flattened list. Each
    fragment is read as ``fragment[0]`` (text) and ``fragment[1]`` (flag).
    Punctuation and ``¿`` act as separators; ``¡`` does not.

    NOTE(review): when the notebook is run top to bottom this replaces the
    ``get_syllables`` defined in the first cell.
    """
    separators = '['+string.punctuation+'¿'+']'

    print(sentence)
    words = re.sub(separators, ' ', sentence).split()
    print(words)

    syllables = []
    for fragment in sentence_fragments:
        pieces = re.sub(separators, ' ', fragment[0]).split()
        syllables.extend((piece, fragment[1]) for piece in pieces)
        print(pieces)
    return syllables
syllables = get_syllables(s, sf)
syllables

S00-TITLE-Presentaciones 
['S00', 'TITLE', 'Presentaciones']
['S00', 'TITLE', 'Presenta']
['cio']
['nes']
[]


[('S00', False),
 ('TITLE', False),
 ('Presenta', False),
 ('cio', True),
 ('nes', False)]

In [23]:
def get_word_dict(sentence, sentence_fragments):
    """Debug variant: map each word of ``sentence`` to its syllable pieces.

    Words come from ``get_words`` and pieces from ``get_syllables``. Pieces
    are consumed in order and concatenated until they spell the current
    word; the run is then stored under the lower-cased word. The word and
    the growing buffer are printed at every step.

    Returns:
        dict[str, list[tuple]]: lower-cased word -> ``(piece, flag)`` list.
        If the pieces run out before a word is completed, that word and
        every word after it are left out.
    """
    index = 0
    buffer = ""
    word_dict = {}
    accentuated_word = []

    words = get_words(sentence)
    syllables = get_syllables(sentence, sentence_fragments)

    for w in words:
        # Bound by the actual number of pieces: a fixed cap would truncate
        # long sentences and would not prevent indexing past the end of
        # shorter ones.
        while index < len(syllables):
            buffer += syllables[index][0]
            print("w and buffer: ", w, buffer)
            accentuated_word.append(syllables[index])
            index += 1
            if buffer == w:
                word_dict[w.lower()] = accentuated_word
                buffer = ""
                accentuated_word = []
                break
    return word_dict

word_dict = get_word_dict(s, sf)
word_dict

S00-TITLE-Presentaciones 
['S00', 'TITLE', 'Presentaciones']
['S00', 'TITLE', 'Presenta']
['cio']
['nes']
[]
w and buffer:  S00 S00
w and buffer:  TITLE TITLE
w and buffer:  Presentaciones Presenta
w and buffer:  Presentaciones Presentacio
w and buffer:  Presentaciones Presentaciones


{'s00': [('S00', False)],
 'title': [('TITLE', False)],
 'presentaciones': [('Presenta', False), ('cio', True), ('nes', False)]}

In [107]:
# Build one dictionary covering every lesson file under the lessons folder.
# A later lesson overwrites an earlier entry for the same word (dict.update).
parser = MyHTMLParser()
word_dict = {}
lessons_directory = './pronunciation/lessons'

for dirpath, dirnames, filenames in os.walk(lessons_directory):
    print((dirpath, dirnames, filenames))
    for fn in filenames:
        # Join with the directory currently being walked so that files in
        # subdirectories resolve correctly.
        with open(os.path.join(dirpath, fn)) as lesson_file:
            lesson = lesson_file.read()
        print(fn, " : ", len(lesson))
        parser.analyze_lesson(lesson)
        # The parser class defined in this notebook exposes
        # get_lesson_word_dict, not get_word_dict.
        wd = parser.get_lesson_word_dict()
        word_dict.update(wd)

('./pronunciation/lessons', [], ['L004.html', 'L003.html', 'L002.html', 'L001.html'])
L004.html  :  2099
Encountered a start tag: p
nouveau paragraphe
(1, 0)
Encountered some data  : N4-Lec
Encountered a start tag: b
Encountered some data  : ción
Encountered an end tag : b
Encountered some data  :  
Encountered a start tag: b
Encountered some data  : cua
Encountered an end tag : b
Encountered some data  : rta
Encountered a start tag: span
Encountered some data  :  
Encountered an end tag : span
Encountered an end tag : p
Encountered a start tag: p
nouveau paragraphe
(1, 153)
Encountered some data  : S00-TITLE-Presenta
Encountered a start tag: b
Encountered some data  : cio
Encountered an end tag : b
Encountered some data  : nes
Encountered a start tag: span
Encountered some data  :  
Encountered an end tag : span
Encountered an end tag : p
Encountered a start tag: p
nouveau paragraphe
(1, 306)
Encountered some data  : S01-Él es 
Encountered a start tag: b
Encountered some data  : Ra
En

In [96]:
word_dict

{'n4': [('N4', False)],
 'lección': [('Lecc', False), ('ión', True)],
 'cuarta': [('cua', True), ('rta', False)],
 's00': [('S00', False)],
 'title': [('TITLE', False)],
 'presentaciones': [('Presenta', False), ('cio', True), ('nes', False)],
 's01': [('S01', False)],
 'él': [('Él', False)],
 'es': [('es', False)],
 'rafa': [('Ra', True), ('fa', False)],
 'mi': [('Mi', False)],
 'hermano': [('her', False), ('ma', True), ('no', False)],
 'y': [('Y', False)],
 'ella': [('ella', False)],
 'luz': [('Luz', False)],
 'hermana': [('her', False), ('ma', True), ('na', False)],
 's02': [('S02', False)],
 'padre': [('pa', True), ('dre', False)],
 'miguel': [('Mi', False), ('guel', True)],
 's03': [('S03', False)],
 'mucho': [('Mu', True), ('cho', False)],
 'gusto': [('gus', True), ('to', False)],
 'cómo': [('Có', True), ('mo', False)],
 'está': [('est', False), ('á', True)],
 'usted': [('ust', False), ('ed', True)],
 's04': [('S04', False)],
 'muy': [('Muy', False)],
 'bien': [('bien', False)],
 

In [97]:
word_dict['pero']

[('Pe', True), ('ro', False)]

In [105]:
word_dict['n5']

KeyError: 'n5'

In [7]:
parser

<__main__.MyHTMLParser at 0x100de1410>

In [8]:
dir(parser)

['CDATA_CONTENT_ELEMENTS',
 '_HTMLParser__starttag_text',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_decl_otherchars',
 '_parse_doctype_attlist',
 '_parse_doctype_element',
 '_parse_doctype_entity',
 '_parse_doctype_notation',
 '_parse_doctype_subset',
 '_scan_name',
 'analyze_lesson',
 'bold_data',
 'cdata_elem',
 'check_for_whole_start_tag',
 'clear_cdata_mode',
 'close',
 'convert_charrefs',
 'current_sentence',
 'current_sentence_fragments',
 'feed',
 'get_starttag_text',
 'get_word_dict',
 'getpos',
 'goahead',
 'handle_charref',
 'handle_comment',
 'handle_data',
 'handle_decl',
 'handle_endtag',
 'handle_entityref',
 'handle_pi',
 'handle