In [1]:
import os
import re
import pickle
from pathlib import Path

from sqlalchemy import create_engine, insert, select

from lesson_parser import MyHTMLParser, get_words

# Root folder of the lesson material; the HTML lessons are read from its "html" sub-folder.
lessons_directory = "Sentences"

# Location and file name of the SQLite database used by this notebook.
db_directory = "./"
word_dict_filename = "test_tonic_accent_word_dict.db"

def get_html_lesson_list(html_dir="Sentences/html"):
    """Return the sorted names of the lesson files found under ``html_dir``.

    The directory tree is walked recursively and every file whose name starts
    with ``L`` followed by three digits (e.g. ``L001.html``) is kept.  Only the
    bare file names are returned, not their paths.

    Args:
        html_dir: Directory to scan.  Defaults to ``"Sentences/html"``, the
            location that used to be hard-coded in the body.

    Returns:
        A sorted list of matching file names; empty when the directory does
        not exist or contains no lesson file.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    lesson_name = re.compile(r"L\d{3}")
    return sorted(
        filename
        for _dirpath, _dirnames, filenames in os.walk(html_dir)
        for filename in filenames
        if lesson_name.match(filename)
    )


In [2]:
# Sorted names of the lesson HTML files.
file_list = get_html_lesson_list()

In [3]:
def fill_word_dict_from_html_files(filenames, word_dict):
    """Add the words of each lesson file to ``word_dict``, in place.

    Every file is read from ``<lessons_directory>/html``, analysed with the
    module-level ``parser``, and the resulting per-lesson word dictionary is
    merged into ``word_dict``.  An existing entry is never overwritten, so the
    first lesson that contains a word wins.

    Args:
        filenames: File names (not paths) of the lessons to process.
        word_dict: Dictionary updated in place.

    Returns:
        None.
    """
    for fn in filenames:
        print(fn)
        # The context manager closes the handle; the previous bare
        # ``open(...).read()`` left that to the garbage collector.
        with open(os.path.join(lessons_directory, "html", fn)) as lesson_file:
            lesson = lesson_file.read()
        print(fn, " : ", len(lesson))
        parser.analyze_lesson(lesson)
        lesson_words = parser.get_lesson_word_dict()
        for w in lesson_words:
            if w not in word_dict:
                word_dict[w] = lesson_words[w]


In [4]:
# Word dictionary accumulated over every lesson file.
word_dict = {}
# Shared parser instance; the helper functions of this notebook read it as a global.
parser = MyHTMLParser()
fill_word_dict_from_html_files(file_list, word_dict)

L001.html
L001.html  :  1428
L002.html
L002.html  :  631
L003.html
L003.html  :  2141
L004.html
L004.html  :  670
L005.html
L005.html  :  812
L006.html
L006.html  :  785
L007.html
L007.html  :  751
L008.html
L008.html  :  1117
L009.html
L009.html  :  1059
L010.html
L010.html  :  1064
L011.html
L011.html  :  1216
L012.html
L012.html  :  1104
L013.html
L013.html  :  1187
L014.html
L014.html  :  954
L015.html
L015.html  :  1058
L016.html
L016.html  :  1392
L017.html
L017.html  :  1143
L018.html
L018.html  :  1307
L019.html
L019.html  :  1206
L020.html
L020.html  :  1273
L021.html
L021.html  :  1276
L022.html
L022.html  :  1353
L023.html
L023.html  :  1490
L024.html
L024.html  :  1367
L025.html
L025.html  :  1642
L026.html
L026.html  :  1471
L027.html
L027.html  :  1544
L028.html
L028.html  :  1317
L029.html
L029.html  :  1871
L030.html
L030.html  :  1683
L031.html
L031.html  :  1671
L032.html
L032.html  :  1715
L033.html
L033.html  :  2007
L034.html
L034.html  :  1561
L035.html
L035.html 

In [5]:
# Number of entries collected in the word dictionary.
keys_number = len(word_dict)
print(f"Number of keys : {keys_number}")

Number of keys : 2600


In [6]:
from typing import List
from typing import Optional
from sqlalchemy.orm import Mapped
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship
from sqlalchemy.orm import DeclarativeBase

class Base(DeclarativeBase):
    """Declarative base class; its ``metadata`` collects the tables mapped in this notebook."""
    pass

class WordDict(Base):
    """One row of the ``word_dict`` table: a word and its stored structure."""

    __tablename__ = "word_dict"
    id: Mapped[int] = mapped_column(primary_key=True)
    word: Mapped[str]
    # NOTE(review): the notebook stores ``pickle.dumps(...)`` output (bytes) in
    # this column although it is typed ``str`` — consider ``Mapped[bytes]``.
    word_struct: Mapped[str]
    comment: Mapped[Optional[str]]
    # Nullable: the inserts in this notebook never provide ``index``, so a
    # NOT NULL column makes them fail on a freshly created database.
    index: Mapped[Optional[str]]

    def __repr__(self) -> str:
        return f"WordDict(id={self.id!r}, word={self.word!r}, comment={self.comment!r})"

class Sentence(Base):
    """One row of the ``sentences`` table: a sentence and its position in a lesson."""

    __tablename__ = "sentences"
    id: Mapped[int] = mapped_column(primary_key=True)
    sentence: Mapped[str]
    comment: Mapped[Optional[str]]
    lesson: Mapped[int]  # lesson number
    line: Mapped[int]    # position of the sentence within its lesson

    def __repr__(self) -> str:
        # Only instance attributes are used: the former bare names ``lesson``
        # and ``numero`` are not attributes of this class and made repr() fail
        # with NameError (or silently pick up an unrelated global).
        return (
            f"Sentence(id={self.id!r}, sentence={self.sentence!r}, "
            f"lesson={self.lesson!r}, line={self.line!r}, comment={self.comment!r})"
        )

In [7]:
# Build the SQLAlchemy URL of the SQLite database file.
db_name = Path(db_directory, word_dict_filename)
db_url  = f"sqlite:///{db_name}"
# Last expression of the cell: displays the URL.
db_url

'sqlite:///test_tonic_accent_word_dict.db'

In [8]:

# Engine bound to the SQLite file; echo=False keeps the emitted SQL out of the output.
engine = create_engine(db_url, echo=False)

In [9]:
# Metadata registry holding the table definitions declared on Base.
metadata = Base.metadata

In [10]:
# Create the mapped tables in the database (tables that already exist are not altered).
metadata.create_all(engine)

In [11]:
# Table objects looked up by name; the cells below build insert()/select() statements on them.
word_table = metadata.tables['word_dict']
sentence_table = metadata.tables['sentences']

In [12]:
def get_sentences(lesson_nb : int):
    """Return the sentences of lesson ``lesson_nb``.

    The lesson is read from ``<lessons_directory>/html/L<nnn>.html`` (number
    zero-padded to three digits) and analysed with the module-level
    ``parser``.  When that file does not exist, the result of
    ``get_sentences_from_audio_files(lesson_nb)`` is returned instead.

    Args:
        lesson_nb: Lesson number.

    Returns:
        Whatever ``parser.get_sentences()`` or the audio-file fallback returns.
    """
    lesson_filename = os.path.join(
        lessons_directory, "html", f"L{str(lesson_nb).zfill(3)}.html"
    )
    try:
        # ``with`` closes the file even when reading fails.
        with open(lesson_filename, "r") as lesson_file:
            lesson = lesson_file.read()
    except FileNotFoundError:
        # NOTE(review): get_sentences_from_audio_files is not defined in the
        # visible part of this notebook — confirm it exists before relying on
        # this fallback.
        return get_sentences_from_audio_files(lesson_nb)
    parser.analyze_lesson(lesson)
    return parser.get_sentences()


In [14]:
# Quick check of get_words on the entry at index 2 of lesson 50.
sentences = get_sentences(50)
get_words(sentences[2])

['S01',
 'Felipe',
 'guapo',
 'dame',
 'diez',
 'lonchas',
 'de',
 'jamón',
 'de',
 'york',
 'por',
 'favor']

In [None]:
# FIXME: unfinished draft — this cell has no execution count and does not parse:
# ``stmt =`` has no right-hand side and ``index = [`` is never closed.
# Also unresolved: the loop variable is ``w`` but the insert uses ``word``; the
# ``else`` branch (word NOT in word_dict) reads ``word_dict[word]``; ``wd`` is
# not defined in this function; no statement built here is ever executed.
def fill_word_dict_table(filenames, word_dict):
    for fn in filenames:
        print(fn)
        lesson = open(os.path.join(lessons_directory, "html", fn)).read()
        print(fn, " : ", len(lesson))
        parser.analyze_lesson(lesson)
        sentences = parser.get_sentences()
        for sentence in sentences:
            words = get_words(sentence)
            for w in words:
                if w in word_dict:
                    stmt =
                else:
                    index = [
                    stmt = insert(word_table).values(word=word, word_struct=pickle.dumps(word_dict[word]))
            
        for w in wd:
            if not w in word_dict:
                word_dict[w] = wd[w]


In [12]:
# Store every word with its pickled structure.  All rows are sent through one
# executemany-style INSERT instead of one statement per word, and
# ``engine.begin()`` commits on success and rolls back on error.
# NOTE(review): re-running this cell inserts the same words again — the table
# declares no uniqueness constraint on ``word``.
word_rows = [
    {"word": word, "word_struct": pickle.dumps(structure)}
    for word, structure in word_dict.items()
]
if word_rows:  # nothing to insert for an empty dictionary
    with engine.begin() as conn:
        conn.execute(insert(word_table), word_rows)

In [13]:
# Query selecting the rows stored for a single word.
stmt = select(word_table).where(word_table.c.word == "acontecimiento")

In [14]:
# Run the query prepared in the previous cell and print each matching row.
with engine.connect() as conn:
    for row in conn.execute(stmt):
        # word_struct was written with pickle.dumps when the table was filled.
        # NOTE(review): pickle.loads must only be fed a database this notebook created.
        print(f"word : {row.word}, structure : {pickle.loads(row.word_struct)}")

word : acontecimiento, structure : [('aconteci', False), ('mien', True), ('to', False)]


In [15]:
# Same lookup for another word: select its rows and print the unpickled structure.
stmt = select(word_table).where(word_table.c.word == "traduzca")
with engine.connect() as conn:
    for row in conn.execute(stmt):
        print(f"word : {row.word}, structure : {pickle.loads(row.word_struct)}")

word : traduzca, structure : [('Tra', False), ('du', True), ('zca', False)]


In [16]:
# Same lookup for a one-letter word: select its rows and print the unpickled structure.
stmt = select(word_table).where(word_table.c.word == "y")
with engine.connect() as conn:
    for row in conn.execute(stmt):
        print(f"word : {row.word}, structure : {pickle.loads(row.word_struct)}")

word : y, structure : [('Y', False)]


In [21]:
# Store the sentences of every lesson together with their lesson number and
# their 0-based position.  Each lesson is sent as one executemany-style INSERT;
# ``engine.begin()`` commits once at the end and rolls back on error.
last_lesson = 79  # lessons are numbered 1..last_lesson
with engine.begin() as conn:
    for lesson in range(1, last_lesson + 1):
        sentences = get_sentences(lesson)
        sentence_rows = [
            {"sentence": text, "lesson": lesson, "line": line}
            for line, text in enumerate(sentences)
        ]
        if sentence_rows:  # a lesson without sentences adds nothing
            conn.execute(insert(sentence_table), sentence_rows)

In [24]:
# List the stored sentences of lesson 72 with their line numbers.
stmt = select(sentence_table).where(sentence_table.c.lesson == 72)
with engine.connect() as conn:
    for row in conn.execute(stmt):
        print(f"line : {row.line}, sentence : {row.sentence}")

line : 0, sentence : N72-Lección setenta y dos
line : 1, sentence : S00-TITLE-Ya que estoy
line : 2, sentence : S01-Mírese en el espejo, ¿cómo se siente? 
line : 3, sentence : S02-Así así
line : 4, sentence : S03-Es una prenda de entretiempo, de algodón. Le serviría tanto para primavera como para otoño. 
line : 5, sentence : S04-Me queda un poco estrecho de cintura, ¿no? 
line : 6, sentence : S05-A ver, muévase un poco, dese la vuelta. Perdiendo un kilito, le quedaría perfecto. 
line : 7, sentence : S06-Tengo que ponerme a dieta, tiene toda la razón. 
line : 8, sentence : S07-Es de rayas ¿Cree que me favorece? 
line : 9, sentence : S08-Le queda fabuloso, pero existe también liso y de cuadros. Yo que usted me llevaría los tres. 
line : 10, sentence : S09-Venga, un día es un día. Ah, ya que estoy, he visto que tiene sección de caballeros. 
line : 11, sentence : S10-Mi marido va siempre de traje, pero me gustaría algo más informal. Una americana sport, talla 50. 
line : 12, sentence : S11