Tomando como base el código de Jorge Dueñas Lerín en https://github.com/JorgeDuenasLerin/diccionario-espanol-txt

In [1]:
import json
import time
from collections import Counter
from pathlib import Path
from urllib.parse import quote
from urllib.request import Request, urlopen

import regex
from lxml import etree
from tqdm.notebook import trange


In [2]:
# Directory holding the RAE data files this notebook reads and writes
# (total_palabras.txt, vocabulary.json, expresiones.txt).
# The path is relative, so it is resolved against the current working directory.
dir_data_RAE = Path("../../data/RAE")


In [3]:
def get_xtree(url, param, UA, attempts=5, delay=0.5):
    """
    Download a page and parse it into an lxml HTML tree, retrying on failure.

    Parameters
    ----------
    url: str
        URL template containing one ``{}`` placeholder.
    param: str
        Value that is percent-encoded and substituted into ``url``.
    UA: str
        Value sent as the ``User-Agent`` request header.
    attempts: int
        Maximum number of tries before giving up (default: 5).
    delay: float
        Seconds to wait between a failed try and the next one (default: 0.5).

    Output
    ------
    tree: parsed lxml tree, or None
        The parsed page, or None when every attempt failed. Callers must
        check for None before calling ``.xpath`` on the result.
    """
    tree = None
    while attempts > 0 and tree is None:
        try:
            req = Request(url.format(quote(param)), headers={"User-Agent": UA})
            # The context manager closes the HTTP response once it has been
            # parsed, so connections are not leaked over many requests.
            with urlopen(req) as webpage:
                tree = etree.parse(webpage, etree.HTMLParser())
        except Exception as e:
            # Deliberately best-effort: report the problem and try again.
            attempts -= 1
            print(str(e))
            # No point in waiting when there is no attempt left.
            if attempts > 0:
                time.sleep(delay)

    return tree


In [4]:
# Request settings.
UA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"
# RAE dict (m=31 means all elements starting with {})
url_list = "https://dle.rae.es/{}?m=31"
url_detail = "https://dle.rae.es/{}"

# Fixed prefix of the `title` attribute of each entry link; `skip` is the
# number of characters to cut off in order to keep only the entry itself.
# The title is used because the link content changes in certain situations:
# <a data-cat="FETCH" data-acc="LISTA EMPIEZA POR" data-eti="abollado" title="Ir a la entrada abollado, abollada" href="/abollado">abollado<sup>1</sup>, da</a>
to_remove_from_title = "Ir a la entrada "
skip = len(to_remove_from_title)

# Letters used to build the search prefixes (accented vowels and "ü" included).
letras = [
    "a", "á", "b", "c", "d",
    "e", "é", "f", "g", "h",
    "i", "í", "j", "k", "l", "m", "n", "ñ",
    "o", "ó", "p", "q", "r", "s", "t",
    "u", "ú", "ü", "v", "w", "x", "y", "z",
]

# Independent copy of the letters, so that changing one list leaves the other intact.
start_withs = list(letras)


Buscamos cada palabra única que aparece en el diccionario. Algunas pueden ser flexiones de género (como "abadesa" a partir de "abad"); aun así, se guardarán igualmente y más tarde serán filtradas.

In [5]:
# initial = ""
# total_palabras = []

# while len(start_withs) != 0:
#     palabra_start_with = start_withs.pop(0)
#     curr = palabra_start_with[0]
#     if initial!=curr:
#         initial = curr
#         print(f"\nSearch '{initial}'")
#         if total_palabras:
#             with dir_data_RAE.joinpath("total_palabras.txt").open("a", encoding="utf-8") as f:
#                 f.writelines([f"{p}\n" for p in set(total_palabras)])
#         total_palabras = []

#     if(palabra_start_with in ['app', 'docs', 'js']):
#         continue

#     tree = get_xtree(url_list, palabra_start_with, UA)
#     res = tree.xpath('//*[@id="resultados"]/*/div[@class="n1"]/a/@title')

#     # Se repiten palabras. Cuando por ejemplo aba tiene más de 30 y se exapande
#     # abaa, abab, etc... las primeras palabras no aparecen: aba
#     for pal in res:
#         pal_clean = pal[skip:]
#         pal_clean = pal_clean.split(", ")
#         for pal in res:
#             pal_clean = pal[skip:]
#             pal_clean = pal_clean.split(", ")
#             total_palabras.extend(pal_clean)

#     if(len(res)>30):
#         # print("!" * 80)
#         # print("EXAPEND: " + palabra_start_with)
#         print(f"p-count: {len(set(total_palabras))}", end="\r")
#         expand = [palabra_start_with + l for l in letras]
#         start_withs = expand + start_withs

# if total_palabras:
#     with dir_data_RAE.joinpath("total_palabras.txt").open("a", encoding="utf-8") as f:
#         f.writelines([f"{p}\n" for p in set(total_palabras)])

# # Ensure vocab is unique
# with dir_data_RAE.joinpath("total_palabras.txt").open("r", encoding="utf-8") as f:
#     vocab = sorted(list(set([p.strip() for p in f.readlines()])))
# with dir_data_RAE.joinpath("total_palabras.txt").open("w", encoding="utf-8") as f:
#     f.writelines([f"{p}\n" for p in vocab])


Hacemos una búsqueda individual de cada uno de los elementos anteriores y filtramos según lo siguiente:
- La palabra se eliminará si no tiene significado (p. ej., palabra derivada por flexión de género).
- Si tiene locuciones o expresiones, estas se mantienen como elementos independientes.

In [6]:
def process_loc(res: str, word: str):
    """
    Expand a dictionary expression (locution, idiom, etc.) into its distinct variants.

    The expression is split on commas. Pieces that contain a standalone
    "o"/"u" are treated as alternatives and are recombined with the text
    that is shared by every variant.
    Ex: "hacerse agua, o un agua, en la boca" -> ["hacerse agua en la boca", "hacerse un agua en la boca"]

    Parameters
    ----------
    res: str
        Original expression as it appears in dictionary.
    word: str
        The term that is used as base for the expression.

    Output
    ------
    results: list(str)
        List of all possible combinations of the expression.
    """
    # NOTE(review): `word` (and the derived `first_part`, `last_part` and
    # `prefix`) are interpolated into regex patterns below without escaping,
    # so regex metacharacters in them would be read as pattern syntax.
    #
    # Conceptually the expression is divided into a first part (before the main
    # word), a last part (after the last main word) and a middle part (the rest).
    # Illustrative examples:
    # "hacerse agua, o un agua, en la boca" -> ['hacerse ', ['agua', ' un agua'], ' en la boca']
    # "romper aguas" -> ['romper ', 'agua', 's']

    results = []

    # Split in the multiple ways that there are to say something.
    # From here on `res` is a list of comma-separated pieces, not a string.
    res = regex.split(r",", res)

    # If there is only one way to say something: normalise whitespace and return it
    if len(res) == 1:
        results.append(regex.sub(r"\s+", " ", res[0]).strip())
        return results
    else:
        # TODO: there might be two full expressions separated (e.g. "hablar con, o de, misterio, o hacer misterio")
        # Initial approach:
        # first_part = regex.search(rf"(?:.+?(?={word}))", res)
        # last_part = regex.search(rf"(?<={word}.*?\b)(?!.*{word})(.*)", res)
        # middle_part = regex.search(rf"\b({word}.*)?({word}.*?)\b", res)
        # Manage this (although not common)

        # Decide what to do with the last piece:
        # - no standalone "o"/"u": assume it is a tail common to all options;
        # - has "o"/"u" and contains `word`: take it as a complete variant of
        #   its own (conjunction removed) and store it right away;
        # - otherwise: there is no common tail and the piece stays in `res`.
        if not regex.search(r"\bo\b|\bu\b", res[-1]):
            last_part = res.pop()
        elif word in res[-1]:
            opt = regex.sub(r"\bo\b|\bu\b", "", res.pop()).strip()
            results.append(opt)
            last_part = ""
        else:
            last_part = ""

        rest = ",".join(res)
        # Recursive case: there is no common tail, `word` is accounted for
        # (a full variant was already stored, or it appears in the remaining
        # text) and no piece other than the last remaining one contains `word`.
        if (
            ((results) or (word in rest))
            and (not last_part)
            and (not any([word in r for r in res[:-1]]))
        ):
            # Collapse doubled conjunctions ("o o", "o u", ...) into a single " o "
            rest = regex.sub(r"((\bo\b|\bu\b)\s+(\bo\b|\bu\b))+", " o ", rest)
            possibilities = process_loc(rest, word)
            results.extend(possibilities)

        else:
            # Check what are the elements that form options (all that are joint by " o "):
            # each piece holding a conjunction, together with the piece before it.
            options = []
            for pos in range(1, len(res)):
                if regex.search(r"\bo\b|\bu\b", res[pos]):
                    if res[pos - 1] not in options:
                        options.append(res[pos - 1])
                    if res[pos] not in options:
                        options.append(res[pos])

            if options:
                # Join the pieces with " o ", collapse doubled conjunctions and
                # split again so that each element is one alternative.
                # ("".join on a str returns the same string, so it is a no-op here.)
                options = " o ".join(options)
                options = regex.sub(
                    r"((\bo\b|\bu\b)\s+(\bo\b|\bu\b))+", " o ", "".join(options)
                )
                options = regex.split(r"\bo\b|\bu\b", options)
            else:
                options = []

            # TODO: change this, so that the other way around (last part) is also satisfied
            # Set first part up to first "word" appearance
            # (the variable-length look-behind rejects any position that already has `word` before it)
            first_part = regex.search(rf".*(?<!{word}.*)(?={word})", "".join(res))
            if first_part:
                first_part = first_part.group(0)
            else:
                first_part = ""
            if (options) and (word in options[0]) and (word in last_part):
                # `word` is in both the first option and the tail: no shared prefix is added
                first_part = ""
            else:
                # In every other case the options are rebuilt from all remaining pieces
                options = " o ".join(res)
                options = regex.sub(
                    r"((\bo\b|\bu\b)\s+(\bo\b|\bu\b))+", " o ", "".join(options)
                )
                options = regex.split(r"\bo\b|\bu\b", options)

            if any(word in o for o in options):
                aux_options = []
                prefix = ""
                for o in options:
                    if word in o:
                        # Text of this option up to the end of `word`
                        # (extended to the next word boundary)
                        _prefix = regex.search(rf".*?{word}.*?\b", o)
                        if _prefix and word not in last_part:
                            prefix = _prefix.group(0)
                    # `prefix` carries over to later options that lack `word`,
                    # so they are prepended with the most recent prefix found.
                    if prefix:
                        aux_options.append(
                            prefix + " " + regex.sub(rf"^{prefix}", "", o)
                        )
                if aux_options:
                    options = aux_options

            # Debug helpers:
            # print("first_part: ", first_part)
            # print("options: ", options)
            # print("last_part:", last_part)

            # Then do all the possible combinations: ['hacerse agua en la boca', 'hacerse un agua en la boca']
            for o in options:
                # Remove the shared parts from the option before re-adding them,
                # so they are not duplicated, then normalise whitespace.
                result = (
                    first_part
                    + regex.sub(rf"{first_part}|{last_part}", "", o)
                    + last_part
                )
                result = regex.sub(r"\s+", " ", result).strip()
                results.append(result)
        return results


In [7]:
# # These expressions might not exists, they are just for testing purposses
# word = "agua"
# expresiones = [
#     "hacerse agua, o un agua, en la boca",
#     "hacer agua por las cacholas, o por los imbornales",
#     "echar, o tirar, agua",
#     "agua mineral, o agua natural",
#     "como agua, o como el agua, en el pez",
#     "romper aguas",
#     "agua del norte, agua falsa, o agua roja",
#     "bailar el coso del agua, o de las aguas",
#     "habérsele muerto agua, o no necesitar, o no tener, agua",
#     "hablar con, o de, agua, o hacer agua",
# ]
# print(*[f"{process_loc(exp, word)}\n" for exp in expresiones])


In [8]:
# with dir_data_RAE.joinpath("vocabulary.json").open("r", encoding="utf-8") as f:
#     vocabulary = json.load(f)


In [9]:
# word = "aconsejado"
# tree = get_xtree(url_detail, word, UA)
# definitions = tree.xpath(
#     '//*[@id="resultados"]/article/*[(contains(@class,"j"))]/*[@class="d"]/text()'
# )
# reference = tree.xpath('//*[@id="resultados"]/*/div[@class="n1"]/a/@data-eti')
# expressions = tree.xpath(
#     '//*[@id="resultados"]/article/*[(contains(@class,"k6")) or (contains(@class,"l"))]'
# )

# # It is a reference to word with definition (e.g. word in femenine references to same word in masculine)
# if not definitions and reference:
#     for r in reference:
#         ref_tree = get_xtree(url_detail, r, UA)
#         ref_definitions = ref_tree.xpath(
#             '//*[@id="resultados"]/article/*[(contains(@class,"j"))]/*[@class="d"]/text()'
#         )
#         if ref_definitions:
#             if any([d == "f." for d in ref_definitions]):
#                 # vocabulary[word] = word
#                 print(word, word)
#             else:
#                 # vocabulary[word] = r
#                 print(word, r)
#             break

# # word has its own meaning
# elif definitions:
#     if any([d == "f." for d in definitions]):
#         # vocabulary[word] = word
#         print(word, word)
#     else:
#         entry = tree.xpath('//*[@id="resultados"]/article/header')
#         if entry:
#             entry = regex.sub(r"\d", "", "".join(entry[0].itertext()).split(",")[0])
#             # vocabulary[word] = entry
#             print(word, entry)
#         else:
#             # vocabulary[word] = word
#             print(word, word)
#     # vocabulary[word] = word
#     # print(word, word)
# # word is just a reference to actual meaning
# else:
#     # vocabulary[word] = word
#     print(word, word)
# if expressions:
#     final_expr = []
#     expr = []
#     for el in expressions:
#         text_parts = el.xpath(
#             'text() | span[text()="o"] | span[text()="u"]  | u | em | a'
#         )
#         aux_expr = []
#         for p in text_parts:
#             if isinstance(p, str):
#                 aux_expr.append(regex.sub(r"\s.+", "", p))
#             else:
#                 aux_expr.append(regex.sub(r"\s.+", "", "".join(p.itertext())))
#         aux_expr = regex.sub(r"\s+", " ", "".join(aux_expr).strip())
#         # Exclude "Véase"
#         if not aux_expr.endswith("."):
#             expr.append(aux_expr)
#     # expr = [regex.sub(r"\s+", " ", "".join(el.xpath('text() | span[text()="o"]/text() | u/text()')).strip()) for el in expressions]
#     [final_expr.extend(process_loc(l, word)) for l in expr]
#     # print(final_expr)

In [10]:
# for el in expressions:
#     text_parts = el.xpath(
#         'text() | span[text()="o"] | span[text()="u"]  | u | em | a'
#     )
#     aux_expr = []
#     for p in text_parts:
#         if isinstance(p, str):
#             aux_expr.append(p)
#             # print(p)
#         else:
#             aux_expr.append(regex.sub(r",\s.+", " ", "".join(p.itertext())))
#             # print("".join(p.itertext()))
#     aux_expr = regex.sub(r"\s+", " ", "".join(aux_expr).strip())
#     print(aux_expr)
#     # print("".join(p), aux_expr)

In [26]:
# Load the word -> headword mapping saved by a previous run.
with open(dir_data_RAE / "vocabulary.json", "r", encoding="utf-8") as f:
    vocabulary = json.load(f)

In [29]:
# Build `vocabulary` (word -> headword) by looking up every word of
# total_palabras.txt in the dictionary, one request per word.
with dir_data_RAE.joinpath("total_palabras.txt").open("r", encoding="utf-8") as f:
    vocab = sorted(list(set([p.strip() for p in f.readlines()])))

# XPath of the definition texts of an entry page; used both for the word
# itself and for the entries it refers to.
xpath_definitions = (
    '//*[@id="resultados"]/article/*[(contains(@class,"j"))]/*[@class="d"]/text()'
)
# Position in `vocab` where processing starts. Earlier words are not looked up
# again (presumably a resume point from a previous run - confirm before re-running).
start_index = 104125
# Words whose page could not be downloaded (get_xtree returned None).
failed_words = []

initial = ""
# vocabulary = {}
t = trange(start_index, len(vocab), 1, desc="", leave=True)

for i in t:
    word = vocab[i]
    curr = word[0]
    # Show the current initial letter in the progress bar
    if initial != curr:
        initial = curr
        t.set_description(f"Processing: '{initial}'")
        t.refresh()

    # Words containing a hyphen are not looked up
    if "-" not in word:
        tree = get_xtree(url_detail, word, UA)
        if tree is None:
            # The download failed after all retries: remember the word and
            # move on instead of crashing on `None.xpath(...)` and losing
            # everything gathered since the last checkpoint.
            failed_words.append(word)
        else:
            definitions = tree.xpath(xpath_definitions)
            reference = tree.xpath(
                '//*[@id="resultados"]/*/div[@class="n1"]/a/@data-eti'
            )
            # Only used by the disabled expression extraction below
            expressions = tree.xpath(
                '//*[@id="resultados"]/article/*[(contains(@class,"k6")) or (contains(@class,"l"))]'
            )
            vease = tree.xpath(
                '//*[@id="resultados"]/article/*[(contains(@class,"l")) and (contains(abbr,"V."))]/a/text()'
            )
            if vease:
                vease = [regex.sub(r"\.", "", w) for w in vease]
            reference = reference + vease

            # It is a reference to word with definition (e.g. word in femenine references to same word in masculine)
            if not definitions and reference:
                # `reference` already contains `vease`, so those entries are
                # counted twice and come first in most_common()
                # (presumably intentional - confirm).
                for r, _count in Counter(vease + reference).most_common():
                    ref_tree = get_xtree(url_detail, r, UA)
                    if ref_tree is None:
                        # This reference could not be downloaded: try the next one
                        continue
                    ref_definitions = ref_tree.xpath(xpath_definitions)
                    if ref_definitions:
                        # A definition text equal to "f." keeps the word as its
                        # own entry (presumably the feminine-noun mark - confirm)
                        if any(d == "f." for d in ref_definitions):
                            vocabulary[word] = word
                        else:
                            vocabulary[word] = r
                        break

            # word has its own meaning
            elif definitions:
                if any(d == "f." for d in definitions):
                    vocabulary[word] = word
                else:
                    entry = tree.xpath('//*[@id="resultados"]/article/header')
                    if entry:
                        # Header text up to the first comma, with digits removed
                        entry = regex.sub(
                            r"\d", "", "".join(entry[0].itertext()).split(",")[0]
                        )
                        vocabulary[word] = entry
                    else:
                        vocabulary[word] = word

            # word is just a reference to actual meaning
            else:
                vocabulary[word] = word

            # # uses of that word in expressions
            # if expressions:
            #     final_expr = []
            #     expr = []
            #     for el in expressions:
            #         text_parts = el.xpath(
            #             'text() | span[text()="o"] | span[text()="u"]  | u | em | a'
            #         )
            #         aux_expr = []
            #         for p in text_parts:
            #             if isinstance(p, str):
            #                 aux_expr.append(p)
            #             else:
            #                 aux_expr.append(regex.sub(r",\s.+", " ", "".join(p.itertext())))
            #         aux_expr = regex.sub(r"\s+", " ", "".join(aux_expr).strip())
            #         # Exclude "Véase"
            #         if not aux_expr.endswith("."):
            #             expr.append(aux_expr)
            #     # expr = [regex.sub(r"\s+", " ", "".join(el.xpath('text() | span[text()="o"]/text() | u/text()')).strip()) for el in expressions]
            #     [final_expr.extend(process_loc(l, word)) for l in expr]
            #     with dir_data_RAE.joinpath("expresiones.txt").open(
            #         "a", encoding="utf-8"
            #     ) as f:
            #         f.writelines([f"{p}\n" for p in set(final_expr)])

    # Save vocab every 1000 words
    if not i % 1000:
        with dir_data_RAE.joinpath("vocabulary.json").open("w", encoding="utf-8") as f:
            json.dump(vocabulary, f)
with dir_data_RAE.joinpath("vocabulary.json").open("w", encoding="utf-8") as f:
    json.dump(vocabulary, f)

# Report the words that have to be retried
if failed_words:
    print(f"Could not fetch {len(failed_words)} word(s): {failed_words}")


  0%|          | 0/5255 [00:00<?, ?it/s]

In [23]:
# Write the current word -> headword mapping back to disk.
with open(dir_data_RAE / "vocabulary.json", "w", encoding="utf-8") as f:
    json.dump(vocabulary, f)

In [None]:
# Rewrite expresiones.txt without duplicates and in sorted order.
with open(dir_data_RAE / "expresiones.txt", "r", encoding="utf8") as f:
    expresiones = [line.strip() for line in f]
with open(dir_data_RAE / "expresiones.txt", "w", encoding="utf8") as f:
    f.writelines(f"{entry}\n" for entry in sorted(set(expresiones)))