<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"></ul></div>

In [1]:
import sys
from bs4 import BeautifulSoup
from urllib.request import urlopen, HTTPError
from urllib.parse import quote
import pdb
import pandas as pd

In [2]:
# Wiktionary page URL template; `{word}` is filled in with the URL-quoted
# headword by get_bs().
URL = "https://en.wiktionary.org/wiki/{word}"

In [3]:
def get_bs(word):
    """Download the Wiktionary page for `word` and parse it.

    The word is percent-encoded with `quote` before being substituted
    into the `URL` template, so non-ASCII headwords are handled.

    Returns:
        A `BeautifulSoup` document built with the "lxml" parser, or
        `None` when the request fails with an `HTTPError`.
    """
    page_url = URL.format(word=quote(word))
    try:
        with urlopen(page_url) as response:
            return BeautifulSoup(response.read(), "lxml")
    except HTTPError:
        # Callers must be prepared for a missing document.
        return None

In [4]:
def get_elements(parser):
    """Collect the page elements that make up the Serbo-Croatian section.

    Starting from the parent of the ``<span id="Serbo-Croatian">`` node,
    following siblings are gathered until an ``<hr>`` element (or the end
    of the sibling chain) is reached. The ``<hr>`` itself is not included.

    Args:
        parser: The parsed page as returned by `get_bs`; may be `None`
            when the download failed.

    Returns:
        A list of elements, beginning with the section heading's parent.

    Raises:
        ValueError: If `parser` is `None` or the page has no
            "Serbo-Croatian" section. Previously both cases surfaced as
            an opaque `AttributeError` on `None`.
    """
    if parser is None:
        raise ValueError("No page to parse (parser is None).")

    heading = parser.find("span", id="Serbo-Croatian")
    if heading is None:
        raise ValueError('No "Serbo-Croatian" section found on the page.')

    elements = []
    current = heading.parent
    while current is not None and current.name != "hr":
        elements.append(current)
        current = current.find_next_sibling()
    return elements

In [5]:
def get_word_data(elements):
    """Walk the section elements and parse the Noun or Verb entry.

    Each element's text (with a trailing "[edit]" removed) is treated as
    a section title. Known non-lexical sections are skipped together with
    the single element that follows them; "Noun" and "Verb" hand the
    iterator over to `process_noun` / `process_verb`. Any other text is
    reported on stdout as an unknown element.

    Args:
        elements: Iterable of objects exposing a `.text` attribute, in
            page order (as produced by `get_elements`).

    Returns:
        The value returned by the last Noun/Verb handler that completed,
        or `None` if no such section was processed. (Previously the
        latter case raised `UnboundLocalError`.)
    """
    edit_suffix = "[edit]"
    # Sections whose title is followed by one body element to discard.
    skipped_sections = {
        "Etymology",
        "Pronunciation",
        "Derived terms",
        "Alternative forms",
        "References",
    }

    result = None
    iterator = iter(elements)
    try:
        for element in iterator:
            text = element.text
            if text.endswith(edit_suffix):
                text = text[:-len(edit_suffix)]

            if text == "Serbo-Croatian":
                continue
            if text in skipped_sections:
                next(iterator)
            elif text == "Noun":
                result = process_noun(iterator)
            elif text == "Verb":
                result = process_verb(iterator)
            else:
                print(f"Unknown element: {text}")
    except StopIteration:
        # Raised by an explicit next() (here or inside a handler) when the
        # elements run out mid-section; keep whatever was parsed so far.
        pass

    return result

In [6]:
def process_noun(iterator):
    """Consume a noun entry from `iterator`.

    Four elements are taken in order: the headword line, the meanings,
    one element that is discarded, and the element handed to
    `parse_declension`.

    Returns:
        A `(word, meanings, declension)` tuple where the first two items
        are the `.text` of their elements.
    """
    headword_element = next(iterator)
    word = headword_element.text

    meanings_element = next(iterator)
    meanings = meanings_element.text

    next(iterator)  # element between the meanings and the table is ignored

    table_element = next(iterator)
    return word, meanings, parse_declension(table_element)

In [7]:
def parse_declension(element):
    """Turn the text of a declension table element into a DataFrame.

    `element.text` is stripped and split on runs of newlines; the first
    field is dropped (presumably the table caption - TODO confirm). Two
    layouts are recognised by the number of remaining fields:

    * 23 fields: two header fields followed by seven rows of
      (case, singular, plural).
    * 15 fields: one header field followed by seven rows of
      (case, form); the single column is always labelled "plural".

    Returns:
        A DataFrame indexed by the case names with the column(s) above.

    Raises:
        ValueError: If the field count is neither 23 nor 15.
    """
    import re

    fields = re.split(r"\n+", element.text.strip())[1:]

    if len(fields) == 23:
        header_count, labels = 2, ["singular", "plural"]
    elif len(fields) == 15:
        header_count, labels = 1, ["plural"]
    else:
        raise ValueError(f"Fields length is {len(fields)}.")

    # Each row is the case name followed by one value per column.
    stride = len(labels) + 1
    body = fields[header_count:]
    case_names = body[0::stride]
    data = {
        label: body[offset::stride]
        for offset, label in enumerate(labels, start=1)
    }

    return pd.DataFrame(data=data, index=case_names, columns=labels)

In [8]:
def process_verb(iterator):
    """Consume a verb entry from `iterator`.

    Takes, in order: the headword line, the meanings, one element that is
    discarded, and the element handed to `parse_conjugation`. Up to two
    further elements are then skipped.

    The trailing skips are tolerant of an exhausted iterator. Previously
    they raised `StopIteration` when the verb section was the last thing
    on the page, which discarded the already parsed data.

    Returns:
        A `(word, meanings, conjugation)` tuple, where `conjugation` is
        whatever `parse_conjugation` returns.

    Raises:
        StopIteration: If the iterator runs out before the conjugation
            element has been read.
    """
    word = next(iterator).text
    meanings = next(iterator).text
    next(iterator)  # element between the meanings and the table is ignored
    conjugation = parse_conjugation(next(iterator))

    # Skip the two elements after the table if present
    # (presumably table footnotes - TODO confirm).
    next(iterator, None)
    next(iterator, None)
    return word, meanings, conjugation

In [9]:
def parse_conjugation(element):
    """Parse a verb conjugation table into a Series and a DataFrame.

    The text of `element.contents[3]` is stripped, split on runs of
    newlines, and consumed strictly in order:

    1. Four "label: value" fields giving the infinitive, the two verbal
       adverbs and the verbal noun.
    2. Seventeen fields that are discarded (presumably the table header
       cells - TODO confirm).
    3. Nine blocks of seven fields, one per tense/mood. The first field
       of each block is dropped and the remaining six become the
       person/number forms. Single fields are skipped before
       "Future I" and before "Perfect". The characters "1" / "2" are
       replaced in some blocks (looks like footnote-marker cleanup -
       TODO confirm).

    Returns:
        `(special, table)`: a Series of the four non-finite forms and a
        DataFrame with one column per tense, indexed by a
        (Singular/Plural) x (1st/2nd/3rd) MultiIndex.
    """
    import re
    from collections import OrderedDict

    iterator = iter(re.split(r"\n+", element.contents[3].text.strip()))

    def take(count):
        # List comprehension on purpose: StopIteration must propagate
        # unchanged if the text is shorter than expected.
        return [next(iterator) for _ in range(count)]

    special_labels = ["Infinitive", "Present verbal adverb", "Past verbal adverb", "Verbal noun"]
    special_values = [raw.split(": ")[1] for raw in take(4)]
    special = pd.Series(special_values, index=special_labels)

    take(17)

    # (column name, fields to skip first, substring to replace, replacement)
    plan = (
        ("Present", 0, None, None),
        ("Future I", 1, "1", "; "),
        ("Future II", 0, "2", ""),
        ("Perfect", 1, "2", ""),
        ("Pluperfect", 0, "2", ""),
        ("Imperfect", 0, "2", ""),
        ("Conditional I", 0, "2", ""),
        ("Conditional II", 0, "2", ""),
        ("Imperative", 0, None, None),
    )

    values = OrderedDict()
    for name, skip, old, new in plan:
        take(skip)
        forms = take(7)[1:]
        if old is not None:
            forms = [form.replace(old, new) for form in forms]
        values[name] = forms

    person_index = pd.MultiIndex.from_product([["Singular", "Plural"], ["1st", "2nd", "3rd"]])

    table = pd.DataFrame(values, index=person_index)

    return special, table

In [10]:
# Fetch and parse the "kocka" page. For a noun entry the result is the
# (word, meanings, declension) tuple returned by process_noun.
kocka = get_word_data(get_elements(get_bs("kocka")))

In [11]:
# Render the declension table (third item of the parsed tuple) as LaTeX.
print(kocka[2].to_latex())
# NOTE(review): bare expression; its only effect is echoing the function's
# repr as the cell output. Looks like leftover introspection.
pd.DataFrame.to_latex

\begin{tabular}{lll}
\toprule
{} & singular &          plural \\
\midrule
nominative   &    kȍcka &           kocke \\
genitive     &    kocke &  kȍcākā / kȍckī \\
dative       &    kocki &         kockama \\
accusative   &    kocku &           kocke \\
vocative     &    kocko &           kocke \\
locative     &    kocki &         kockama \\
instrumental &   kockom &         kockama \\
\bottomrule
\end{tabular}



<function pandas.core.generic.NDFrame.to_latex(self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, bold_rows=False, column_format=None, longtable=None, escape=None, encoding=None, decimal='.', multicolumn=None, multicolumn_format=None, multirow=None)>

In [12]:
# Parse "vrata" and print each item of the result separated by a blank line.
print(*get_word_data(get_elements(get_bs("vrata"))), sep="\n\n")

vráta n pl (Cyrillic spelling вра́та)


(plural only) door

               plural
nominative      vrata
genitive        vrata
dative        vratima
accusative      vrata
vocative        vrata
locative      vratima
instrumental  vratima


In [13]:
# Parse "cvijeće" and print each item of the result separated by a blank line.
print(*get_word_data(get_elements(get_bs("cvijeće"))), sep="\n\n")

cvijȇće n (Cyrillic spelling цвије̑ће)


(collectively) flowers

                plural
nominative     cvijeće
genitive       cvijeća
dative         cvijeću
accusative     cvijeće
vocative       cvijeće
locative       cvijeću
instrumental  cvijećem


In [14]:
# Parse the verb "rasti" and print each item of the result; the last item is
# the (special, table) pair produced by parse_conjugation.
print(*get_word_data(get_elements(get_bs("rasti"))), sep="\n\n")

rȃsti impf (Cyrillic spelling ра̑сти)


(intransitive) to grow
(intransitive) to increase
(intransitive) to accrue, to accumulate

(Infinitive                 rasti
Present verbal adverb    rástūći
Past verbal adverb             —
Verbal noun                    —
dtype: object,               Present            Future I     Future II     Perfect  \
Singular 1st   rastem      rast ću; rašću  budem rastao  rastao sam   
         2nd   rasteš    rast ćeš; rašćeš  budeš rastao   rastao si   
         3rd    raste      rast će; rašće   bude rastao   rastao je   
Plural   1st  rastemo  rast ćemo; rašćemo  budemo rasli   rasli smo   
         2nd  rastete  rast ćete; rašćete  budete rasli   rasli ste   
         3rd    rastu      rast će; rašće    budu rasli    rasli su   

                  Pluperfect   Imperfect Conditional I    Conditional II  \
Singular 1st  bio sam rastao    rastijah    rastao bih    bio bih rastao   
         2nd   bio si rastao   rastijaše     rastao bi     bio bi rasta

In [15]:
# Keep the parsed "rasti" entry for the cells below.
# NOTE(review): this downloads the same page a second time; the previous
# cell's call could reuse this variable instead.
rasti = get_word_data(get_elements(get_bs("rasti")))

In [16]:
# rasti[2] is the (special, table) pair; [1] selects the conjugation table.
# Render only its "Present" column as LaTeX.
print(rasti[2][1][["Present"]].to_latex())

\begin{tabular}{lll}
\toprule
       &     &  Present \\
\midrule
Singular & 1st &   rastem \\
       & 2nd &   rasteš \\
       & 3rd &    raste \\
Plural & 1st &  rastemo \\
       & 2nd &  rastete \\
       & 3rd &    rastu \\
\bottomrule
\end{tabular}



In [17]:
# LaTeX for the non-finite forms (the Series), then the conjugation table
# split into two groups of columns.
print(rasti[2][0].to_latex())
print(rasti[2][1][["Present", "Perfect", "Future I", "Pluperfect"]].to_latex())
print(rasti[2][1][["Imperfect", "Future II", "Conditional I", "Conditional II", "Imperative"]].to_latex())

\begin{tabular}{ll}
\toprule
{} &        0 \\
\midrule
Infinitive            &    rasti \\
Present verbal adverb &  rástūći \\
Past verbal adverb    &        — \\
Verbal noun           &        — \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
       &     &  Present &     Perfect &            Future I &      Pluperfect \\
\midrule
Singular & 1st &   rastem &  rastao sam &      rast ću; rašću &  bio sam rastao \\
       & 2nd &   rasteš &   rastao si &    rast ćeš; rašćeš &   bio si rastao \\
       & 3rd &    raste &   rastao je &      rast će; rašće &   bio je rastao \\
Plural & 1st &  rastemo &   rasli smo &  rast ćemo; rašćemo &  bili smo rasli \\
       & 2nd &  rastete &   rasli ste &  rast ćete; rašćete &  bili ste rasli \\
       & 3rd &    rastu &    rasli su &      rast će; rašće &   bili su rasli \\
\bottomrule
\end{tabular}

\begin{tabular}{lllllll}
\toprule
       &     &   Imperfect &     Future II & Conditional I &    Conditional II & Imperative \\
\midrul

In [18]:
# Display the raw (special, table) pair returned by parse_conjugation.
rasti[2]

(Infinitive                 rasti
 Present verbal adverb    rástūći
 Past verbal adverb             —
 Verbal noun                    —
 dtype: object,
               Present            Future I     Future II     Perfect  \
 Singular 1st   rastem      rast ću; rašću  budem rastao  rastao sam   
          2nd   rasteš    rast ćeš; rašćeš  budeš rastao   rastao si   
          3rd    raste      rast će; rašće   bude rastao   rastao je   
 Plural   1st  rastemo  rast ćemo; rašćemo  budemo rasli   rasli smo   
          2nd  rastete  rast ćete; rašćete  budete rasli   rasli ste   
          3rd    rastu      rast će; rašće    budu rasli    rasli su   
 
                   Pluperfect   Imperfect Conditional I    Conditional II  \
 Singular 1st  bio sam rastao    rastijah    rastao bih    bio bih rastao   
          2nd   bio si rastao   rastijaše     rastao bi     bio bi rastao   
          3rd   bio je rastao   rastijaše     rastao bi     bio bi rastao   
 Plural   1st  bili smo rasli  ras

In [19]:
# Emit the meanings (second item of the parsed tuple) as a LaTeX itemize
# list, one \item per line of text.
print(r"\begin{itemize}")
print(*[r"\item " + r for r in rasti[1].splitlines()], sep="\n")
print(r"\end{itemize}")

\begin{itemize}
\item (intransitive) to grow
\item (intransitive) to increase
\item (intransitive) to accrue, to accumulate
\end{itemize}


In [20]:
# Print the headword line (first item of the parsed tuple).
print(rasti[0])

rȃsti impf (Cyrillic spelling ра̑сти)

