In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import time

# A1

## Spanish 1

In [2]:
soup_url = "https://en.wikiversity.org/wiki/Spanish_1"

In [3]:
def get_vocab_url(soup):
    """
    get a list of urls that lead to the vocabulary

    soup: (BeautifulSoup) an html parsed bs object

    return: (list) a list of urls
    """
    tag_list = soup.find("ul").findAll("a", {"href": re.compile(r"/wiki/Spanish_1/.*")})
    url_list = [
        soup_url + re.search(r'(/wiki/Spanish_1)(/.*)(" )', str(tag)).group(2)
        for tag in tag_list
        if "Linguistic_characteristics" not in str(tag)
    ]
    return url_list

In [4]:
soup = BeautifulSoup(urlopen(soup_url), "html.parser")
url_list = get_vocab_url(soup)

print(url_list)

['https://en.wikiversity.org/wiki/Spanish_1/Countries', 'https://en.wikiversity.org/wiki/Spanish_1/The_Basics', 'https://en.wikiversity.org/wiki/Spanish_1/Activities', 'https://en.wikiversity.org/wiki/Spanish_1/Adjectives', 'https://en.wikiversity.org/wiki/Spanish_1/School', 'https://en.wikiversity.org/wiki/Spanish_1/The_Classroom', 'https://en.wikiversity.org/wiki/Spanish_1/Food_%26_Drink', 'https://en.wikiversity.org/wiki/Spanish_1/Health', 'https://en.wikiversity.org/wiki/Spanish_1/Destinations', 'https://en.wikiversity.org/wiki/Spanish_1/Recreation_%26_Lifestyle', 'https://en.wikiversity.org/wiki/Spanish_1/Family_%26_Celebrations', 'https://en.wikiversity.org/wiki/Spanish_1/Adjectives_%26_In_a_restaurant', 'https://en.wikiversity.org/wiki/Spanish_1/The_Bedroom', 'https://en.wikiversity.org/wiki/Spanish_1/The_Household', 'https://en.wikiversity.org/wiki/Spanish_1/Shopping', 'https://en.wikiversity.org/wiki/Spanish_1/At_the_Mall', 'https://en.wikiversity.org/wiki/Spanish_1/Vacation',

In [5]:
def get_vocab(url):
    """
    get a list of vocabulary scraped from the url given

    url: (str) an url lead to a list of vocabulary

    return: (list) a list of Spanish words
    """
    soup = BeautifulSoup(urlopen(url), "html.parser")
    regex = re.compile(r"<b>(.*)</b>")  # extract Spanish words

    vocab_list = [
        regex.search(str(match.find("b")))
        .group(1)
        .lower()
        .split("=")[0]
        .split("-")[0]
        .strip()
        .strip(".,-=")
        .strip()
        for match in soup.findAll("li")  # find lists
        if not len(match.attrs) and match.find("b")
    ]

    
    # Expand and add most of the parenthetical constructions from the scraped vocabulary
    regex = re.compile(r"\((\w+)\)")
    for elem in vocab_list:
        search = regex.search(elem)

        if search:
            start = search.start(1)
            end = search.end(1)
            match = search.group(1)

            # Pre-strip (base case)
            elem_strip = elem[: start - 1] + elem[end + 1 :]

            # If the word in parenthesis occurs at the end of the string preceded by a space,
            # it is likely intended as extra info (eg., "baile (bailar)") and can be skipped
            if elem[start - 2] == " " and len(elem) == end + 1:
                continue

            # For constructions like "mi(s)" -> "mi" & "mis"; "tiene(n)" -> "tiene" & "tienen"
            elif match == "s" or match == "n" or match == "es":
                sub = elem[: start - 1] + match + elem[end + 1 :]

            # For constructions like "professor(ora)" -> "professor" & "professora"
            elif match == "ora":
                sub = elem[: start - len(match)] + match + elem[end + 1 :]

            # If the parentheses occur at the start of the string:
            # eg., "(los) estados unidos" -> "estados unidos" & "los estados unidos"
            elif start == 1:
                sub = match + elem[end + 1 :]
                elem_strip = elem[end + 2 :]

            # For constructions like "fantastico(a)" -> "fantastico" & "fantastica"
            elif match == "as" or match == "a" or match == "os" or match == "o":
                sub = elem[: start - 1 - len(match)] + match + elem[end + 1 :]

            # Else case: if parentheses occur somewhere in the middle of the string:
            # eg., "té (frío) helado" -> "té helado" & "té frío helado"
            else:
                sub = elem[: start - 1] + match + elem[end + 1 :]
                elem_strip = elem[: start - 2] + elem[end + 1 :]

            vocab_list.append(elem_strip)
            vocab_list.append(sub)

    # Add the constituent words in a multi-word phrase to the vocabulary
    for elem in vocab_list:
        if " " in elem:
            vocab_list.extend([e.strip("/,()") for e in elem.split(" ")])

    return vocab_list

In [6]:
spanish1_vocab = []
for url in url_list:
    spanish1_vocab.extend(get_vocab(url))

In [7]:
print(len(set(spanish1_vocab)))

2298


In [8]:
set(spanish1_vocab)

{'',
 'cara a cara',
 'hacer ejercicio',
 'seiscientos(as)',
 'haces',
 'ochocientos(as)',
 'carpeta de argollas',
 '¡qué asco!',
 'comedor',
 'cuarto',
 'corta',
 '¿cuántos(as)?',
 'nadar',
 'está bueno(a)',
 'pizza',
 'artística',
 'setecientos',
 'son las dos',
 'cero',
 'por favor',
 'tener',
 'encanta(n',
 'botella de vidrio',
 'bigote',
 'puede',
 'alto(a)',
 'ayudar',
 'cebolla',
 'video',
 'composición',
 'disteis',
 'buen mozo(a)',
 'acabar de',
 'en',
 'tienen ... años',
 'cuarta',
 'talentoso(a)',
 'me',
 'simpática',
 'cantar',
 'décimo(a)',
 'música clásica',
 'montañas',
 'delfín',
 'veinticinco',
 'sucio(a)',
 '(los) estados unidos',
 'pasar la aspiradora',
 'caminar',
 '¿y',
 'estás',
 'jugar al ajedrez',
 'ambos',
 'cereza',
 'fuisteis',
 'coleccionar',
 'tocáis',
 'madre (mamá)',
 'lo siento',
 'lección de piano',
 'julio',
 'roja',
 'verdulería',
 'tremenda',
 'terminar',
 'construcción',
 'pasado',
 'coliflor',
 'churro',
 'lacio',
 'durante',
 'hermosa',
 'dar',
 '

## Spanish 2

In [9]:
soup_url = "https://en.wikiversity.org/wiki/Spanish_2"

In [10]:
# Remake function for use with Spanish 2 Wikiversity
def get_vocab_url(soup):
    """
    get a list of urls that lead to the vocabulary

    soup: (BeautifulSoup) an html parsed bs object

    return: (list) a list of urls
    """
    tag_list = soup.find("ul").findAll(
        "a", {"href": re.compile(r"/wiki/Spanish_2/Chapter.*")}
    )
    url_list = [
        soup_url + re.search(r'(/wiki/Spanish_2)(/Chapter.*)(" )', str(tag)).group(2)
        for tag in tag_list
    ]
    return url_list

In [11]:
soup = BeautifulSoup(urlopen(soup_url), "html.parser")
url_list = get_vocab_url(soup)

print(url_list)

['https://en.wikiversity.org/wiki/Spanish_2/Chapter_1_(Classroom_Events)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_2_(Free_Time)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_3_(Daily_Activities)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_4_(Fashion)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_5_(Errands)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_6_(On_the_Road)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_7_(Childhood)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_8_(Celebrations)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_9_(Emergencies)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_10_(Accidents)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_11_(Television)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_12_(Movies)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_13_(Cooking)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_14_(Picnics)', 'https://en.wikiversity.org/wiki/Spanish_2/Ch

In [12]:
spanish2_vocab = []
for url in url_list:
    spanish2_vocab.extend(get_vocab(url))

In [13]:
print(len(set(spanish2_vocab)))

1754


In [14]:
set(spanish2_vocab)

{'',
 'mediano(a)',
 'sirva',
 'picadura',
 'dejar',
 'de niña',
 'olla',
 'cepillo',
 'grapadora',
 'cantante',
 'nochebuena',
 'devuelto',
 'inmediatamente',
 'seguridad',
 'brindar',
 'mirasteis',
 'tener',
 'puede',
 'paramédica',
 'empiezo',
 'alto(a)',
 'auxiliar',
 'en',
 'millón/milliones',
 'me',
 'algunos(as',
 'yeso',
 'has',
 'enfermero',
 'gel',
 '¡qué lástima!',
 'dicho',
 'insistir en',
 'estás',
 'fuisteis',
 'cereza',
 'centro de salud',
 'coleccionar',
 'competencia',
 '¡basta!',
 'tan',
 'escriba',
 'almorcé',
 'clara',
 'tuvo',
 'apartamentos',
 'dé',
 'tantos(as) + <i>noun</i> + como',
 'ducha',
 'fresca',
 'entusiasmado',
 'pasado',
 'jugabais',
 'muñeco',
 'crítico(a)',
 'sacar (un lápiz, una hoja de papel)',
 'quemarse',
 'dar',
 'esté',
 'agitada',
 'alegre',
 'tobogán',
 'hacíamos',
 'monstruo',
 'sacar una radiografía',
 'roto',
 'salsa',
 'dormida',
 'no',
 'repitiendo',
 'cerrada',
 'ibais',
 '¿de qué está hecho(a)?',
 'pasajero',
 'complicado(a)',
 'quedar

In [15]:
a1_vocab = set(spanish1_vocab) | set(spanish2_vocab)

In [16]:
len(a1_vocab)

3717

## Building Vocabulary from Frequency Lists

In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

##### Cleaned up (standardized) version of the 5k words list in https://en.wiktionary.org/wiki/User:Matthias_Buchmeier/Spanish_frequency_list-1-5000

In [18]:
urls = [
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish1000",
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish1001-2000",
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish2001-3000",
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish3001-4000",
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish4001-5000",
]

In [19]:
df_list = []
for url in urls:
    html = requests.get(url).content
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("tbody")
    for row in table.find_all("tr")[1:]:
        row = row.text.lower().strip().split("\n")
        df_list.append(list(filter(lambda e: e != "", row))[1:])
df = pd.DataFrame(df_list, columns = ["word", "occurrences", "lemma"])
print(df)

             word occurrences             lemma
0             que       32894               que
1              de       32116                de
2              no       29897                no
3               a       22313                 a
4              la       21127                la
...           ...         ...               ...
4995    excepción          12         excepción
4996      espadas          12            espada
4997     engañado          12  engañado engañar
4998      débiles          12             débil
4999  corporación          12       corporación

[5000 rows x 3 columns]


In [22]:
N = 500
print(f"What fraction of words in the top-{N} most frequent words already appear in our scraped vocabulary from Spanish 1 & 2?")
print((N - (len(a1_vocab | set(df["word"][:N])) - len(a1_vocab)))/N)

What fraction of words in the top-500 most frequent words already appear in our scraped vocabulary from Spanish 1 & 2?
1.0


In [23]:
a1_vocab = a1_vocab | set(df["word"][:500])
len(a1_vocab)

3857