In [47]:
from bs4 import BeautifulSoup, NavigableString
from urllib.request import urlopen
import re
import time

# python -m spacy download es_core_news_sm
import spacy
from spacy.lang.es.examples import sentences

In [48]:
nlp = spacy.load("es_core_news_sm")

## Spanish 1

In [2]:
soup_url = "https://en.wikiversity.org/wiki/Spanish_1"

In [3]:
def get_vocab_url(soup):
    """
    get a list of urls that lead to the vocabulary

    soup: (BeautifulSoup) an html parsed bs object

    return: (list) a list of urls
    """
    tag_list = soup.find("ul").findAll("a", {"href": re.compile(r"/wiki/Spanish_1/.*")})
    url_list = [
        soup_url + re.search(r'(/wiki/Spanish_1)(/.*)(" )', str(tag)).group(2)
        for tag in tag_list
        if "Linguistic_characteristics" not in str(tag)
    ]
    return url_list

In [4]:
soup = BeautifulSoup(urlopen(soup_url), "html.parser")
url_list = get_vocab_url(soup)

print(url_list)

['https://en.wikiversity.org/wiki/Spanish_1/Countries', 'https://en.wikiversity.org/wiki/Spanish_1/The_Basics', 'https://en.wikiversity.org/wiki/Spanish_1/Activities', 'https://en.wikiversity.org/wiki/Spanish_1/Adjectives', 'https://en.wikiversity.org/wiki/Spanish_1/School', 'https://en.wikiversity.org/wiki/Spanish_1/The_Classroom', 'https://en.wikiversity.org/wiki/Spanish_1/Food_%26_Drink', 'https://en.wikiversity.org/wiki/Spanish_1/Health', 'https://en.wikiversity.org/wiki/Spanish_1/Destinations', 'https://en.wikiversity.org/wiki/Spanish_1/Recreation_%26_Lifestyle', 'https://en.wikiversity.org/wiki/Spanish_1/Family_%26_Celebrations', 'https://en.wikiversity.org/wiki/Spanish_1/Adjectives_%26_In_a_restaurant', 'https://en.wikiversity.org/wiki/Spanish_1/The_Bedroom', 'https://en.wikiversity.org/wiki/Spanish_1/The_Household', 'https://en.wikiversity.org/wiki/Spanish_1/Shopping', 'https://en.wikiversity.org/wiki/Spanish_1/At_the_Mall', 'https://en.wikiversity.org/wiki/Spanish_1/Vacation',

In [14]:
def get_vocab(url):
    """
    get a list of vocabulary scraped from the url given

    url: (str) an url lead to a list of vocabulary

    return: (list) a list of Spanish words
    """
    soup = BeautifulSoup(urlopen(url), "html.parser")
    regex = re.compile(r"<b>(.*)</b>")  # extract Spanish words

    vocab_list = [
        regex.search(str(match.find("b")))
        .group(1)
        .lower()
        .split("=")[0]
        .split("-")[0]
        .strip()
        .strip(".,-=")
        .strip()
        for match in soup.findAll("li")  # find lists
        if not len(match.attrs) and match.find("b")
    ]

    
    # Expand and add most of the parenthetical constructions from the scraped vocabulary
    regex = re.compile(r"\((\w+)\)")
    for elem in vocab_list:
        search = regex.search(elem)

        if search:
            start = search.start(1)
            end = search.end(1)
            match = search.group(1)

            # Pre-strip (base case)
            elem_strip = elem[: start - 1] + elem[end + 1 :]

            # If the word in parenthesis occurs at the end of the string preceded by a space,
            # it is likely intended as extra info (eg., "baile (bailar)") and can be skipped
            if elem[start - 2] == " " and len(elem) == end + 1:
                continue

            # For constructions like "mi(s)" -> "mi" & "mis"; "tiene(n)" -> "tiene" & "tienen"
            elif match == "s" or match == "n" or match == "es":
                sub = elem[: start - 1] + match + elem[end + 1 :]

            # For constructions like "professor(ora)" -> "professor" & "professora"
            elif match == "ora":
                sub = elem[: start - len(match)] + match + elem[end + 1 :]

            # If the parentheses occur at the start of the string:
            # eg., "(los) estados unidos" -> "estados unidos" & "los estados unidos"
            elif start == 1:
                sub = match + elem[end + 1 :]
                elem_strip = elem[end + 2 :]

            # For constructions like "fantastico(a)" -> "fantastico" & "fantastica"
            elif match == "as" or match == "a" or match == "os" or match == "o":
                sub = elem[: start - 1 - len(match)] + match + elem[end + 1 :]

            # Else case: if parentheses occur somewhere in the middle of the string:
            # eg., "té (frío) helado" -> "té helado" & "té frío helado"
            else:
                sub = elem[: start - 1] + match + elem[end + 1 :]
                elem_strip = elem[: start - 2] + elem[end + 1 :]
            
            vocab_list.append(elem_strip)
            vocab_list.append(sub)

    # Add the constituent words in a multi-word phrase to the vocabulary
    for elem in vocab_list:
        if " " in elem:
            vocab_list.extend([e.strip("/,()") for e in elem.split(" ")])

    return [word for word in vocab_list if '(' not in word]

In [15]:
spanish1_vocab = []
for url in url_list:
    spanish1_vocab.extend(get_vocab(url))

In [16]:
print(len(set(spanish1_vocab)))

2115


In [19]:
set(spanish1_vocab)

{'',
 'preocupada',
 'vestido',
 'hiciste',
 'salir',
 'acampar',
 'hoy?',
 'intro',
 'media',
 'zanahorias',
 'almacén',
 'pasear en bote',
 'practicar artes marciales',
 'ciudad',
 'momento',
 'verdes',
 'setecientos',
 'nariz',
 'jugar en el parque',
 'futuro',
 'ordenado',
 'queso crema',
 'está buena',
 'cerca',
 'servirle?',
 'película policíaca',
 '¿de dónde eres?',
 'hija único',
 'monopatín',
 'cincuenta',
 'según',
 'navegar por la red',
 'sabéis',
 'ante',
 'carpeta de argollas',
 'bote',
 'sólo',
 'pongo',
 'boca',
 'chuleta de cerdo',
 'tienda',
 'razón',
 'guantes',
 'té helado',
 'melón',
 'aretes',
 'jugar al fútbol',
 'traje',
 'guapo',
 'galletas',
 'drama',
 'propia',
 'quince',
 'enviar un correo electrónico',
 'el horario',
 'compartís',
 'ropa interior',
 'pastel',
 'me',
 'pensamos',
 'hielo',
 'es necesario',
 'frambuesa',
 'setecientas',
 'policíaca',
 'página web',
 'no',
 'no me gusta',
 'unos pantalones',
 'montar en monopatín / patineta',
 'bebidas',
 'músi

## Spanish 2

In [20]:
soup_url = "https://en.wikiversity.org/wiki/Spanish_2"

In [21]:
# Remake function for use with Spanish 2 Wikiversity
def get_vocab_url(soup):
    """
    get a list of urls that lead to the vocabulary

    soup: (BeautifulSoup) an html parsed bs object

    return: (list) a list of urls
    """
    tag_list = soup.find("ul").findAll(
        "a", {"href": re.compile(r"/wiki/Spanish_2/Chapter.*")}
    )
    url_list = [
        soup_url + re.search(r'(/wiki/Spanish_2)(/Chapter.*)(" )', str(tag)).group(2)
        for tag in tag_list
    ]
    return url_list

In [22]:
soup = BeautifulSoup(urlopen(soup_url), "html.parser")
url_list = get_vocab_url(soup)

print(url_list)

['https://en.wikiversity.org/wiki/Spanish_2/Chapter_1_(Classroom_Events)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_2_(Free_Time)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_3_(Daily_Activities)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_4_(Fashion)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_5_(Errands)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_6_(On_the_Road)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_7_(Childhood)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_8_(Celebrations)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_9_(Emergencies)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_10_(Accidents)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_11_(Television)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_12_(Movies)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_13_(Cooking)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_14_(Picnics)', 'https://en.wikiversity.org/wiki/Spanish_2/Ch

In [23]:
spanish2_vocab = []
for url in url_list:
    spanish2_vocab.extend(get_vocab(url))

In [24]:
print(len(set(spanish2_vocab)))

1626


In [25]:
set(spanish2_vocab)

{'',
 'acampar',
 'cumplir años',
 'ibas',
 'leña',
 've',
 'bombero',
 'derecho',
 'darse',
 'recorrido',
 'nunca jamás',
 'con destino a',
 'dormí',
 'tempestad',
 'cuidado',
 'aplaudir',
 'recordar',
 'grapadora',
 'paramédico',
 'en seguida',
 'entendemos',
 'cepillo de dientes',
 'floja',
 'timida',
 'a causa de',
 'sabéis',
 'he estudiado',
 'competencia',
 'sólo',
 'tuyo',
 'reina',
 'llorar',
 'campeonato',
 'cielo',
 'chuleta de cerdo',
 'tienda',
 'pudimos',
 'empiezo',
 'por lo general',
 'vayas',
 '¿qué tal es...?',
 'maquinilla',
 'melón',
 'he visto',
 'traje',
 'compartimiento sobre la cabeza',
 'tuvieron',
 'artificiales',
 'agente de viajes',
 'camión',
 'pastel',
 'dolor',
 'me',
 'entusiasmada',
 'leí',
 'han',
 'enfermero',
 'tuviste',
 'no',
 'tomar',
 'empieza',
 'fregadero',
 'seca',
 'empleado',
 'aéreo',
 'regalar',
 'acostamos',
 'es...?',
 'entrada',
 'cómoda',
 'están',
 'quitar',
 'zorra',
 'accesible',
 'no vayas',
 'ha estudiado',
 'llevarse',
 'baja',
 '

## A-level vocab

In [26]:
a_vocab = set(spanish1_vocab) | set(spanish2_vocab)
print(len(a_vocab))

3413


## B-level text

In [34]:
soup_url = 'https://press.rebus.community/aalh/'

In [35]:
def get_text_links(soup):
    '''
    get a list of urls that lead to the texts
    
    soup: (BeautifulSoup) an html parsed bs object
    
    return: (list) a list of urls
    '''
    tag_list = soup.findAll("li", {"id" : re.compile(r'toc-chapter-.*')})
    url_list = []
    is_text = False
    for tag in tag_list:
        if tag.find('a').contents[0] == 'Introducción':
            is_text = True
            continue
        elif is_text == True:
            url_list.append(tag.find('a').attrs['href'])
            is_text = False
            
    return url_list

In [36]:
soup = BeautifulSoup(urlopen(soup_url), 'html.parser')
url_list = get_text_links(soup)

print(url_list)

['https://press.rebus.community/aalh/chapter/lo-que-sucedio-a-un-mancebo-el-dia-que-se-caso/', 'https://press.rebus.community/aalh/chapter/carta-a-luis-de-santangel/', 'https://press.rebus.community/aalh/chapter/tercera-carta-relacion-de-hernan-cortes-al-emperador/', 'https://press.rebus.community/aalh/chapter/romance-del-enamorado-y-la-muerte/', 'https://press.rebus.community/aalh/chapter/la-vida-de-lazarillo-de-tormes-y-de-sus-fortunas-y-adversidades/', 'https://press.rebus.community/aalh/chapter/historia-de-la-monja-alferez/', 'https://press.rebus.community/aalh/chapter/a-su-retrato/', 'https://press.rebus.community/aalh/chapter/viaje-a-la-habana-carta-1/', 'https://press.rebus.community/aalh/chapter/la-flor-de-la-cana/', 'https://press.rebus.community/aalh/chapter/al-partir/', 'https://press.rebus.community/aalh/chapter/emancipacion-moral-de-la-mujer/', 'https://press.rebus.community/aalh/chapter/la-rosa-de-pasion/', 'https://press.rebus.community/aalh/chapter/las-medias-rojas/', '

In [107]:
def get_text(url):
    '''
    get a dictionary of the content and metadata of a text scraped from the url given
    
    url: (str) an url lead to a text
    
    return: (dict) a dictionary with 3 keys: 'author', 'title', and 'content'
    '''
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    text_dict = {}
    
    text_dict['title'] = soup.find('h1', {'class':'entry-title'}).contents[0].strip()
    for child in soup.find('h2').contents:
        if isinstance(child, NavigableString):
            text_dict['author'] = child.strip()
            break
    
    content_section = soup.find('section', {'data-type':'chapter'}).findAll('p')
    content = ''
    for tag in content_section:
        paragraph = []
        for child in tag.contents:
            if isinstance(child, NavigableString):
                paragraph.append(child)
        content += ''.join(paragraph) + '\n\n'
    
    text_dict['content'] = content.split('Preguntas de discusión')[0].strip()
    return text_dict

In [108]:
aalh_corpus = []
for url in url_list:
    aalh_corpus.append(get_text(url))
    time.sleep(1)

In [129]:
aalh_corpus[1]

{'title': 'Carta a Luis de Santángel',
 'author': 'Cristóbal Colón',
 'content': 'Señor, porque sé que habréis placer de la gran victoria que Nuestro Señor me ha dado en mi viaje, vos escribo ésta, por la cual sabréis cómo en 33 días pasé de las islas de Canaria a las Indias con la armada que los ilustrísimos rey y reina nuestros señores me dieron, donde yo hallé muy muchas islas pobladas con gente sin número; y de ellas todas he tomado posesión por Sus Altezas con pregón y bandera real extendida, y no me fue contradicho.\n\nA la primera que yo hallé puse nombre San Salvador a conmemoración de Su Alta Majestad, el cual maravillosamente todo esto ha dado; los Indios la llaman Guanahaní; a la segunda puse nombre la isla de Santa María de Concepción; a la tercera Fernandina; a la cuarta la Isabela; a la quinta la isla Juana, y así a cada una nombre nuevo.\n\nCuando yo llegué a la Juana, seguí yo la costa de ella al poniente, y la fallé tan grande que pensé que sería tierra firme, la provi

## Explore B-level text

In [53]:
from spacy.lang.es.stop_words import STOP_WORDS

In [138]:
def get_A_proportion(text):
    doc = nlp(text)
    a_count = 0
    tok_count = 0
    
    for tok in doc:
        tok_pos = tok.pos_
        tok = tok.text.lower()
        
        if tok in STOP_WORDS or tok == ' ' or tok == '\n' or tok == '\xa0' or tok_pos == 'PUNCT':
            continue
        elif tok in a_vocab:
            a_count += 1
        
        tok_count += 1
    
    return a_count / tok_count, tok_count

In [139]:
# percentage of a-level vocab in b-level text
for text in aalh_corpus:
    print(get_A_proportion(text['content'])[1], get_A_proportion(text['content'])[0])
    print()

748 0.47593582887700536

1201 0.3755203996669442

1313 0.3069306930693069

93 0.43010752688172044

2130 0.36807511737089205

934 0.3811563169164882

56 0.19642857142857142

1134 0.3350970017636684

181 0.2983425414364641

64 0.296875

495 0.2727272727272727

1570 0.29044585987261146

470 0.3

3235 0.15054095826893354

2190 0.2452054794520548

208 0.28365384615384615

176 0.3125

939 0.3844515441959531

660 0.30606060606060603

36 0.4166666666666667

1494 0.2643908969210174



## A-level text

In [140]:
###################################### A1-level texts ###################################
#### stories stored in str_story_list as string


wb_address = (
    "https://www.gutenberg.org/files/22065/22065-h/22065-h.htm#EL_CUENTO_DEL_POLLO"
)
f = urlopen(wb_address)
html_soup = BeautifulSoup(f, "lxml")
for node in html_soup.find_all("h2"):
    if "class" in node.attrs.keys() and node["class"] == ["e"]:
        first_node = node
        break

story_list = []
next_node = first_node
while next_node != None:
    story = u""
    next_node = None
    # print(first_node.get_text())
    for tag in first_node.next_siblings:
        # print(str(tag))
        if tag.name == "h2" and "class" in tag.attrs.keys() and tag["class"] == ["e"]:
            story_list.append(story)
            first_node = tag
            next_node = tag
            # print(tag.get_text())
            break
        else:
            story += str(tag)

next_node = first_node
while next_node != None:
    story = u""
    try:
        story += first_node.get_text()
    except:
        story += str(first_node)
    next_node = None
    for tag in first_node.next_siblings:
        # print(str(tag))
        if tag.name == "p" and "class" in tag.attrs.keys() and tag["class"] == ["f"]:
            story_list.append(story)
            first_node = tag
            next_node = tag
            story += str(tag)
            # print(tag.get_text())
            break
        else:
            story += str(tag)

str_story_list = []
for i in range(len(story_list)):
    story_soup = BeautifulSoup(story_list[i], "lxml")
    str_story = story_soup.get_text()
    str_story_list.append(str_story.replace("\xa0", ""))

FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?