In [1]:
from bs4 import BeautifulSoup, NavigableString
from urllib.request import urlopen
import re
import time

# python -m spacy download es_core_news_sm
import spacy
from spacy.lang.es.examples import sentences

In [2]:
nlp = spacy.load("es_core_news_sm")

## Spanish 1

In [3]:
soup_url = "https://en.wikiversity.org/wiki/Spanish_1"

In [4]:
def get_vocab_url(soup):
    """
    get a list of urls that lead to the vocabulary

    soup: (BeautifulSoup) an html parsed bs object

    return: (list) a list of urls
    """
    tag_list = soup.find("ul").findAll("a", {"href": re.compile(r"/wiki/Spanish_1/.*")})
    url_list = [
        soup_url + re.search(r'(/wiki/Spanish_1)(/.*)(" )', str(tag)).group(2)
        for tag in tag_list
        if "Linguistic_characteristics" not in str(tag)
    ]
    return url_list

In [5]:
soup = BeautifulSoup(urlopen(soup_url), "html.parser")
url_list = get_vocab_url(soup)

print(url_list)

['https://en.wikiversity.org/wiki/Spanish_1/Countries', 'https://en.wikiversity.org/wiki/Spanish_1/The_Basics', 'https://en.wikiversity.org/wiki/Spanish_1/Activities', 'https://en.wikiversity.org/wiki/Spanish_1/Adjectives', 'https://en.wikiversity.org/wiki/Spanish_1/School', 'https://en.wikiversity.org/wiki/Spanish_1/The_Classroom', 'https://en.wikiversity.org/wiki/Spanish_1/Food_%26_Drink', 'https://en.wikiversity.org/wiki/Spanish_1/Health', 'https://en.wikiversity.org/wiki/Spanish_1/Destinations', 'https://en.wikiversity.org/wiki/Spanish_1/Recreation_%26_Lifestyle', 'https://en.wikiversity.org/wiki/Spanish_1/Family_%26_Celebrations', 'https://en.wikiversity.org/wiki/Spanish_1/Adjectives_%26_In_a_restaurant', 'https://en.wikiversity.org/wiki/Spanish_1/The_Bedroom', 'https://en.wikiversity.org/wiki/Spanish_1/The_Household', 'https://en.wikiversity.org/wiki/Spanish_1/Shopping', 'https://en.wikiversity.org/wiki/Spanish_1/At_the_Mall', 'https://en.wikiversity.org/wiki/Spanish_1/Vacation',

In [6]:
def get_vocab(url):
    """
    get a list of vocabulary scraped from the url given

    url: (str) an url lead to a list of vocabulary

    return: (list) a list of Spanish words
    """
    soup = BeautifulSoup(urlopen(url), "html.parser")
    regex = re.compile(r"<b>(.*)</b>")  # extract Spanish words

    vocab_list = [
        regex.search(str(match.find("b")))
        .group(1)
        .lower()
        .split("=")[0]
        .split("-")[0]
        .strip()
        .strip(".,-=")
        .strip()
        for match in soup.findAll("li")  # find lists
        if not len(match.attrs) and match.find("b")
    ]

    
    # Expand and add most of the parenthetical constructions from the scraped vocabulary
    regex = re.compile(r"\((\w+)\)")
    for elem in vocab_list:
        search = regex.search(elem)

        if search:
            start = search.start(1)
            end = search.end(1)
            match = search.group(1)

            # Pre-strip (base case)
            elem_strip = elem[: start - 1] + elem[end + 1 :]

            # If the word in parenthesis occurs at the end of the string preceded by a space,
            # it is likely intended as extra info (eg., "baile (bailar)") and can be skipped
            if elem[start - 2] == " " and len(elem) == end + 1:
                continue

            # For constructions like "mi(s)" -> "mi" & "mis"; "tiene(n)" -> "tiene" & "tienen"
            elif match == "s" or match == "n" or match == "es":
                sub = elem[: start - 1] + match + elem[end + 1 :]

            # For constructions like "professor(ora)" -> "professor" & "professora"
            elif match == "ora":
                sub = elem[: start - len(match)] + match + elem[end + 1 :]

            # If the parentheses occur at the start of the string:
            # eg., "(los) estados unidos" -> "estados unidos" & "los estados unidos"
            elif start == 1:
                sub = match + elem[end + 1 :]
                elem_strip = elem[end + 2 :]

            # For constructions like "fantastico(a)" -> "fantastico" & "fantastica"
            elif match == "as" or match == "a" or match == "os" or match == "o":
                sub = elem[: start - 1 - len(match)] + match + elem[end + 1 :]

            # Else case: if parentheses occur somewhere in the middle of the string:
            # eg., "té (frío) helado" -> "té helado" & "té frío helado"
            else:
                sub = elem[: start - 1] + match + elem[end + 1 :]
                elem_strip = elem[: start - 2] + elem[end + 1 :]
            
            vocab_list.append(elem_strip)
            vocab_list.append(sub)

    # Add the constituent words in a multi-word phrase to the vocabulary
    for elem in vocab_list:
        if " " in elem:
            vocab_list.extend([e.strip("/,()") for e in elem.split(" ")])

    return [word for word in vocab_list if ('(' not in word and word != '' and '/' not in word and ',' not in word)]

In [7]:
spanish1_vocab = []
for url in url_list:
    spanish1_vocab.extend(get_vocab(url))

In [8]:
print(len(set(spanish1_vocab)))

2053


In [9]:
set(spanish1_vocab)

{'zapatos deportivos',
 'niña',
 'tener',
 'simpática',
 'compré',
 'buscar',
 'bien',
 'lenta',
 '¿qué te pasó?',
 'cuba',
 'foca',
 'mil',
 'encantar',
 'más',
 'menú',
 'animados',
 'tambor',
 'mas...que',
 'febrero',
 '¿cuántas?',
 'disco compacto',
 'dónde',
 'historia',
 'jirafa',
 'de...?',
 'ni',
 'viaje',
 'ciruela',
 'condimento',
 'artístico',
 'centro',
 'clásica',
 'café',
 'noches',
 'árbol',
 'servicio',
 'cartel',
 'prefieres',
 'una blusa',
 'jugo de naranja',
 'l',
 'barata',
 'e',
 'hacer la cama',
 'séptima',
 'nerviosa',
 'camarero',
 'i',
 'me interesan',
 'artística',
 'tengo que',
 'zapatos de tacón alto',
 'cacahuates',
 '¿adónde?',
 'sus',
 'platos',
 'ensalada de frutas',
 'el horario',
 'una camiseta',
 'frambuesa',
 'museo',
 'calculadora',
 'menor',
 'compartimos',
 'creo que no',
 'mesa',
 'monumento',
 'muchacha',
 'usted',
 'anteojos',
 'canosa',
 'latina',
 'limón',
 'ir de vacaciones',
 'setecientos',
 'pupitre',
 'americano',
 'anoche',
 'no le gusta

## Spanish 2

In [10]:
soup_url = "https://en.wikiversity.org/wiki/Spanish_2"

In [11]:
# Remake function for use with Spanish 2 Wikiversity
def get_vocab_url(soup):
    """
    get a list of urls that lead to the vocabulary

    soup: (BeautifulSoup) an html parsed bs object

    return: (list) a list of urls
    """
    tag_list = soup.find("ul").findAll(
        "a", {"href": re.compile(r"/wiki/Spanish_2/Chapter.*")}
    )
    url_list = [
        soup_url + re.search(r'(/wiki/Spanish_2)(/Chapter.*)(" )', str(tag)).group(2)
        for tag in tag_list
    ]
    return url_list

In [12]:
soup = BeautifulSoup(urlopen(soup_url), "html.parser")
url_list = get_vocab_url(soup)

print(url_list)

['https://en.wikiversity.org/wiki/Spanish_2/Chapter_1_(Classroom_Events)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_2_(Free_Time)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_3_(Daily_Activities)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_4_(Fashion)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_5_(Errands)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_6_(On_the_Road)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_7_(Childhood)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_8_(Celebrations)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_9_(Emergencies)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_10_(Accidents)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_11_(Television)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_12_(Movies)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_13_(Cooking)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_14_(Picnics)', 'https://en.wikiversity.org/wiki/Spanish_2/Ch

In [13]:
spanish2_vocab = []
for url in url_list:
    spanish2_vocab.extend(get_vocab(url))

In [14]:
print(len(set(spanish2_vocab)))

1580


In [15]:
set(spanish2_vocab)

{'furiosa',
 'niña',
 'aduanera',
 'puerta de embarque',
 'seco',
 'yeso',
 'tener',
 'ida',
 'edificio',
 'jugabais',
 'he visto',
 'bien',
 'subibaja',
 'hacer un picnic',
 '¿qué te pasó?',
 'contar',
 'agente',
 'ves',
 'recientemente',
 'empezaste',
 'viaje',
 'ciruela',
 'pedí',
 'dientes',
 'abrazarse',
 'se prohíbe',
 'centro',
 'servicio',
 'poner una inyección',
 'reflexionar',
 'capturar',
 'acción',
 'dormirse',
 'raqueta',
 'fracaso',
 'nerviosa',
 'cinta adhesiva',
 'noticiero',
 'no añadas',
 'picnic',
 'pusimos',
 'habéis',
 'huracán',
 'tuvo',
 'culpable',
 'dije',
 'ningún',
 'escribisteis',
 'durmiendo',
 'muñeca',
 'fiesta de sorpresa',
 'tantas',
 'insistir en',
 'tobogán',
 'den',
 'aspirina',
 'anteojos',
 'cajón de arena',
 'reunirse',
 'meter',
 'hace tiempo que',
 'liquidación',
 'competencia',
 'pupitre',
 'acera',
 'prepare',
 'darse la mano',
 'bache',
 'alquilar',
 'está hecha de',
 'pidió',
 'sartén',
 'sugerir',
 'viniste',
 'fue',
 'aquel',
 'estos',
 'p

## A-level vocab

In [16]:
a_vocab = set(spanish1_vocab) | set(spanish2_vocab)
print(len(a_vocab))

3313


## B-level text

In [17]:
soup_url = 'https://press.rebus.community/aalh/'

In [18]:
def get_text_links(soup):
    '''
    get a list of urls that lead to the texts
    
    soup: (BeautifulSoup) an html parsed bs object
    
    return: (list) a list of urls
    '''
    tag_list = soup.findAll("li", {"id" : re.compile(r'toc-chapter-.*')})
    url_list = []
    is_text = False
    for tag in tag_list:
        if tag.find('a').contents[0] == 'Introducción':
            is_text = True
            continue
        elif is_text == True:
            url_list.append(tag.find('a').attrs['href'])
            is_text = False
            
    return url_list

In [19]:
soup = BeautifulSoup(urlopen(soup_url), 'html.parser')
url_list = get_text_links(soup)

print(url_list)

['https://press.rebus.community/aalh/chapter/lo-que-sucedio-a-un-mancebo-el-dia-que-se-caso/', 'https://press.rebus.community/aalh/chapter/carta-a-luis-de-santangel/', 'https://press.rebus.community/aalh/chapter/tercera-carta-relacion-de-hernan-cortes-al-emperador/', 'https://press.rebus.community/aalh/chapter/romance-del-enamorado-y-la-muerte/', 'https://press.rebus.community/aalh/chapter/la-vida-de-lazarillo-de-tormes-y-de-sus-fortunas-y-adversidades/', 'https://press.rebus.community/aalh/chapter/historia-de-la-monja-alferez/', 'https://press.rebus.community/aalh/chapter/a-su-retrato/', 'https://press.rebus.community/aalh/chapter/viaje-a-la-habana-carta-1/', 'https://press.rebus.community/aalh/chapter/la-flor-de-la-cana/', 'https://press.rebus.community/aalh/chapter/al-partir/', 'https://press.rebus.community/aalh/chapter/emancipacion-moral-de-la-mujer/', 'https://press.rebus.community/aalh/chapter/la-rosa-de-pasion/', 'https://press.rebus.community/aalh/chapter/las-medias-rojas/', '

In [20]:
def get_text(url):
    '''
    get a dictionary of the content and metadata of a text scraped from the url given
    
    url: (str) an url lead to a text
    
    return: (dict) a dictionary with 3 keys: 'author', 'title', and 'content'
    '''
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    text_dict = {}
    
    text_dict['title'] = soup.find('h1', {'class':'entry-title'}).contents[0].strip()
    for child in soup.find('h2').contents:
        if isinstance(child, NavigableString):
            text_dict['author'] = child.strip()
            break
    
    content_section = soup.find('section', {'data-type':'chapter'}).findAll('p')
    content = ''
    for tag in content_section:
        paragraph = []
        for child in tag.contents:
            if isinstance(child, NavigableString):
                paragraph.append(child)
        content += ''.join(paragraph) + '\n\n'
    
    text_dict['content'] = content.split('Preguntas de discusión')[0].strip()
    return text_dict

In [21]:
aalh_corpus = []
for url in url_list:
    aalh_corpus.append(get_text(url))
    time.sleep(1)

In [22]:
aalh_corpus[1]

{'title': 'Carta a Luis de Santángel',
 'author': 'Cristóbal Colón',
 'content': 'Señor, porque sé que habréis placer de la gran victoria que Nuestro Señor me ha dado en mi viaje, vos escribo ésta, por la cual sabréis cómo en 33 días pasé de las islas de Canaria a las Indias con la armada que los ilustrísimos rey y reina nuestros señores me dieron, donde yo hallé muy muchas islas pobladas con gente sin número; y de ellas todas he tomado posesión por Sus Altezas con pregón y bandera real extendida, y no me fue contradicho.\n\nA la primera que yo hallé puse nombre San Salvador a conmemoración de Su Alta Majestad, el cual maravillosamente todo esto ha dado; los Indios la llaman Guanahaní; a la segunda puse nombre la isla de Santa María de Concepción; a la tercera Fernandina; a la cuarta la Isabela; a la quinta la isla Juana, y así a cada una nombre nuevo.\n\nCuando yo llegué a la Juana, seguí yo la costa de ella al poniente, y la fallé tan grande que pensé que sería tierra firme, la provi

## Explore B-level text

In [23]:
from spacy.lang.es.stop_words import STOP_WORDS

In [24]:
def get_A_proportion(text):
    doc = nlp(text)
    a_count = 0
    tok_count = 0
    
    for tok in doc:
        tok_pos = tok.pos_
        tok = tok.text.lower()
        
        if tok in STOP_WORDS or tok == ' ' or tok == '\n' or tok == '\xa0' or tok_pos == 'PUNCT':
            continue
        elif tok in a_vocab:
            a_count += 1
        
        tok_count += 1
    
    return a_count / tok_count, tok_count

In [25]:
# percentage of a-level vocab in b-level text
for text in aalh_corpus:
    print(get_A_proportion(text['content'])[1], get_A_proportion(text['content'])[0])
    print()

748 0.47593582887700536

1201 0.3755203996669442

1313 0.3069306930693069

93 0.43010752688172044

2130 0.36807511737089205

934 0.3811563169164882

56 0.19642857142857142

1134 0.3350970017636684

181 0.2983425414364641

64 0.296875

495 0.2727272727272727

1570 0.29044585987261146

470 0.3

3235 0.15054095826893354

2190 0.2452054794520548

208 0.28365384615384615

176 0.3125

939 0.3844515441959531

660 0.30606060606060603

36 0.4166666666666667

1494 0.2643908969210174



## Explore A-level text

In [26]:
###################################### A1-level texts ###################################
#### stories stored in str_story_list as string


wb_address = (
    "https://www.gutenberg.org/files/22065/22065-h/22065-h.htm#EL_CUENTO_DEL_POLLO"
)
f = urlopen(wb_address)
html_soup = BeautifulSoup(f, "html.parser")
for node in html_soup.find_all("h2"):
    if "class" in node.attrs.keys() and node["class"] == ["e"]:
        first_node = node
        break

story_list = []
next_node = first_node
while next_node != None:
    story = u""
    next_node = None
    # print(first_node.get_text())
    for tag in first_node.next_siblings:
        # print(str(tag))
        if tag.name == "h2" and "class" in tag.attrs.keys() and tag["class"] == ["e"]:
            story_list.append(story)
            first_node = tag
            next_node = tag
            # print(tag.get_text())
            break
        else:
            story += str(tag)

next_node = first_node
while next_node != None:
    story = u""
    try:
        story += first_node.get_text()
    except:
        story += str(first_node)
    next_node = None
    for tag in first_node.next_siblings:
        # print(str(tag))
        if tag.name == "p" and "class" in tag.attrs.keys() and tag["class"] == ["f"]:
            story_list.append(story)
            first_node = tag
            next_node = tag
            story += str(tag)
            # print(tag.get_text())
            break
        else:
            story += str(tag)

str_story_list = []
for i in range(len(story_list)):
    story_soup = BeautifulSoup(story_list[i], "html.parser")
    str_story = story_soup.get_text()
    str_story_list.append(str_story.replace("\xa0", ""))

In [27]:
len(str_story_list)

42

In [28]:
def get_A_proportion(text):
    doc = nlp(text)
    a_count = 0
    tok_count = 0
    
    for tok in doc:
        tok_pos = tok.pos_
        tok = tok.text.lower()
        
        if tok in STOP_WORDS or tok == ' ' or tok == '\n' or tok == '\xa0' or tok_pos == 'PUNCT' or tok_pos == 'NUM':
            continue
        elif tok in a_vocab:
            a_count += 1
        
        tok_count += 1
    
    return a_count / tok_count, tok_count

In [29]:
for text in story_list:
    print(get_A_proportion(text)[1], get_A_proportion(text)[0])
    print()

277 0.30324909747292417

323 0.3157894736842105

609 0.3957307060755337

589 0.3701188455008489

190 0.33157894736842103

479 0.2359081419624217

477 0.1928721174004193

218 0.23394495412844038

1407 0.32338308457711445

230 0.16521739130434782

422 0.24170616113744076

947 0.191129883843717

540 0.3074074074074074

174 0.1206896551724138

10 0.0

10 0.1

18 0.1111111111111111

27 0.2222222222222222

26 0.23076923076923078

29 0.2413793103448276

25 0.16

35 0.14285714285714285

24 0.20833333333333334

38 0.13157894736842105

29 0.06896551724137931

44 0.18181818181818182

21 0.2857142857142857

20 0.1

104 0.09615384615384616

49 0.4489795918367347

37 0.1891891891891892

35 0.3142857142857143

68 0.23529411764705882

55 0.14545454545454545

34 0.20588235294117646

9 0.5555555555555556

37 0.21621621621621623

100 0.09

83 0.3132530120481928

59 0.3220338983050847

71 0.2112676056338028

66 0.22727272727272727



In [30]:
# check length of each story
for text in story_list:
    print(len([w for w in text.split(' ') if w not in {'\n', '\xa0', ' '}]))

500
528
923
1008
301
871
928
430
2470
376
678
1635
1036
365
14
14
58
72
80
62
43
126
72
84
83
70
74
74
204
100
106
70
128
124
47
18
71
105
138
98
101
113


## Explore A & B using sets

In [32]:
def get_A_proportion_in_set(text):
    doc = nlp(text)
    set_list = set()
    
    for tok in doc:
        tok_pos = tok.pos_
        tok = tok.text.lower()
        
        if tok in STOP_WORDS or tok == ' ' or tok == '\n' or tok == '\xa0' or tok_pos == 'PUNCT' or tok_pos == 'NUM':
            continue
        
        set_list.add(tok)
    
    a_count = len(set_list & a_vocab)
    tok_count = len(set_list)
    
    return a_count / tok_count, tok_count

In [37]:
# A level
for text in story_list:
    if get_A_proportion_in_set(text)[1] > 200:
        print(get_A_proportion_in_set(text)[1], get_A_proportion_in_set(text)[0])
        print()

474 0.22784810126582278

399 0.13784461152882205

233 0.21888412017167383



In [36]:
# B level
for text in aalh_corpus:
    if get_A_proportion_in_set(text['content'])[1] > 200:
        print(get_A_proportion_in_set(text['content'])[1], get_A_proportion_in_set(text['content'])[0])
        print()

350 0.2857142857142857

578 0.20761245674740483

538 0.14684014869888476

1112 0.18794964028776978

511 0.18003913894324852

744 0.18010752688172044

359 0.19498607242339833

966 0.16356107660455488

360 0.23333333333333334

1175 0.14808510638297873

1408 0.15269886363636365

483 0.2318840579710145

439 0.2072892938496583

548 0.23905109489051096



# Write to file (vocab for spanish 1 and 2)

In [31]:
with open('/Users/miali/mds/capstone/capstone_FHIS/vocab/spanish1.txt', 'w', encoding='utf-8') as fout:
    output = ''
    for word in spanish1_vocab:
        output += word + '\n'
    fout.write(output)

with open('/Users/miali/mds/capstone/capstone_FHIS/vocab/spanish2.txt', 'w', encoding='utf-8') as fout:
    output = ''
    for word in spanish2_vocab:
        output += word + '\n'
    fout.write(output)
