In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import time

# A-level vocab collection

## Spanish 1

In [2]:
soup_url = 'https://en.wikiversity.org/wiki/Spanish_1'

In [3]:
def get_vocab_url(soup):
    '''
    get a list of urls that lead to the vocabulary
    
    soup: (BeautifulSoup) an html parsed bs object
    
    return: (list) a list of urls
    '''
    tag_list = soup.find('ul').findAll("a", {"href" : re.compile(r'/wiki/Spanish_1/.*')})
    url_list = [soup_url + re.search(r'(/wiki/Spanish_1)(/.*)(" )', str(tag)).group(2) for tag in tag_list \
                 if 'Linguistic_characteristics' not in str(tag)]
    return url_list

In [4]:
soup = BeautifulSoup(urlopen(soup_url), 'html.parser')
url_list = get_vocab_url(soup)

print(url_list)

['https://en.wikiversity.org/wiki/Spanish_1/Countries', 'https://en.wikiversity.org/wiki/Spanish_1/The_Basics', 'https://en.wikiversity.org/wiki/Spanish_1/Activities', 'https://en.wikiversity.org/wiki/Spanish_1/Adjectives', 'https://en.wikiversity.org/wiki/Spanish_1/School', 'https://en.wikiversity.org/wiki/Spanish_1/The_Classroom', 'https://en.wikiversity.org/wiki/Spanish_1/Food_%26_Drink', 'https://en.wikiversity.org/wiki/Spanish_1/Health', 'https://en.wikiversity.org/wiki/Spanish_1/Destinations', 'https://en.wikiversity.org/wiki/Spanish_1/Recreation_%26_Lifestyle', 'https://en.wikiversity.org/wiki/Spanish_1/Family_%26_Celebrations', 'https://en.wikiversity.org/wiki/Spanish_1/Adjectives_%26_In_a_restaurant', 'https://en.wikiversity.org/wiki/Spanish_1/The_Bedroom', 'https://en.wikiversity.org/wiki/Spanish_1/The_Household', 'https://en.wikiversity.org/wiki/Spanish_1/Shopping', 'https://en.wikiversity.org/wiki/Spanish_1/At_the_Mall', 'https://en.wikiversity.org/wiki/Spanish_1/Vacation',

In [5]:
alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','ñ','o','p','q','r','s','t','u','v','w','x','y','z']
def get_vocab(url):
    '''
    get a list of vocabulary scraped from the url given
    
    url: (str) an url lead to a list of vocabulary
    
    return: (list) a list of Spanish words
    '''
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    regex = re.compile(r'<li><b>(.*)</b>')
    
    vocab_list = [regex.search(str(match)).group(1).lower().strip() for match in soup.findAll('li') \
                  if len(match.attrs)==0 and regex.search(str(match))]
    for letter in alphabet:
        if letter in vocab_list:
            vocab_list.remove(letter)
    
    return vocab_list

In [6]:
spanish1_vocab = []
for url in url_list:
    spanish1_vocab.extend(get_vocab(url))
    time.sleep(0.5)

In [7]:
print(len(set(spanish1_vocab)))

1454


In [8]:
set(spanish1_vocab)

{'encima de la/del ...',
 '¿cómo te llamas?',
 'cocina',
 'los lunes',
 '¡qué ...!',
 'tecla de borrar',
 'hay ...',
 'yerno',
 'calvo(a)',
 'pensamos',
 'escribiendo',
 'disteis',
 'veintiocho',
 'sabroso(a)',
 'bastante',
 'cuarenta',
 'opening marks</b>. written questions and expressions of exclamation begin with a <b>¿</b> or a <b>¡',
 'viajar',
 'comprender',
 'guisantes',
 'son las siete menos cuarto.',
 'vender',
 'comedia',
 'siempre',
 'anaranjado(a)(s)',
 'costar',
 'audífonos',
 'telenovela',
 'país',
 'si',
 'drama',
 'música rap',
 'horrible',
 'madrastra',
 'vengo',
 'aburrir',
 'deprimido(a)',
 'zanahorias',
 'lavar',
 'esos, esas',
 'dirección electrónica',
 'guinea ecuatorial',
 'señora (sra.)',
 'sótano',
 'compraste',
 'doler',
 'usted (ud.)',
 '¡no me digas!',
 '¿cómo es usted?',
 'unos pantalones cortos',
 'trompa de llaves',
 'práctica de ...',
 'planta baja',
 'reloj',
 'perro',
 'unos calcetines',
 'colores',
 'ventana',
 'pelo',
 'usar la computadora',
 'me fal

## Spanish 2

In [9]:
soup_url = 'https://en.wikiversity.org/wiki/Spanish_2'

In [10]:
def get_vocab_url(soup):
    '''
    get a list of urls that lead to the vocabulary
    
    soup: (BeautifulSoup) an html parsed bs object
    
    return: (list) a list of urls
    '''
    tag_list = soup.find('ul').findAll("a", {"href" : re.compile(r'/wiki/Spanish_2/Chapter.*')})
    url_list = [soup_url + re.search(r'(/wiki/Spanish_2)(/Chapter.*)(" )', str(tag)).group(2) for tag in tag_list]
    return url_list

In [11]:
soup = BeautifulSoup(urlopen(soup_url), 'html.parser')
url_list = get_vocab_url(soup)

print(url_list)

['https://en.wikiversity.org/wiki/Spanish_2/Chapter_1_(Classroom_Events)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_2_(Free_Time)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_3_(Daily_Activities)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_4_(Fashion)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_5_(Errands)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_6_(On_the_Road)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_7_(Childhood)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_8_(Celebrations)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_9_(Emergencies)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_10_(Accidents)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_11_(Television)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_12_(Movies)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_13_(Cooking)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_14_(Picnics)', 'https://en.wikiversity.org/wiki/Spanish_2/Ch

In [12]:
alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','ñ','o','p','q','r','s','t','u','v','w','x','y','z']
def get_vocab(url):
    '''
    get a list of vocabulary scraped from the url given
    
    url: (str) an url lead to a list of vocabulary
    
    return: (list) a list of Spanish words
    '''
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    regex = re.compile(r'<li><b>(.*)</b>')
    
    vocab_list = [regex.search(str(match)).group(1).lower().strip() for match in soup.findAll('li') \
                  if len(match.attrs)==0 and regex.search(str(match))]
    for letter in alphabet:
        if letter in vocab_list:
            vocab_list.remove(letter)
    
    return vocab_list

In [13]:
spanish2_vocab = []
for url in url_list:
    spanish2_vocab.extend(get_vocab(url))
    time.sleep(0.5)

In [14]:
print(len(set(spanish2_vocab)))

1121


In [15]:
set(spanish2_vocab)

{'hacíais',
 'reírse',
 'al horno',
 'aterrizar',
 'crear una página web',
 'hervir',
 'inmediatamente',
 'resumen',
 'sala de emergencia',
 'preferir',
 'sé',
 'creativo(a)',
 'nacer',
 'jugador, jugadora',
 'reina',
 'he visto',
 'efectos especiales',
 'he estudiado',
 'centro de salud',
 'leíste',
 'subibaja',
 'muñeco',
 'puntadas',
 'aquel, aquella',
 'rodilla',
 'extraterrestre',
 '¿con qué se sirve?',
 'sartén',
 'no seas',
 'pusisteis',
 'galán',
 'olor',
 'periodista',
 'billete',
 'coleccionar',
 'aprendimos',
 'patines',
 'análisis',
 'tuvo',
 'paisaje',
 'oyen',
 'oyeron',
 'has estudiado',
 'depositar un cheque',
 'tortuga',
 'rana',
 'tranquilo(a)',
 'hacer el papel de',
 'seguro(a)',
 'repetimos',
 'tornado',
 'quedar',
 'juguete',
 'público',
 'planear',
 'hormiga',
 'pastillas',
 'por ejemplo',
 'antes de',
 'aprendisteis',
 '¡basta!',
 'tanteo',
 'palabra',
 'de repente',
 'derecho',
 'prestar atención',
 'cupón de regalo',
 'tela sintética',
 'me',
 'durmió',
 'tiza'

## Gutenberg: An elementary spanish reader

In [54]:
soup_url = 'https://www.gutenberg.org/files/22065/22065-h/22065-h.htm#VOCABULARY'

In [55]:
def get_vocab(url):
    '''
    get a list of vocabulary scraped from the url given
    
    url: (str) an url lead to a list of vocabulary
    
    return: (list) a list of Spanish words
    '''
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    
    return [str(tag.string) for tag in soup.find('ul').findAll('b') if str(tag.string) != str(tag.string).upper()]

In [59]:
gutenberg_elementary_vocab = get_vocab(soup_url)

In [60]:
print(len(gutenberg_elementary_vocab))
gutenberg_elementary_vocab

2868


['a',
 'abandonado',
 'abandonar',
 'abandonar',
 'abeja',
 'abierto',
 'abrir',
 'abrasaba',
 'abrasando',
 'abrasar',
 'abrasar',
 'abrazo',
 'abre',
 'abrieron',
 'abrir',
 'abril',
 'abrió',
 'abrir',
 'abrir',
 'abrirá',
 'abrir',
 'abuelo',
 'como mi abuelo',
 'acabar',
 'acabó',
 'acabar',
 'acariciar',
 'acarició',
 'acariciar',
 'acaso',
 'aceptar',
 'acepto',
 'aceptar',
 'acerca de',
 'acercarse',
 'acercó',
 'acercar',
 'acercose',
 'se acercó',
 'acertar',
 'ácido',
 'acierta',
 'aciertan',
 'acertar',
 'acompañar',
 'acompañe',
 'acompañar',
 'acostarse',
 'acredita',
 'acreditar',
 'se acredita de malo',
 'acreditar',
 'activo',
 'adelante',
 'los de adelante',
 'ademán',
 'además',
 'además de',
 'adentro',
 'adiestraba',
 'adiestrar',
 'adiestrar',
 'adiestrarse',
 'adivinanza',
 'advertir',
 'afán',
 'afanarse',
 'afano',
 'afanar',
 'agrio',
 'agua',
 'aguardando',
 'aguardar',
 'aguardar',
 'aguardiente',
 'agudo',
 'Aguilar',
 'agujero',
 'ah',
 'ahogar',
 'ahogaro

# Combine A-level vocab