In [120]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import time

# A1

## Spanish 1

In [56]:
# Landing page of the Spanish 1 course. It is downloaded below, and
# get_vocab_url prefixes it onto each sub-page path it finds.
soup_url = 'https://en.wikiversity.org/wiki/Spanish_1'

In [133]:
def get_vocab_url(soup, base_url=None):
    '''
    get a list of urls that lead to the Spanish 1 vocabulary pages
    
    soup: (BeautifulSoup) an html parsed bs object of the course landing page
    base_url: (str, optional) prefix put in front of every sub-page path;
        defaults to the module-level soup_url
    
    return: (list) a list of urls, empty when the page has no <ul>
    '''
    if base_url is None:
        # Backward compatible: the original always read the global soup_url.
        base_url = soup_url

    # Sub-page links look like /wiki/Spanish_1/<Topic>; group 1 is "/<Topic>".
    href_pattern = re.compile(r'/wiki/Spanish_1(/.*)')

    first_list = soup.find('ul')
    if first_list is None:
        return []

    url_list = []
    for tag in first_list.find_all('a', {'href': href_pattern}):
        # Read the attribute itself instead of regex-matching the serialized
        # tag: that approach over-captured when several attributes followed
        # href and failed outright when href was the last attribute.
        href = tag['href']
        if 'Linguistic_characteristics' in href:
            # Excluded by the original notebook; presumably not a vocabulary page.
            continue
        url_list.append(base_url + href_pattern.search(href).group(1))
    return url_list

In [66]:
# Download and parse the course landing page. The with-block closes the HTTP
# response once BeautifulSoup has read it (the original left it open).
with urlopen(soup_url) as response:
    soup = BeautifulSoup(response, 'html.parser')
url_list = get_vocab_url(soup)

print(url_list)

['https://en.wikiversity.org/wiki/Spanish_1/Countries', 'https://en.wikiversity.org/wiki/Spanish_1/The_Basics', 'https://en.wikiversity.org/wiki/Spanish_1/Activities', 'https://en.wikiversity.org/wiki/Spanish_1/Adjectives', 'https://en.wikiversity.org/wiki/Spanish_1/School', 'https://en.wikiversity.org/wiki/Spanish_1/The_Classroom', 'https://en.wikiversity.org/wiki/Spanish_1/Food_%26_Drink', 'https://en.wikiversity.org/wiki/Spanish_1/Health', 'https://en.wikiversity.org/wiki/Spanish_1/Destinations', 'https://en.wikiversity.org/wiki/Spanish_1/Recreation_%26_Lifestyle', 'https://en.wikiversity.org/wiki/Spanish_1/Family_%26_Celebrations', 'https://en.wikiversity.org/wiki/Spanish_1/Adjectives_%26_In_a_restaurant', 'https://en.wikiversity.org/wiki/Spanish_1/The_Bedroom', 'https://en.wikiversity.org/wiki/Spanish_1/The_Household', 'https://en.wikiversity.org/wiki/Spanish_1/Shopping', 'https://en.wikiversity.org/wiki/Spanish_1/At_the_Mall', 'https://en.wikiversity.org/wiki/Spanish_1/Vacation',

In [127]:
# Single letters that are stripped (once each) from every scraped word list.
alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','ñ','o','p','q','r','s','t','u','v','w','x','y','z']
def get_vocab(url):
    '''
    get a list of vocabulary scraped from the given url
    
    url: (str) a url that leads to a page listing vocabulary
    
    return: (list) a list of Spanish words, lower-cased and stripped
    '''
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')
    regex = re.compile(r'<li><b>(.*)</b>')
    
    vocab_list = []
    for item in soup.find_all('li'):
        if item.attrs:
            # Only plain <li> elements (no attributes) are considered.
            continue
        found = regex.search(str(item))  # search once and reuse the match
        if found:
            vocab_list.append(found.group(1).lower().strip())

    # list.remove drops only the first occurrence, so at most one bare
    # single-letter entry per letter is removed (same as the original).
    for letter in alphabet:
        if letter in vocab_list:
            vocab_list.remove(letter)
    
    return vocab_list

In [128]:
# Scrape every Spanish 1 page in turn and pool the results into one list
# (duplicates across pages are kept here; they are removed later via set()).
spanish1_vocab = []
for url in url_list:
    spanish1_vocab.extend(get_vocab(url))
    time.sleep(0.5)  # wait half a second between requests

In [131]:
# Number of distinct entries collected for Spanish 1.
print(len(set(spanish1_vocab)))

1454


In [132]:
# Display the distinct Spanish 1 entries (a set, so the order is arbitrary).
set(spanish1_vocab)

{'venimos',
 'cambur',
 'restaurante',
 'muchacho',
 'cama',
 'hicieron',
 'más o menos',
 'computación',
 'programa de la vida real',
 'tango',
 'película de terror',
 'preferís',
 'buenos días.',
 'pide',
 'sirves',
 'bombardino',
 'fantastico(a)',
 'escribir cuentos',
 'parque nacional',
 'inglés',
 'en',
 'menor',
 'quiere',
 '¿qué te parece?',
 'película romántica',
 'dicen',
 'a mí también.',
 'profesor, profesora',
 'seiscientos(as)',
 'está',
 'papas',
 'cena',
 'me gusta mucho...',
 'perú',
 'honduras',
 'libro',
 'faltar',
 'setenta',
 'increíble',
 '¿quién?',
 'aprendí',
 'comparte',
 'barba',
 'cuenta',
 'teclado',
 'la',
 'me interesa(n)',
 'camarero',
 'música latina',
 '¿cuántos años tiene(n) ...\xa0?',
 '¡genial!',
 'traigo ...',
 'preparar',
 'tocar la guitarra',
 'yerno',
 '¿cómo se llama usted?',
 'casa',
 'nuera',
 'señorita (srta.)',
 '¿cómo te queda(n)?',
 'a la derecha',
 'no le gusta ...',
 'seguís',
 'debajo de la/del ...',
 'guapo(a)',
 'maracas',
 'ayer',
 't

## Spanish 2

In [136]:
# Rebinds the global soup_url to the Spanish 2 landing page. get_vocab_url
# reads this global when building urls, so this cell must run before it is called.
soup_url = 'https://en.wikiversity.org/wiki/Spanish_2'

In [143]:
def get_vocab_url(soup, base_url=None):
    '''
    get a list of urls that lead to the Spanish 2 chapter vocabulary pages

    NOTE: this redefinition replaces the Spanish 1 get_vocab_url defined
    earlier in the notebook.
    
    soup: (BeautifulSoup) an html parsed bs object of the course landing page
    base_url: (str, optional) prefix put in front of every chapter path;
        defaults to the module-level soup_url
    
    return: (list) a list of urls, empty when the page has no <ul>
    '''
    if base_url is None:
        # Backward compatible: the original always read the global soup_url.
        base_url = soup_url

    # Chapter links look like /wiki/Spanish_2/Chapter_<n>_(...); group 1 is
    # the "/Chapter..." part.
    href_pattern = re.compile(r'/wiki/Spanish_2(/Chapter.*)')

    first_list = soup.find('ul')
    if first_list is None:
        return []

    # Read the href attribute itself instead of regex-matching the serialized
    # tag: that approach over-captured when several attributes followed href
    # and failed outright when href was the last attribute.
    return [
        base_url + href_pattern.search(tag['href']).group(1)
        for tag in first_list.find_all('a', {'href': href_pattern})
    ]

In [144]:
# Download and parse the Spanish 2 landing page. The with-block closes the
# HTTP response once BeautifulSoup has read it (the original left it open).
with urlopen(soup_url) as response:
    soup = BeautifulSoup(response, 'html.parser')
url_list = get_vocab_url(soup)

print(url_list)

['https://en.wikiversity.org/wiki/Spanish_2/Chapter_1_(Classroom_Events)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_2_(Free_Time)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_3_(Daily_Activities)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_4_(Fashion)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_5_(Errands)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_6_(On_the_Road)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_7_(Childhood)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_8_(Celebrations)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_9_(Emergencies)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_10_(Accidents)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_11_(Television)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_12_(Movies)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_13_(Cooking)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_14_(Picnics)', 'https://en.wikiversity.org/wiki/Spanish_2/Ch

In [145]:
# NOTE: this cell repeats the alphabet / get_vocab definitions from the
# Spanish 1 section; it is kept so the section can be run on its own.
alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','ñ','o','p','q','r','s','t','u','v','w','x','y','z']
def get_vocab(url):
    '''
    get a list of vocabulary scraped from the given url
    
    url: (str) a url that leads to a page listing vocabulary
    
    return: (list) a list of Spanish words, lower-cased and stripped
    '''
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')
    regex = re.compile(r'<li><b>(.*)</b>')
    
    # Only plain <li> elements (no attributes) are inspected; each is matched
    # once and the match object is reused.
    candidates = (regex.search(str(item)) for item in soup.find_all('li') if not item.attrs)
    vocab_list = [found.group(1).lower().strip() for found in candidates if found]

    # list.remove drops only the first occurrence, so at most one bare
    # single-letter entry per letter is removed (same as the original).
    for letter in alphabet:
        if letter in vocab_list:
            vocab_list.remove(letter)
    
    return vocab_list

In [146]:
# Scrape every Spanish 2 chapter page in turn and pool the results into one
# list (duplicates across pages are kept here; they are removed later via set()).
spanish2_vocab = []
for url in url_list:
    spanish2_vocab.extend(get_vocab(url))
    time.sleep(0.5)  # wait half a second between requests

In [147]:
# Number of distinct entries collected for Spanish 2.
print(len(set(spanish2_vocab)))

1121


In [148]:
# Display the distinct Spanish 2 entries (a set, so the order is arbitrary).
set(spanish2_vocab)

{'vecino(a)',
 'a la parrilla',
 'sombra de ojos',
 '¿qué te pasó?',
 'piedra',
 'resultar',
 'aburrirse',
 'eres',
 'pudiste',
 'jugábamos',
 'de acuerdo.',
 'cita',
 'equipaje',
 'habléis',
 'dentro de',
 'marcador',
 'vida',
 'cheque',
 'devolver',
 'atleta',
 'dijieron',
 'lo, la',
 'generoso(a)',
 'estáis',
 'sois',
 'quedarse',
 'sala de emergencia',
 'secador',
 'viniste',
 'proyecto',
 'investigar',
 'crimen',
 'se acuesta',
 'salvar',
 'oyes',
 'cajón de arena',
 'jabón',
 'ancho(a)',
 'voz',
 'repetís',
 'lesión',
 'nieto',
 'asustado(a)',
 'panadería',
 'morirse',
 'nos',
 'se cayó',
 'se acuestan',
 'titular',
 'derrumbe',
 'coleccionar',
 'centro de salud',
 'no seas',
 'colección',
 'hablen',
 'banda',
 'cirujano(a)',
 'hinchazón',
 'gel',
 'navegar en la red',
 'nuestro (a, -os, -as)',
 'club',
 'codo',
 '¿qué te parece?',
 'peces',
 'semáforo',
 'vino',
 'tiza',
 'labios',
 'jugar a los bolos',
 'gallo',
 'dormí',
 'director, directora',
 'secarse',
 'directo(a)',
 'her

In [153]:
# A1 vocabulary: every distinct entry found in either Spanish 1 or Spanish 2.
a1_vocab = set(spanish1_vocab).union(spanish2_vocab)