# Investigating overlap between A1 scraped vocab and texts from A1- and B-levels

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import time

## Spanish 1

In [2]:
soup_url = "https://en.wikiversity.org/wiki/Spanish_1"

In [3]:
def get_vocab_url(soup):
    """
    get a list of urls that lead to the vocabulary

    soup: (BeautifulSoup) an html parsed bs object

    return: (list) a list of urls
    """
    tag_list = soup.find("ul").findAll("a", {"href": re.compile(r"/wiki/Spanish_1/.*")})
    url_list = [
        soup_url + re.search(r'(/wiki/Spanish_1)(/.*)(" )', str(tag)).group(2)
        for tag in tag_list
        if "Linguistic_characteristics" not in str(tag)
    ]
    return url_list

In [4]:
soup = BeautifulSoup(urlopen(soup_url), "html.parser")
url_list = get_vocab_url(soup)

print(url_list)

['https://en.wikiversity.org/wiki/Spanish_1/Countries', 'https://en.wikiversity.org/wiki/Spanish_1/The_Basics', 'https://en.wikiversity.org/wiki/Spanish_1/Activities', 'https://en.wikiversity.org/wiki/Spanish_1/Adjectives', 'https://en.wikiversity.org/wiki/Spanish_1/School', 'https://en.wikiversity.org/wiki/Spanish_1/The_Classroom', 'https://en.wikiversity.org/wiki/Spanish_1/Food_%26_Drink', 'https://en.wikiversity.org/wiki/Spanish_1/Health', 'https://en.wikiversity.org/wiki/Spanish_1/Destinations', 'https://en.wikiversity.org/wiki/Spanish_1/Recreation_%26_Lifestyle', 'https://en.wikiversity.org/wiki/Spanish_1/Family_%26_Celebrations', 'https://en.wikiversity.org/wiki/Spanish_1/Adjectives_%26_In_a_restaurant', 'https://en.wikiversity.org/wiki/Spanish_1/The_Bedroom', 'https://en.wikiversity.org/wiki/Spanish_1/The_Household', 'https://en.wikiversity.org/wiki/Spanish_1/Shopping', 'https://en.wikiversity.org/wiki/Spanish_1/At_the_Mall', 'https://en.wikiversity.org/wiki/Spanish_1/Vacation',

In [5]:
def get_vocab(url):
    """
    get a list of vocabulary scraped from the url given

    url: (str) an url lead to a list of vocabulary

    return: (list) a list of Spanish words
    """
    soup = BeautifulSoup(urlopen(url), "html.parser")
    regex = re.compile(r"<b>(.*)</b>")  # extract Spanish words

    vocab_list = [
        regex.search(str(match.find("b")))
        .group(1)
        .lower()
        .split("=")[0]
        .split("-")[0]
        .strip()
        .strip(".,-=")
        .strip()
        for match in soup.findAll("li")  # find lists
        if not len(match.attrs) and match.find("b")
    ]

    
    # Expand and add most of the parenthetical constructions from the scraped vocabulary
    regex = re.compile(r"\((\w+)\)")
    for elem in vocab_list:
        search = regex.search(elem)

        if search:
            start = search.start(1)
            end = search.end(1)
            match = search.group(1)

            # Pre-strip (base case)
            elem_strip = elem[: start - 1] + elem[end + 1 :]

            # If the word in parenthesis occurs at the end of the string preceded by a space,
            # it is likely intended as extra info (eg., "baile (bailar)") and can be skipped
            if elem[start - 2] == " " and len(elem) == end + 1:
                continue

            # For constructions like "mi(s)" -> "mi" & "mis"; "tiene(n)" -> "tiene" & "tienen"
            elif match == "s" or match == "n" or match == "es":
                sub = elem[: start - 1] + match + elem[end + 1 :]

            # For constructions like "professor(ora)" -> "professor" & "professora"
            elif match == "ora":
                sub = elem[: start - len(match)] + match + elem[end + 1 :]

            # If the parentheses occur at the start of the string:
            # eg., "(los) estados unidos" -> "estados unidos" & "los estados unidos"
            elif start == 1:
                sub = match + elem[end + 1 :]
                elem_strip = elem[end + 2 :]

            # For constructions like "fantastico(a)" -> "fantastico" & "fantastica"
            elif match == "as" or match == "a" or match == "os" or match == "o":
                sub = elem[: start - 1 - len(match)] + match + elem[end + 1 :]

            # Else case: if parentheses occur somewhere in the middle of the string:
            # eg., "té (frío) helado" -> "té helado" & "té frío helado"
            else:
                sub = elem[: start - 1] + match + elem[end + 1 :]
                elem_strip = elem[: start - 2] + elem[end + 1 :]

            vocab_list.append(elem_strip)
            vocab_list.append(sub)

    # Add the constituent words in a multi-word phrase to the vocabulary
    for elem in vocab_list:
        if " " in elem:
            vocab_list.extend([e.strip("/,()") for e in elem.split(" ")])

    return vocab_list

In [6]:
spanish1_vocab = []
for url in url_list:
    spanish1_vocab.extend(get_vocab(url))

In [7]:
print(len(set(spanish1_vocab)))

2298


In [8]:
set(spanish1_vocab)

{'',
 'idea!',
 'fácil',
 'judías verdes',
 '¿y usted?',
 'maíz',
 'te',
 'leer',
 'desordenado',
 'no me gusta',
 'zoológico',
 'anaranjado',
 'tocar la guitarra',
 'pescadería',
 'doler',
 'muslo',
 'navegar',
 'cartera',
 'momento',
 'inolvidable',
 'trescientos',
 'dirección electrónica',
 'preparado',
 'marrón',
 'a la una en punto',
 '...',
 'veces',
 'viernes',
 'tienen ... años',
 'precio',
 'vengo',
 'bonito(a)',
 'maní',
 'cuadro',
 'una camisa',
 'hasta',
 'sandía',
 'estar',
 'quinientas',
 'gracioso(a)',
 'nacional',
 'catorce',
 'cebolla',
 'visitar',
 'pronto',
 'hacer',
 'güiro',
 'punto',
 '¿cuántos(as)?',
 'm',
 'está bueno(a)',
 'taza',
 'frío',
 'despacho, la oficina',
 '¿para qué sirve?',
 'estudiar',
 'tacón',
 'estás?',
 '¿me trae ...?',
 'a mí también',
 'confundida',
 '¡qué ...!',
 'pelirroja',
 'hermoso(a)',
 'trabajo voluntario',
 'hiciste?',
 'pon',
 'levantar pesas',
 'menos',
 'hoy',
 'usado(a)',
 'los lunes',
 'una falda',
 'maracas',
 'un poco',
 'me gus

## Spanish 2

In [9]:
soup_url = "https://en.wikiversity.org/wiki/Spanish_2"

In [10]:
# Remake function for use with Spanish 2 Wikiversity
def get_vocab_url(soup):
    """
    get a list of urls that lead to the vocabulary

    soup: (BeautifulSoup) an html parsed bs object

    return: (list) a list of urls
    """
    tag_list = soup.find("ul").findAll(
        "a", {"href": re.compile(r"/wiki/Spanish_2/Chapter.*")}
    )
    url_list = [
        soup_url + re.search(r'(/wiki/Spanish_2)(/Chapter.*)(" )', str(tag)).group(2)
        for tag in tag_list
    ]
    return url_list

In [11]:
soup = BeautifulSoup(urlopen(soup_url), "html.parser")
url_list = get_vocab_url(soup)

print(url_list)

['https://en.wikiversity.org/wiki/Spanish_2/Chapter_1_(Classroom_Events)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_2_(Free_Time)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_3_(Daily_Activities)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_4_(Fashion)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_5_(Errands)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_6_(On_the_Road)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_7_(Childhood)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_8_(Celebrations)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_9_(Emergencies)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_10_(Accidents)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_11_(Television)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_12_(Movies)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_13_(Cooking)', 'https://en.wikiversity.org/wiki/Spanish_2/Chapter_14_(Picnics)', 'https://en.wikiversity.org/wiki/Spanish_2/Ch

In [12]:
spanish2_vocab = []
for url in url_list:
    spanish2_vocab.extend(get_vocab(url))

In [13]:
print(len(set(spanish2_vocab)))

1754


In [14]:
set(spanish2_vocab)

{'',
 'cielo',
 'asustado',
 'maíz',
 'tuviste',
 'ir a pie',
 'te',
 'aficionado',
 'cinturón',
 'jugoso',
 'en vivo',
 'planear',
 'discurso',
 'pasajera',
 'ancho(a)',
 'ejemplo',
 'doler',
 'navegar',
 'llegar',
 'paz',
 'lecciones',
 'cocida',
 'mediano(a)',
 'estar basada',
 'tuvisteis',
 'periodista',
 'éxito',
 'bandito(a)',
 'precio',
 'cinta adhesiva',
 'día de acción de gracias',
 'compartimiento',
 'bailarín, bailarína',
 'hasta',
 'sandía',
 'paseo',
 'loco(a',
 'estar',
 'estar pasado(a) de moda',
 'estrecho',
 'yeso',
 'galán',
 'visitar',
 'recorrido',
 'furiosa',
 'pronto',
 'hacer',
 'reunirse',
 'destruyeron',
 'tuvieron',
 'cayó',
 'destruir',
 'examen',
 'abraza',
 'enamorado(a) de',
 'rescatar',
 'al horno',
 'almorzáis',
 'plaza',
 'cajón',
 'recordar',
 'puesto',
 'incendio',
 'olvides',
 'viva',
 'vaya',
 'microondas',
 'dejar',
 'picadura',
 'jugabais',
 'jugar',
 'tráfico',
 'de pequeño',
 'complicada',
 'pelearse',
 'saluda',
 'hockey',
 'materiales',
 'cerr

## Gutenberg: An elementary spanish reader

In [15]:
soup_url = 'https://www.gutenberg.org/files/22065/22065-h/22065-h.htm#VOCABULARY'

In [16]:
def get_vocab(url):
    '''
    get a list of vocabulary scraped from the url given
    
    url: (str) an url lead to a list of vocabulary
    
    return: (list) a list of Spanish words
    '''
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    
    return [str(tag.string) for tag in soup.find('ul').findAll('b') if str(tag.string) != str(tag.string).upper()]

In [17]:
gutenberg_elementary_vocab = get_vocab(soup_url)

In [18]:
print(len(set(gutenberg_elementary_vocab)))
set(gutenberg_elementary_vocab)

2257


{'cielo',
 'luchando',
 'sur',
 'éste',
 'libren',
 'asustado',
 'preceder',
 'consuelo',
 'te',
 'leer',
 'menearse',
 'don',
 'precioso',
 'lleve',
 'ni ... ni',
 'contestó que sí',
 'testigo',
 'moveré',
 'venden',
 'en cambio',
 'vendrán',
 'obligó',
 'ejemplo',
 'lamentación',
 'peinarse',
 'momento',
 'sacre',
 'venida',
 'llegar',
 'atrever',
 'provincia',
 'encontrado',
 'colocó',
 'natural',
 'el que',
 'sabía',
 'recorrió',
 'motivo',
 'rudamente',
 'así que',
 'pasmaron',
 'precio',
 'cuadro',
 'seguido de',
 'vengo',
 'el rico',
 'hasta',
 'libertad',
 'paseo',
 'traía',
 'estar',
 'metamorfosis',
 'pagado',
 'tunante',
 'visitar',
 'responde',
 'los suyos',
 'pronto',
 'hacer',
 'activo',
 'punto',
 'destinado',
 'presentando',
 'taza',
 'conservó',
 'frío',
 'sombrar',
 'hierro',
 'enemigo',
 'cayó',
 'acredita',
 'exclama',
 'examen',
 'indispensable',
 'llamó',
 'retrocedió',
 'y qué',
 'menos',
 'hoy',
 'resignarse',
 'cargó',
 'vaya',
 'pasmarse',
 'hable Vd.',
 'deja

## Building Vocabulary from Frequency Lists

In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

##### Cleaned up (standardized) version of the 5k words list in https://en.wiktionary.org/wiki/User:Matthias_Buchmeier/Spanish_frequency_list-1-5000

In [20]:
urls = [
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish1000",
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish1001-2000",
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish2001-3000",
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish3001-4000",
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish4001-5000",
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish5001-6000",
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish6001-7000",
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish7001-8000",
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish8001-9000",
    "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Spanish9001-10000",
]

In [21]:
df_list = []
for url in urls:
    html = requests.get(url).content
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("tbody")
    for row in table.find_all("tr")[1:]:
        row = row.text.lower().strip().split("\n")
        df_list.append(list(filter(lambda e: e != "", row))[1:])
df = pd.DataFrame(df_list, columns = ["word", "occurrences", "lemma"])
print(df)

              word occurrences         lemma
0              que       32894           que
1               de       32116            de
2               no       29897            no
3                a       22313             a
4               la       21127            la
...            ...         ...           ...
9995         enoja           5        enojar
9996        empujó           5       empujar
9997        dilema           5        dilema
9998  desconectado           5  desconectado
9999     deprimida           5     deprimido

[10000 rows x 3 columns]


## Combining Spanish 1 & 2 Vocab

In [22]:
a1_vocab = set(spanish1_vocab) | set(spanish2_vocab)

In [23]:
len(a1_vocab)

3717

In [24]:
N = 500
print(f"What fraction of words in the top-{N} most frequent words already appear in the scraped vocabulary from Spanish 1 & 2?")
print((N - (len(a1_vocab | set(df["word"][:N])) - len(a1_vocab)))/N)

What fraction of words in the top-500 most frequent words already appear in the scraped vocabulary from Spanish 1 & 2?
0.72


In [25]:
# a1_vocab = a1_vocab | set(df["word"][:500])
# len(a1_vocab)

## Combining Gutenberg vocab into Span 1+2 vocab

In [26]:
a1_gut_vocab = a1_vocab | set(gutenberg_elementary_vocab)

In [27]:
len(a1_gut_vocab)

5292

In [28]:
N = 500
print(f"What fraction of words in the top-{N} most frequent words already appear in the scraped vocabulary from Spanish 1 & 2 + elementary Gutenberg?")
print((N - (len(a1_gut_vocab | set(df["word"][:N])) - len(a1_gut_vocab)))/N)

What fraction of words in the top-500 most frequent words already appear in the scraped vocabulary from Spanish 1 & 2 + elementary Gutenberg?
0.868


## B-level texts

In [29]:
from bs4 import BeautifulSoup, NavigableString
from urllib.request import urlopen
import re
import time

In [30]:
soup_url = 'https://press.rebus.community/aalh/'

In [31]:
def get_text_links(soup):
    '''
    get a list of urls that lead to the texts
    
    soup: (BeautifulSoup) an html parsed bs object
    
    return: (list) a list of urls
    '''
    tag_list = soup.findAll("li", {"id" : re.compile(r'toc-chapter-.*')})
    url_list = []
    is_text = False
    for tag in tag_list:
        if tag.find('a').contents[0] == 'Introducción':
            is_text = True
            continue
        elif is_text == True:
            url_list.append(tag.find('a').attrs['href'])
            is_text = False
            
    return url_list

In [32]:
soup = BeautifulSoup(urlopen(soup_url), 'html.parser')
url_list = get_text_links(soup)

print(url_list)

['https://press.rebus.community/aalh/chapter/lo-que-sucedio-a-un-mancebo-el-dia-que-se-caso/', 'https://press.rebus.community/aalh/chapter/carta-a-luis-de-santangel/', 'https://press.rebus.community/aalh/chapter/tercera-carta-relacion-de-hernan-cortes-al-emperador/', 'https://press.rebus.community/aalh/chapter/romance-del-enamorado-y-la-muerte/', 'https://press.rebus.community/aalh/chapter/la-vida-de-lazarillo-de-tormes-y-de-sus-fortunas-y-adversidades/', 'https://press.rebus.community/aalh/chapter/historia-de-la-monja-alferez/', 'https://press.rebus.community/aalh/chapter/a-su-retrato/', 'https://press.rebus.community/aalh/chapter/viaje-a-la-habana-carta-1/', 'https://press.rebus.community/aalh/chapter/la-flor-de-la-cana/', 'https://press.rebus.community/aalh/chapter/al-partir/', 'https://press.rebus.community/aalh/chapter/emancipacion-moral-de-la-mujer/', 'https://press.rebus.community/aalh/chapter/la-rosa-de-pasion/', 'https://press.rebus.community/aalh/chapter/las-medias-rojas/', '

In [33]:
def get_text(url):
    '''
    get a dictionary of the content and metadata of a text scraped from the url given
    
    url: (str) an url lead to a text
    
    return: (dict) a dictionary with 3 keys: 'author', 'title', and 'content'
    '''
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    text_dict = {}
    
    text_dict['title'] = soup.find('h1', {'class':'entry-title'}).contents[0].strip()
    for child in soup.find('h2').contents:
        if isinstance(child, NavigableString):
            text_dict['author'] = child.strip()
            break
    
    content_section = soup.find('section', {'data-type':'chapter'})
    content = []
    for tag in content_section.contents:
        if tag.name == 'p':
            paragraph = []
            for child in tag.contents:
                if isinstance(child, NavigableString):
                    paragraph.append(child)
            if paragraph:
                content.append(''.join(paragraph).strip().replace("\xa0", ""))
    
    text_dict['content'] = content
    return text_dict

In [34]:
aalh_corpus = []
for url in url_list:
    aalh_corpus.append(get_text(url))
    time.sleep(0.3)

In [35]:
aalh_corpus[0]

{'title': 'Lo que sucedió a un mancebo el día que se casó',
 'author': 'Don Juan Manuel',
 'content': ['Un día hablaba el Conde Lucanor con Patronio, su consejero, y le decía:',
  '—Patronio, un pariente mío me ha dicho que lo quieren casar con una mujer muy rica, y aunque es más honrada que él, el casamiento sería muy bueno para él si no fuera por un embargo que ahí hay, y el embargo es éste: Me dijo que le dijeron otros que aquella mujer era la más fuerte y la más brava cosa del mundo, y ahora ruego a vos que me aconsejéis si le mande que case con aquella mujer—pues sabe de cual manera es—, o si le mande que lo no haga.',
  '—Señor conde Lucanor —dijo Patronio— si él fuera tal como fue un hijo de un hombre bueno que era moro, aconsejadle que case con ella; más si no fuere tal, no se lo aconseja. Y el conde le rogó que le dijera cómo era aquello.',
  'Patronio le dijo que en una villa vivía un moro honrado que vivía con un hijo, el mejor mancebo que en el mundo podría ser, pero no era

In [36]:
len(aalh_corpus)

21

In [37]:
all_text = ""
for text in aalh_corpus:
    all_text += " ".join(text["content"]) + " "
all_toks = all_text.split(" ")
print("num of tokens (approx.)", len(all_toks))
print("num of types (approx.)", len(set(all_toks)))

num of tokens (approx.) 32819
num of types (approx.) 9645


## A-level texts

In [38]:
###################################### A1-level texts ###################################
#### stories stored in str_story_list as string


wb_address = (
    "https://www.gutenberg.org/files/22065/22065-h/22065-h.htm#EL_CUENTO_DEL_POLLO"
)
f = urlopen(wb_address)
html_soup = BeautifulSoup(f, "lxml")
for node in html_soup.find_all("h2"):
    if "class" in node.attrs.keys() and node["class"] == ["e"]:
        first_node = node
        break

story_list = []
next_node = first_node
while next_node != None:
    story = u""
    next_node = None
    # print(first_node.get_text())
    for tag in first_node.next_siblings:
        # print(str(tag))
        if tag.name == "h2" and "class" in tag.attrs.keys() and tag["class"] == ["e"]:
            story_list.append(story)
            first_node = tag
            next_node = tag
            # print(tag.get_text())
            break
        else:
            story += str(tag)

next_node = first_node
while next_node != None:
    story = u""
    try:
        story += first_node.get_text()
    except:
        story += str(first_node)
    next_node = None
    for tag in first_node.next_siblings:
        # print(str(tag))
        if tag.name == "p" and "class" in tag.attrs.keys() and tag["class"] == ["f"]:
            story_list.append(story)
            first_node = tag
            next_node = tag
            story += str(tag)
            # print(tag.get_text())
            break
        else:
            story += str(tag)

str_story_list = []
for i in range(len(story_list)):
    story_soup = BeautifulSoup(story_list[i], "lxml")
    str_story = story_soup.get_text()
    str_story_list.append(str_story.replace("\xa0", ""))

In [39]:
len(str_story_list)

42

In [40]:
print(str_story_list[41])

LA ABEJA Y EL CUCLILLO

Saliendo del colmenar,
Dijo al cuclillo la abeja;                   270
"Calla, porque no me deja
Tu ingrata voz trabajar.
No hay ave tan fastidiosa
En el cantar, como tú:
Cucú, cucú, y más cucú:                    275
Y siempre una misma cosa."
—"¿Te cansa mi canto igual?
(El cuclillo respondió:)
Pues a fe que no hallo yo
Variedad en tu panal.                     280
Y pues que del propio modo
Fabricas uno que ciento,
Si yo nada nuevo invento,
En ti es viejísimo todo."
A esto la abeja replica:                   285
"En obra de utilidad
La falta de variedad
No es lo que más perjudica.
Pero en obra destinada
Sólo al gusto y diversión,                   290
Si no es varia la invención,
Todo lo demás es nada."

La variedad es requisito indispensable en las obras de gusto.



## Investigating Overlap between A1 vocab and A1 text

In [41]:
a1_text = ""
for story in str_story_list:
    a1_text += " ".join(story.lower().strip().split('\n')).strip()
a1_tokens = a1_text.split(" ")

In [42]:
a1_tokens = list(filter(lambda x: x != "", a1_tokens))
print(len(a1_tokens))
print(len(set(a1_tokens)))

10405
3083


### Overlap between Span 1 + 2 vocab and A1 text

In [43]:
count = 0
for t in a1_tokens:
    if t in a1_vocab:
        count += 1
print(f"Fraction of tokens in A1 Text that are from A1 Vocabulary: {count/len(a1_tokens)}")

Fraction of tokens in A1 Text that are from A1 Vocabulary: 0.5742431523306103


In [44]:
len(set(a1_tokens).intersection(a1_vocab))/len(set(a1_tokens))

0.17353227375932534

### Overlap between Span 1 + 2 + elementary Gutenberg vocab and A1 text

In [45]:
count = 0
for t in a1_tokens:
    if t in a1_gut_vocab:
        count += 1
print(f"Fraction of tokens in A1 Text that are from A1 Vocabulary: {count/len(a1_tokens)}")

Fraction of tokens in A1 Text that are from A1 Vocabulary: 0.7128303700144162


In [46]:
len(set(a1_tokens).intersection(a1_gut_vocab))/len(set(a1_tokens))

0.39928640934155046

## Investigating Overlap between A1 vocab and B-level text

In [47]:
b_tokens = list(filter(lambda x: x != "", all_toks))
print(len(b_tokens))
print(len(set(b_tokens)))

32378
9644


### Overlap between Span 1 + 2 vocab and B-level text

In [48]:
count = 0
for t in b_tokens:
    if t in a1_vocab:
        count += 1
print(f"Fraction of tokens in B-level Text that are from A1 Vocabulary: {count/len(b_tokens)}")

Fraction of tokens in B-level Text that are from A1 Vocabulary: 0.5383902649947495


In [49]:
len(set(b_tokens).intersection(a1_vocab))/len(set(b_tokens))

0.09404811281625881

### Overlap between Span 1 + 2 + elementary Gutenberg vocab and B-level text

In [50]:
count = 0
for t in b_tokens:
    if t in a1_gut_vocab:
        count += 1
print(f"Fraction of tokens in B-level Text that are from A1 Vocabulary: {count/len(b_tokens)}")

Fraction of tokens in B-level Text that are from A1 Vocabulary: 0.5887948607078881


In [51]:
len(set(b_tokens).intersection(a1_gut_vocab))/len(set(b_tokens))

0.14091663210286187