In [1]:
#!pip install pylabeador
from utils import text_processor, read_corpus
import pylabeador
import os
import json
import re
import numpy as np
import pandas as pd
from collections import defaultdict
import altair as alt

### Load texts into a list of dictionaries

In [2]:
# function for loading text

def file_to_dict_list(filename):
    ''' Read a UTF-8 encoded json file and return its parsed contents.
    ------------------------------------------
    Argument: 
       filename: (str) path of the json file to read
    Returns:
        the parsed json document; the corpus files used in this notebook hold a list of
        dictionaries, one per paragraph / chapter of a Spanish text
    '''

    with open(filename, encoding = 'utf-8') as handle:
        raw_json = handle.read()

    # Parsing happens after the file is closed; the result is the same as json.load(handle).
    return json.loads(raw_json)

### For testing, set up directory and files

In [3]:
# Directory that holds the corpus json files. The original author's local checkout stays
# the default so existing runs are unchanged; set the FHIS_CORPUS_DIR environment variable
# to run the notebook on another machine.
text_dir = os.environ.get(
    'FHIS_CORPUS_DIR',
    '/Users/eun-youngchristinapark/MDS-CAPSTONE/capstone_FHIS/corpus/',
)
# Later cells build paths as `text_dir + filename`, so make sure there is a trailing separator.
text_dir = os.path.join(text_dir, '')
# NOTE: os.listdir returns entries in arbitrary order, so positional indexing into
# file_list (e.g. file_list[-3]) is not guaranteed to pick the same file everywhere.
file_list = os.listdir(text_dir)

In [4]:
# Load the corpus from text_dir with the project helper. Later cells index the result by
# level label ('A1', 'A2', 'B1', 'B') and iterate over dicts that carry 'level', 'title'
# and 'content' keys.
corpus = read_corpus(text_dir)

In [5]:
first_spanish_reader_corpus = file_list[-3]
first_spanish = file_to_dict_list(text_dir + first_spanish_reader_corpus)

# Describe the loaded list and the fields of its first entry.
report = [
    f'dictionary list type: {type(first_spanish)}',
    f'length of the dictionary list: {len(first_spanish)}',
    f'type of dictionary list element: {type(first_spanish[0])}',
    f'keys in the dictionary list element: {first_spanish[0].keys()}',
    f"source of the first element in the list: {first_spanish[0]['source']}",
    f"author: {first_spanish[0]['author']}",
    f"title: {first_spanish[0]['title']}",
    f"level: {first_spanish[0]['level']}",
    f"content: {first_spanish[0]['content']}",
]
# The extra '\n' argument leaves a blank line after every entry.
for entry in report:
    print(entry, '\n')

dictionary list type: <class 'list'> 

length of the dictionary list: 56 

type of dictionary list element: <class 'dict'> 

keys in the dictionary list element: dict_keys(['source', 'author', 'title', 'level', 'content']) 

source of the first element in the list: https://www.gutenberg.org/files/15353/15353-h/15353-h.htm 

author: ERWIN W. ROESSLER, PH.D. 

title: A First Spanish Reader 

level: A1 

content: 1. LA ESCUELA
Voy a la escuela. Voy a la escuela el lunes,
el martes, el miércoles, el jueves y el viernes.
El sábado y el domingo no voy a la escuela.
El sábado y el domingo estoy en casa. Soy un
discípulo y estoy en la escuela. El discípulo
aprende. Aprendo la aritmética, a leer y a
escribir. Vd. aprende el español. Todos nosotros
aprendemos diligentemente. Algunos discípulos
no son diligentes. Algunos son perezosos.
El maestro elogia a los discípulos diligentes y a
los discípulos obedientes. Él no elogia a los
alumnos perezosos.
El maestro enseña. Mi maestro enseña el
español.

### Fernandez-Huerta Score calculation

The Fernández-Huerta score is the Spanish-language adaptation of the Flesch reading-ease score; higher values indicate easier text. As implemented below:

$$\text{FH} = 206.835 - 102\,\frac{\text{sentences}}{\text{words}} - 60\,\frac{\text{syllables}}{\text{words}}$$

Please see the original paper (in Spanish), Fernández Huerta (1959), *Medidas sencillas de lecturabilidad. Consigna, 214, 29–32,* and
the discussion of this metric [here](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5831059/#:~:text=The%20Fernandez%2DHuerta%20Formula%20%28Fern%C3%A1ndez,formulae%20%28Flesch%2C%201948%29.&text=The%20output%20is%20an%20index,representing%20a%20more%20difficult%20text).
    

In [10]:
# Remove titles
# A title is matched as: one or more digits, a period, at least one more character, and the
# newline that ends the line, e.g. "1. LA ESCUELA\n".
# NOTE(review): the pattern is not anchored to the start of a line, so a number followed by
# a period in the middle of a line (e.g. "... en 1990. Luego ...\n") is also removed up to
# the end of that line - confirm this is intended.
regex = r'[0-9]+\.[^\n]+\n'
def remove_titles(text):
    '''Remove every numbered title line from the text.
    ---------------------------------------
    Argument:
        text (str): raw text that may contain title lines such as "1. LA ESCUELA"
    Returns:
        the text (str) with every match of `regex` deleted; text without a match is
        returned unchanged
    '''
    # re.sub deletes all non-overlapping matches in one pass. The previous implementation
    # sliced the progressively shortened string with spans computed on the original string,
    # which corrupted the text as soon as it contained more than one title.
    return re.sub(regex, '', text)

def fernandez_huerta_score(text):
    '''This function calculates the Fernandez-Huerta score (the Spanish counterpart of the
    Flesch score) of the given text:

        score = 206.835 - 102 * (sentences / words) - 60 * (syllables / words)

    A "word" is a token that contains at least one letter (so 'Vd.' counts and '?' does not)
    and that pylabeador could split into syllables. Numbered title lines are removed with
    remove_titles before anything is counted.
    ---------------------------------------
    Argument: 
        text (str): a string which is a piece of Spanish text
    Returns:
        Fernandez-Huerta score (float), or 206 when there is nothing to score (empty text,
        no sentence, or no word that could be syllabified)
    '''
    empty_score = 206                       ### nothing to score: treat as very very easy to read

    text = remove_titles(text)
    tp = text_processor(text)

    num_sents = len(tp.sents)
    if text == '' or num_sents == 0:
        return empty_score

    num_words = 0
    num_syl = 0
    for sent_tokens in tp.tokens:           ### tp.tokens is walked as a list of token lists
        for token in sent_tokens:
            letters = ''.join(ch for ch in token if ch.isalpha())    ### get rid of non-alphabets in the token
            if not letters:                 ### tokens without any letter (punctuation, numbers) are not words
                continue
            try:
                syllables = pylabeador.syllabify(letters)
            except Exception:               ### there are alphabets such as ª which cannot be processed:
                continue                    ###     leave such tokens out of both counts
            num_syl += len(syllables)
            num_words += 1

    # Every alphabetic token may have been rejected by the syllabifier; return the fallback
    # instead of dividing by zero.
    if num_words == 0:
        return empty_score

    # see https://support.rankmath.com/ticket/flesch-readability-works-for-other-languages/ and 
    #     https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5831059/
    # for Spanish flesch score. 
    # Only alphabetic, syllabifiable tokens are used as the word count (not all tokens).
    fh_score = 206.835 - 102 * (num_sents / num_words) - 60 * (num_syl / num_words)

    return fh_score

### Tests

#### 1. Edge cases

In [7]:
# Empty input: there is nothing to score, so the function returns its 206 fallback.
assert fernandez_huerta_score('') == 206

In [8]:
# Punctuation only: no token contains a letter, so the 206 fallback applies.
assert fernandez_huerta_score('?') == 206

In [9]:
# Digits and a period only: still no token with a letter, so the 206 fallback applies.
assert fernandez_huerta_score('1.') == 206

#### 2. Brute-force calculations vs. Fernandez_huerta_score implementation

In [11]:
# Hand-computed reference: strip punctuation and digits, split on whitespace, syllabify
# every word and apply the Fernandez-Huerta formula directly.
text = 'Voy a la escuela el lunes, el martes, el miércoles, el jueves y el viernes.'
text = ''.join(c for c in text if c not in {'?',',','.','0','1','2','3','4','5','6','7','8','9'})
tokens = text.split()
num_syl = 0
for token in tokens:
    syl_list = pylabeador.syllabify(token)
    print(syl_list)
    num_syl += len(syl_list)
num_sents = 1
num_tokens = len(tokens)
manual_score = 206.835 - 102 * (num_sents/num_tokens) - 60 * (num_syl / num_tokens)
print(manual_score)
# Compare with a tolerance: exact float equality only holds while both sides evaluate the
# formula in exactly the same order. Note that the stripped text is what gets scored here.
np.testing.assert_allclose(fernandez_huerta_score(text), manual_score)

['Voy']
['a']
['la']
['es', 'cue', 'la']
['el']
['lu', 'nes']
['el']
['mar', 'tes']
['el']
['miér', 'co', 'les']
['el']
['jue', 'ves']
['y']
['el']
['vier', 'nes']
108.035


In [12]:
# Hand-computed reference: strip punctuation and digits, split on whitespace, syllabify
# every word and apply the Fernandez-Huerta formula directly.
text = 'Este maestro enseña las matemáticas y aquel maestro el inglés.'
text = ''.join(c for c in text if c not in {'?',',','.','0','1','2','3','4','5','6','7','8','9'})
tokens = text.split()
num_syl = 0
for token in tokens:
    syl_list = pylabeador.syllabify(token)
    print(syl_list)
    num_syl += len(syl_list)
num_sents = 1
num_tokens = len(tokens)
manual_score = 206.835 - 102 * (num_sents/num_tokens) - 60 * (num_syl / num_tokens)
print(manual_score)
# Compare with a tolerance: exact float equality only holds while both sides evaluate the
# formula in exactly the same order. Note that the stripped text is what gets scored here.
np.testing.assert_allclose(fernandez_huerta_score(text), manual_score)

['Es', 'te']
['ma', 'es', 'tro']
['en', 'se', 'ña']
['las']
['ma', 'te', 'má', 'ti', 'cas']
['y']
['a', 'quel']
['ma', 'es', 'tro']
['el']
['in', 'glés']
58.63500000000002


In [13]:
# Hand-computed reference for a four-sentence text. The original text (with punctuation and
# the abbreviation 'Vd.') is what gets scored; the stripped copy is only used for the
# manual word and syllable counts.
text_orig = 'Vd. aprende el español. Todos nosotros aprendemos diligentemente. Algunos discípulos no son diligentes. Algunos son perezosos.'
text = ''.join(c for c in text_orig if c not in {'?',',','.','0','1','2','3','4','5','6','7','8','9'})
tokens = text.split()
num_syl = 0
for token in tokens:
    syl_list = pylabeador.syllabify(token)
    print(syl_list)
    num_syl += len(syl_list)
num_sents = 4
num_tokens = len(tokens)
manual_score = 206.835 - 102 * (num_sents/num_tokens) - 60 * (num_syl / num_tokens)
print(manual_score)
# Compare with a tolerance: exact float equality only holds while both sides evaluate the
# formula in exactly the same order.
np.testing.assert_allclose(fernandez_huerta_score(text_orig), manual_score)

['Vd']
['a', 'pren', 'de']
['el']
['es', 'pa', 'ñol']
['To', 'dos']
['no', 'so', 'tros']
['a', 'pren', 'de', 'mos']
['di', 'li', 'gen', 'te', 'men', 'te']
['Al', 'gu', 'nos']
['dis', 'cí', 'pu', 'los']
['no']
['son']
['di', 'li', 'gen', 'tes']
['Al', 'gu', 'nos']
['son']
['pe', 're', 'zo', 'sos']
16.335000000000008


#### 3. Read json file and see if the numbers make sense

#### Take average of the A Level Train Corpus 

In [14]:
# Score every A-level text (A1 first, then A2) and collect the scores in corpus order.
level_list = ['A1', 'A2']
A_level_scores = []
for level in level_list:
    corpus_item = corpus[level]
    for i, text_item in enumerate(corpus_item):
        try: 
            score = fernandez_huerta_score(text_item['content'])
            print(text_item['level'], text_item['title'], score)
            A_level_scores.append(score)
        except Exception as err:
            # Best effort: report which text failed and why, then carry on with the rest.
            # A skipped text leaves A_level_scores shorter than the corpus, so any code that
            # labels scores by position must take that into account.
            print('Error', level, i, repr(err))

print(np.mean(A_level_scores))

A1 Mi día 88.48385496183208
A1 Familia pequeña 89.86870786516855
A1 Mi nueva casa 83.61441176470589
A1 El parque 96.89951612903225
A1 Mi pueblo 89.23500000000001
A1 An Elementary Spanish Reader 94.81207736389683
A1 An Elementary Spanish Reader 72.53960048426153
A1 An Elementary Spanish Reader 88.38236842105265
A1 An Elementary Spanish Reader 92.45912993039445
A1 An Elementary Spanish Reader 83.31500000000001
A1 An Elementary Spanish Reader 89.49959627329193
A1 An Elementary Spanish Reader 84.52451612903226
A1 An Elementary Spanish Reader 80.87554054054053
A1 An Elementary Spanish Reader 90.38767605633804
A1 An Elementary Spanish Reader 86.21369822485208
A1 An Elementary Spanish Reader 89.19670212765959
A1 An Elementary Spanish Reader 84.86357142857145
A1 An Elementary Spanish Reader 96.69042168674699
A1 An Elementary Spanish Reader 91.9617605633803
A1 An Elementary Spanish Reader 83.19649732620323
A1 An Elementary Spanish Reader 93.79041401273885
A1 An Elementary Spanish Reader 103.782

In [26]:
# Spot-check the score of a few B1 texts, selected by their position in the corpus.
level, i_list = 'B1', [0, 15, 22, 40]
for i in i_list:
    selected_text = corpus[level][i]
    score = fernandez_huerta_score(selected_text['content'])
    print(score)

84.67256345177667
85.42525787965614
83.21100000000001
97.73640845070425


In [27]:
# Spot-check the score of a few B texts, selected by their position in the corpus.
level, i_list = 'B', [13,14,20,28,37,38,39,41,47,52,57,60,74,76,78,84,87,89,95,96]
for i in i_list:
    selected_text = corpus[level][i]
    score = fernandez_huerta_score(selected_text['content'])
    print(score)

83.08102026049205
80.89748412496826
71.90540957202026
94.29911483253588
89.83864520048603
91.1658550185874
91.45264705882353
89.84156455142232
106.18578534031414
91.59279162956368
84.1552614379085
92.32300000000002
94.78371794871795
86.5643233082707
87.99162650602409
93.11000000000001
92.70761146496815
86.57309523809525
91.88336895388076
92.83500000000001


#### Take average of the B Level Train Corpus 

In [15]:
# Score every B-level text (B1 first, then B) and collect the scores in corpus order.
level_list = ['B1', 'B']
B_level_scores = []
for level in level_list:
    corpus_item = corpus[level]
    for i, text_item in enumerate(corpus_item):
        try: 
            score = fernandez_huerta_score(text_item['content'])
            print(text_item['level'], text_item['title'], score)
            B_level_scores.append(score)
        except Exception as err:
            # Best effort: report which text failed and why, then carry on with the rest.
            # A skipped text leaves B_level_scores shorter than the corpus, so any code that
            # labels scores by position must take that into account.
            print('Error', level, i, repr(err))

print(np.mean(B_level_scores))

B1 Barcelona 84.67256345177667
B1 Comparaciones 85.76253623188406
B1 El día de la Hispanidad 78.162731092437
B1 En el hotel 81.78581967213115
B1 Mi mejor amigo 84.59619402985076
B1 Miembros de la familia 87.58500000000001
B1 La Navidad en España 88.6101479289941
B1 Pasatiempos 74.80529702970296
B1 La tienda de ropa 82.42875000000001
B1 Tópicos falsos sobre los españoles 86.18551546391754
B1 Mi viaje a Roma 76.32372180451128
B1 Spanish Tales for Beginners 87.77286788750818
B1 Spanish Tales for Beginners 88.21364077669904
B1 Spanish Tales for Beginners 92.59457257346394
B1 Spanish Tales for Beginners 86.77978527607361
B1 Spanish Tales for Beginners 85.42525787965614
B1 Spanish Tales for Beginners 85.06386866059817
B1 Spanish Tales for Beginners 83.84926307448495
B1 Spanish Tales for Beginners 89.0821482889734
B1 Spanish Tales for Beginners 86.07406705539358
B1 Spanish Tales for Beginners 80.39455678670362
B1 Spanish Tales for Beginners 84.67357566765581
B1 Spanish Tales for Beginners 83.

In [46]:
# Number of texts stored under each corpus level.
print(f"Count of A1: {len(corpus['A1'])}")
print(f"Count of A2: {len(corpus['A2'])}")
print(f"Count of B1: {len(corpus['B1'])}")
print(f"Count of B: {len(corpus['B'])} \n")

# Number of scores actually collected per group. These equal A1 + A2 and B1 + B only when
# no text was skipped by the scoring loops.
print(f"Count of A total:{len(A_level_scores)}")
print(f"Count of B total:{len(B_level_scores)}")

Count of A1: 94
Count of A2: 62
Count of B1: 42
Count of B: 110 

Count of A total:156
Count of B total:152


### Distribution of the score by level

In [39]:
# Attach a level label to every score. The score lists were filled level by level (A1 then
# A2, B1 then B), so the first len(corpus[first_level]) entries of each list belong to the
# first level and the remaining ones to the second. The boundaries are taken from the corpus
# itself instead of being hard-coded.
n_a1 = len(corpus['A1'])
n_b1 = len(corpus['B1'])

# Positional labelling is only valid when no text was skipped during scoring; fail loudly
# rather than plot silently mislabelled data.
assert len(A_level_scores) == n_a1 + len(corpus['A2']), \
    'A-level scores do not match the corpus size; level labels would be misaligned'
assert len(B_level_scores) == n_b1 + len(corpus['B']), \
    'B-level scores do not match the corpus size; level labels would be misaligned'

dict_score = defaultdict(list)
for i, item in enumerate(A_level_scores):
    dict_score['level'].append('A1' if i < n_a1 else 'A2')
    dict_score['score'].append(item)

for i, item in enumerate(B_level_scores):
    dict_score['level'].append('B1' if i < n_b1 else 'B')
    dict_score['score'].append(item)
        
    

In [40]:
# One row per scored text, with its level label and its score.
level_score_df = pd.DataFrame(dict_score)

# Rows that belong to the A levels only.
is_a_level = level_score_df['level'].isin(['A1','A2'])
A_score_df = level_score_df.loc[is_a_level]

In [41]:
# Histogram of the scores (at most 30 bins), drawn as one facet per level.
score_axis = alt.X('score', bin = alt.Bin(maxbins = 30))
(
    alt.Chart(level_score_df)
    .mark_bar()
    .encode(x = score_axis, y = 'count()')
    .facet('level')
)

In [43]:
# Mean Fernandez-Huerta score of each level group, shown with three decimals.
print(f'Average score for A: {np.mean(A_level_scores):.3f}')
print(f'Average score for B: {np.mean(B_level_scores):.3f}')

Average score for A: 83.370
Average score for B: 91.140
