## Analyze word difficulty

### Set up

In [55]:
from pathlib import Path

from wdiff.analyzer import Analyzer
import pandas as pd

In [56]:
# Folder holding the cleaned word lists, relative to this notebook.
path_words = Path('..', 'Instruments and materials', 'Data Collection', 'Words', 'Cleaned words')

In [57]:
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0.
# Squeezing the single-column frame explicitly yields the same `word` Series
# without the deprecation warning.
words = (
    pd.read_csv(path_words / '3a-words_analyzed_cleaned_meaning.csv')
    .squeeze('columns')
)
words.head()



  words = pd.read_csv(path_words / '3a-words_analyzed_cleaned_meaning.csv', squeeze=True)


0      abarcar
1      abjurar
2     ablandar
3     abolidos
4    abolieren
Name: word, dtype: object

In [58]:
# Wrap the cleaned word Series in the project's Analyzer and run its analyses
# (presumably this is what populates `analyzer.results` -- confirm in wdiff).
analyzer = Analyzer(words)
analyzer.run_all_analyses()
# Keep the results under their own name. The displayed frame has one row per
# word with the columns word_objs, text, length, silent_letters,
# shared_phonemes and total_difficulty.
words_analyzed = analyzer.results
words_analyzed.head()

                                    word_objs         text  length  \
0     <wdiff.word.Word object at 0x11cb70ac0>      abarcar       7   
1     <wdiff.word.Word object at 0x11cb708b0>      abjurar       7   
2     <wdiff.word.Word object at 0x11cb70be0>     ablandar       8   
3     <wdiff.word.Word object at 0x11cb70c40>     abolidos       8   
4     <wdiff.word.Word object at 0x11cb70ca0>    abolieren       9   
...                                       ...          ...     ...   
2052  <wdiff.word.Word object at 0x11cbc37f0>  disciplinal      11   
2053  <wdiff.word.Word object at 0x11cbc3850>      incluso       7   
2054  <wdiff.word.Word object at 0x11cbc38b0>        lista       5   
2055  <wdiff.word.Word object at 0x11cbc3910>        mecer       5   
2056  <wdiff.word.Word object at 0x11cbc3970>       puesto       6   

      silent_letters  shared_phonemes  total_difficulty  
0                  0                2                 9  
1                  0                2      

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
0,abarcar,7,0,2,9
1,abjurar,7,0,2,9
2,ablandar,8,0,1,9
3,abolidos,8,0,2,10
4,abolieren,9,0,1,10


## Select words based on difficulty

- There will be 24 words
- The word list must have different levels of each characteristic
- The words should be "close" to equally distributed


The process is completely reproducible: every sample is drawn with a fixed random seed (`random_state=2`).

General function for getting words by threshold

In [59]:
def sample_words_by_threshold(var, threshold, direction, n=4, random_state=2, data=None):
    """Sample words whose value of `var` compares to `threshold` as requested.

    Parameters
    ----------
    var : str
        Name of the column to filter on (e.g. ``'length'``).
    threshold : scalar
        Value the column is compared against.
    direction : str
        Comparison operator placed between column and threshold in the query
        string, e.g. ``'>'``, ``'<'`` or ``'=='``.
    n : int, default 4
        Number of words to draw.
    random_state : int, default 2
        Seed passed to ``DataFrame.sample`` so the draw is reproducible.
    data : pandas.DataFrame, optional
        Frame to sample from. Defaults to the notebook-level
        ``words_analyzed`` frame.

    Returns
    -------
    pandas.DataFrame
        The `n` sampled rows, keeping their original index labels.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when fewer than `n` rows satisfy the
        condition.
    """
    source = words_analyzed if data is None else data
    # `@threshold` makes `query` read the local variable of that name.
    matching = source.query(f'{var} {direction} @threshold')
    return matching.sample(n, random_state=random_state)

### Length

Options:
- Above and below average
- Below average and max
- Below the median (q < .5) and above the 95th percentile (q > .95) — the option used below

In [60]:
# Long words: strictly above the 95th percentile of word length.
threshold_high = words_analyzed['length'].quantile(.95)
words_length_high = sample_words_by_threshold(
    var='length', threshold=threshold_high, direction='>'
)

# double check
words_length_high

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
1098,perdurablemente,15,0,1,16
1177,presumiblemente,15,0,2,17
139,apresuradamente,15,0,1,16
644,europarlamentario,17,0,0,17


In [61]:
# Short words: strictly below the median length (0.5 is `quantile`'s default).
threshold_low = words_analyzed['length'].quantile(0.5)
words_length_low = sample_words_by_threshold(
    var='length', threshold=threshold_low, direction='<'
)

words_length_low

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
963,mimador,7,0,0,7
1672,endoso,6,0,1,7
1833,venda,5,0,1,6
1401,subir,5,0,2,7


### Silent letters
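- max vs. below max (the low group is drawn from every word under the maximum, so it can include values other than the minimum)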

In [62]:
# Highest silent-letter count found among the analyzed words.
threshold_high = words_analyzed['silent_letters'].max()

words_silent_high = sample_words_by_threshold(
    var='silent_letters', threshold=threshold_high, direction='=='
)

words_silent_high

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
777,horqueta,8,2,1,11
1213,quebrantahuesos,15,2,4,21
775,hormiguero,10,2,0,12
764,hinque,6,2,1,9


In [63]:
# The low group is everything strictly below the maximum silent-letter count,
# so the cut-off is still the maximum (hence the `threshold_high` name).
threshold_high = words_analyzed['silent_letters'].max()
words_silent_low = sample_words_by_threshold(
    var='silent_letters', threshold=threshold_high, direction='<'
)

words_silent_low

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
1462,tintura,7,0,0,7
1766,salada,6,0,1,7
459,descongestionar,15,0,4,19
1785,zigzagueante,12,1,2,15


### Shared phonemes
- none (0 shared phonemes) vs. above the 90th percentile

In [64]:
# Many shared phonemes: strictly above the 90th percentile.
threshold_high = words_analyzed['shared_phonemes'].quantile(.9)
words_shared_phonemes_high = sample_words_by_threshold(
    var='shared_phonemes', threshold=threshold_high, direction='>'
)

words_shared_phonemes_high

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
548,ejecutiva,9,0,3,12
349,constitucional,14,0,3,17
610,escabullimiento,15,0,4,19
616,escondidijo,11,0,3,14


In [65]:
# No shared phonemes at all.
words_shared_phonemes_low = sample_words_by_threshold(
    var='shared_phonemes', threshold=0, direction='=='
)

words_shared_phonemes_low

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
173,aureolar,8,0,0,8
1190,proliferar,10,0,0,10
746,guindar,7,1,0,8
914,machetear,9,0,0,9


### Integrate words into final word pool

In [66]:
# Stack the six samples (high/low for each characteristic) into one word pool.
words_final = pd.concat([
    words_length_high, words_length_low,
    words_silent_high, words_silent_low,
    words_shared_phonemes_high, words_shared_phonemes_low,
])

# double check
print('# words', words_final.shape)
words_final

# words (24, 5)


Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
1098,perdurablemente,15,0,1,16
1177,presumiblemente,15,0,2,17
139,apresuradamente,15,0,1,16
644,europarlamentario,17,0,0,17
963,mimador,7,0,0,7
1672,endoso,6,0,1,7
1833,venda,5,0,1,6
1401,subir,5,0,2,7
777,horqueta,8,2,1,11
1213,quebrantahuesos,15,2,4,21


In [67]:
# Rename the text column to `word` (rebinding rather than mutating in place),
# then write the final pool next to the other cleaned word files.
words_final = words_final.rename(columns={'text': 'word'})
words_final.to_csv(path_words / '4-words_final.csv', index=False)

### Make sure word pool doesn't contain duplicates

Count duplicates

In [68]:
# Number of words that repeat an earlier entry; 0 means the pool is unique.
int(words_final['word'].duplicated().sum())

0

## Check difficulty of word pool

Use frequencies

In [69]:
# How many selected words there are at each length.
words_final['length'].value_counts()

15    6
7     3
6     3
5     2
8     2
10    2
9     2
17    1
12    1
14    1
11    1
Name: length, dtype: int64

In [70]:
# How many selected words there are at each silent-letter count.
words_final['silent_letters'].value_counts()

0    18
2     4
1     2
Name: silent_letters, dtype: int64

In [71]:
# How many selected words there are at each shared-phoneme count.
words_final['shared_phonemes'].value_counts()

0    8
1    7
2    3
4    3
3    3
Name: shared_phonemes, dtype: int64

In [72]:
# How many selected words there are at each total-difficulty score.
words_final['total_difficulty'].value_counts()

7     5
17    3
16    2
12    2
9     2
19    2
8     2
6     1
11    1
21    1
15    1
14    1
10    1
Name: total_difficulty, dtype: int64