## Analyze word difficulty

### Set up

In [56]:
from pathlib import Path

from wdiff.analyzer import Analyzer
import pandas as pd

In [57]:
# Folder holding the cleaned word lists, relative to this notebook.
path_words = Path(
    '..',
    'Instruments and materials',
    'Data Collection',
    'Words',
    'Cleaned words',
)

In [58]:
# Load the cleaned word list as a Series.
# `read_csv(..., squeeze=True)` is deprecated since pandas 1.4 and removed in
# pandas 2.0, so the single-column frame is squeezed explicitly instead.
words = pd.read_csv(path_words / '3a-words_analyzed_cleaned_meaning.csv').squeeze('columns')
words.head()



  words = pd.read_csv(path_words / '3a-words_analyzed_cleaned_meaning.csv', squeeze=True)


0      abarcar
1      abjurar
2     ablandar
3     abolidos
4    abolieren
Name: word, dtype: object

In [59]:
# Analyze the word list with the project's `wdiff` Analyzer.
# NOTE(review): presumably `run_all_analyses()` is what populates
# `analyzer.results` — confirm in wdiff.analyzer.
analyzer = Analyzer(words)
analyzer.run_all_analyses()
# Keep the results table; the word-selection cells below query and sample it.
words_analyzed = analyzer.results
words_analyzed.head()

                                    word_objs         text  length  \
0     <wdiff.word.Word object at 0x11eb29ac0>      abarcar       7   
1     <wdiff.word.Word object at 0x11eb29c40>      abjurar       7   
2     <wdiff.word.Word object at 0x11eb29c10>     ablandar       8   
3     <wdiff.word.Word object at 0x11eb29c70>     abolidos       8   
4     <wdiff.word.Word object at 0x11eb29cd0>    abolieren       9   
...                                       ...          ...     ...   
2052  <wdiff.word.Word object at 0x11ebda310>  disciplinal      11   
2053  <wdiff.word.Word object at 0x11ebda370>      incluso       7   
2054  <wdiff.word.Word object at 0x11ebda3d0>        lista       5   
2055  <wdiff.word.Word object at 0x11ebda430>        mecer       5   
2056  <wdiff.word.Word object at 0x11ebda490>       puesto       6   

      silent_letters  shared_phonemes  total_difficulty  
0                  0                2                 9  
1                  0                2      

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
0,abarcar,7,0,2,9
1,abjurar,7,0,2,9
2,ablandar,8,0,1,9
3,abolidos,8,0,2,10
4,abolieren,9,0,1,10


## Select words based on difficulty

- There will be 24 words
- The word list must have different levels of each characteristic
- The words should be "close" to equally distributed


The process is completely reproducible: every sample is drawn with a fixed `random_state`.

General function for getting words by threshold

In [60]:
def sample_words_by_threshold(var, threshold, direction, n=4, random_state=1, data=None):
    """Draw a reproducible random sample of words selected by a comparison.

    Rows are kept when ``<var> <direction> threshold`` holds; ``n`` of the
    matching rows are then sampled with a fixed seed so that re-running the
    notebook yields the same words.

    Parameters
    ----------
    var : str
        Name of the column to compare (e.g. ``'length'``).
    threshold : scalar
        Value the column is compared against.
    direction : str
        Comparison operator placed between the column and the threshold.
        Must be one of ``<``, ``<=``, ``>``, ``>=``, ``==``, ``!=``, ``in``,
        ``not in``.
    n : int, default 4
        Number of words to sample.
    random_state : int, default 1
        Seed passed to ``DataFrame.sample``.
    data : pandas.DataFrame, optional
        Table to sample from. Defaults to the notebook-level
        ``words_analyzed`` frame.

    Returns
    -------
    pandas.DataFrame
        The ``n`` sampled rows, keeping their original index labels.

    Raises
    ------
    ValueError
        If ``direction`` is not a supported operator, or fewer than ``n``
        rows satisfy the condition.
    """
    allowed_directions = ('<', '<=', '>', '>=', '==', '!=', 'in', 'not in')
    # `direction` is interpolated into an evaluated query expression, so only
    # let known comparison operators through.
    if direction not in allowed_directions:
        raise ValueError(
            f'direction must be one of {allowed_directions}, got {direction!r}'
        )

    source = words_analyzed if data is None else data

    # `@threshold` is resolved by pandas from this function's local variables.
    candidates = source.query(f'{var} {direction} @threshold')

    # Fail with an explicit message instead of the generic error raised by
    # `sample` when the population is smaller than the requested sample.
    if len(candidates) < n:
        raise ValueError(
            f'only {len(candidates)} rows satisfy '
            f'"{var} {direction} {threshold!r}"; cannot sample {n}'
        )

    return candidates.sample(n, random_state=random_state)

### Length

Options:
- Above and below average
- Below average and maximum
- Below the median (q < .5) and above the 95th percentile (q > .95) — the option used below

In [61]:
# Long words: strictly above the 95th percentile of word length.
threshold_high = words_analyzed['length'].quantile(q=.95)
words_length_high = sample_words_by_threshold(
    var='length',
    threshold=threshold_high,
    direction='>',
)

# double check
words_length_high

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
742,gravitomagnetismo,17,0,2,19
1712,introspectivamente,18,0,2,20
460,descuidadamente,15,0,2,17
1996,acuartelamiento,15,0,1,16


In [62]:
# Short words: strictly below the median length
# (q=0.5 is what `quantile()` uses when called without arguments).
threshold_low = words_analyzed['length'].quantile(q=0.5)
words_length_low = sample_words_by_threshold(
    var='length',
    threshold=threshold_low,
    direction='<',
)

words_length_low

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
1945,regador,7,0,0,7
1165,premio,6,0,0,6
158,asador,6,0,1,7
406,cuenta,6,0,1,7


### Silent letters
- min and max 

In [63]:
# Most silent letters: words whose count equals the maximum observed.
threshold_high = words_analyzed['silent_letters'].max()

words_silent_high = sample_words_by_threshold(
    var='silent_letters',
    threshold=threshold_high,
    direction='==',
)

words_silent_high

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
1213,quebrantahuesos,15,2,4,21
777,horqueta,8,2,1,11
764,hinque,6,2,1,9
775,hormiguero,10,2,0,12


In [64]:
# No silent letters: words whose count is exactly 0.
words_silent_low = sample_words_by_threshold(
    var='silent_letters',
    threshold=0,
    direction='==',
)

words_silent_low

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
51,afanar,6,0,0,6
1376,sobrentendido,13,0,2,15
1251,reencarnar,10,0,1,11
1015,novedoso,8,0,2,10


### Shared phonemes
- Low: no shared phonemes (0); high: above the 90th percentile (q > .9)

In [65]:
# Many shared phonemes: strictly above the 90th percentile.
threshold_high = words_analyzed['shared_phonemes'].quantile(q=.9)
words_shared_phonemes_high = sample_words_by_threshold(
    var='shared_phonemes',
    threshold=threshold_high,
    direction='>',
)

words_shared_phonemes_high

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
609,esbeltez,8,0,3,11
614,escoger,7,0,3,10
1775,despresurizar,13,0,3,16
859,jueves,6,0,3,9


In [66]:
# No shared phonemes: words whose count is exactly 0.
words_shared_phonemes_low = sample_words_by_threshold(
    var='shared_phonemes',
    threshold=0,
    direction='==',
)

words_shared_phonemes_low

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
997,mutuo,5,0,0,5
1469,toldo,5,0,0,5
1147,porteador,9,0,0,9
1120,plaga,5,0,0,5


### Integrate words into final word pool

In [67]:
# Stack the six per-characteristic samples into the final word pool.
words_final = pd.concat(
    [
        words_length_high,
        words_length_low,
        words_silent_high,
        words_silent_low,
        words_shared_phonemes_high,
        words_shared_phonemes_low,
    ]
)

# The samples are drawn independently from overlapping subsets, so the same
# row could be selected more than once; the shape alone would not reveal it.
# concat keeps the original row labels, so a repeated label means a repeated word row.
assert words_final.index.is_unique, 'duplicate words in the final word pool'

# double check
print('# words', words_final.shape)
words_final

# words (24, 5)


Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
742,gravitomagnetismo,17,0,2,19
1712,introspectivamente,18,0,2,20
460,descuidadamente,15,0,2,17
1996,acuartelamiento,15,0,1,16
1945,regador,7,0,0,7
1165,premio,6,0,0,6
158,asador,6,0,1,7
406,cuenta,6,0,1,7
1213,quebrantahuesos,15,2,4,21
777,horqueta,8,2,1,11
