## Analyze word difficulty

### Set up

In [1]:
from pathlib import Path

from wdiff.analyzer import Analyzer
import pandas as pd

In [2]:
path_words = Path('../Instruments and materials/Data Collection/Words/Cleaned words')

In [3]:
words = pd.read_csv(path_words / '3a-words_analyzed_cleaned_meaning.csv', squeeze=True)
words.head()



  words = pd.read_csv(path_words / '3a-words_analyzed_cleaned_meaning.csv', squeeze=True)


0      abarcar
1      abjurar
2     ablandar
3     abolidos
4    abolieren
Name: word, dtype: object

In [4]:
analyzer = Analyzer(words)
analyzer.run_all_analyses()
words_analyzed = analyzer.results
words_analyzed.head()

                                    word_objs         text  length  \
0     <wdiff.word.Word object at 0x122185d60>      abarcar       7   
1     <wdiff.word.Word object at 0x122185e20>      abjurar       7   
2     <wdiff.word.Word object at 0x122185e80>     ablandar       8   
3     <wdiff.word.Word object at 0x122185ee0>     abolidos       8   
4     <wdiff.word.Word object at 0x122185f40>    abolieren       9   
...                                       ...          ...     ...   
2052  <wdiff.word.Word object at 0x122250b80>  disciplinal      11   
2053  <wdiff.word.Word object at 0x122250be0>      incluso       7   
2054  <wdiff.word.Word object at 0x122250c40>        lista       5   
2055  <wdiff.word.Word object at 0x122250ca0>        mecer       5   
2056  <wdiff.word.Word object at 0x122250d00>       puesto       6   

      silent_letters  shared_phonemes  total_difficulty  
0                  0                2                 9  
1                  0                2      

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
0,abarcar,7,0,2,9
1,abjurar,7,0,2,9
2,ablandar,8,0,1,9
3,abolidos,8,0,2,10
4,abolieren,9,0,1,10


## Select words based on difficulty

- There will be 24 words
- The word list must have different levels of each characteristic
- The words should be "close" to equally distributed


The process is completely reproducible

General function for getting words by threshold

In [5]:
def sample_words_by_threshold(var, threshold, direction):
    _words = (
        words_analyzed
        .query(f'{var} {direction} @threshold')
        .sample(4, random_state=1)
    )
    return _words

### Length

Options: 
- Above and below average
- below average, max
- q < .5 , q > .95

In [6]:
threshold_high = words_analyzed.length.quantile(.95)
words_length_high = sample_words_by_threshold('length', threshold_high, '>')

# double check
words_length_high

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
742,gravitomagnetismo,17,0,2,19
1712,introspectivamente,18,0,2,20
460,descuidadamente,15,0,2,17
1996,acuartelamiento,15,0,1,16


In [7]:
threshold_low = words_analyzed.length.quantile()
words_length_low = sample_words_by_threshold('length', threshold_low, '<')

words_length_low

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
1945,regador,7,0,0,7
1165,premio,6,0,0,6
158,asador,6,0,1,7
406,cuenta,6,0,1,7


### Silent letters
- min and max 

In [8]:
threshold_high = words_analyzed.silent_letters.max()

words_silent_high = sample_words_by_threshold('silent_letters', threshold_high, '==')

words_silent_high

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
1213,quebrantahuesos,15,2,4,21
777,horqueta,8,2,1,11
764,hinque,6,2,1,9
775,hormiguero,10,2,0,12


In [9]:
threshold_high = words_analyzed.silent_letters.max()
words_silent_low = sample_words_by_threshold('silent_letters', threshold_high, '<')

words_silent_low

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
406,cuenta,6,0,1,7
1724,esclarecidamente,16,0,2,18
331,condicionalmente,16,0,2,18
102,amalgamador,11,0,0,11


### Shared phonemes
- min and max

In [10]:
threshold_high = words_analyzed.shared_phonemes.quantile(.9)
words_shared_phonemes_high = sample_words_by_threshold('shared_phonemes', threshold_high, '>')

words_shared_phonemes_high

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
609,esbeltez,8,0,3,11
614,escoger,7,0,3,10
1775,despresurizar,13,0,3,16
859,jueves,6,0,3,9


In [11]:
words_shared_phonemes_low = sample_words_by_threshold('shared_phonemes', 0, '==')

words_shared_phonemes_low

Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
997,mutuo,5,0,0,5
1469,toldo,5,0,0,5
1147,porteador,9,0,0,9
1120,plaga,5,0,0,5


### Integrate words into final word pool

In [12]:
words_final = pd.concat(
    [
        words_length_high,
        words_length_low,
        words_silent_high,
        words_silent_low,
        words_shared_phonemes_high,
        words_shared_phonemes_low,
    ]
)

# double check
print('# words', words_final.shape)
words_final

# words (24, 5)


Unnamed: 0,text,length,silent_letters,shared_phonemes,total_difficulty
742,gravitomagnetismo,17,0,2,19
1712,introspectivamente,18,0,2,20
460,descuidadamente,15,0,2,17
1996,acuartelamiento,15,0,1,16
1945,regador,7,0,0,7
1165,premio,6,0,0,6
158,asador,6,0,1,7
406,cuenta,6,0,1,7
1213,quebrantahuesos,15,2,4,21
777,horqueta,8,2,1,11


## Check difficulty of word pool

Use frequencies

In [13]:
words_final.length.value_counts()

6     6
15    3
5     3
7     2
8     2
16    2
17    1
18    1
10    1
11    1
13    1
9     1
Name: length, dtype: int64

In [14]:
words_final.silent_letters.value_counts()

0    20
2     4
Name: silent_letters, dtype: int64

In [15]:
words_final.shared_phonemes.value_counts()

0    8
1    6
2    5
3    4
4    1
Name: shared_phonemes, dtype: int64