In [1]:


import os
import re
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Tuple
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold, train_test_split



In [2]:


L2_scores = {
    "L2_spelling_skill": [0, 44], 
    "L2_vocabulary_size":[0, 100], 
    "vocab.t2.5": [0, 40],  # still not clear but assume it is 0, 100
    "L2_lexical_skill": [0, 100],
    "TOWRE_word": [0, 104],
    "TOWRE_nonword": [0, 63],
}
L2_scores



{'L2_spelling_skill': [0, 44],
 'L2_vocabulary_size': [0, 100],
 'vocab.t2.5': [0, 40],
 'L2_lexical_skill': [0, 100],
 'TOWRE_word': [0, 104],
 'TOWRE_nonword': [0, 63]}

In [3]:

for k, v in L2_scores.items():
    print(k, f"Lower={v[0]}, Upper={v[1]}")
    
    

L2_spelling_skill Lower=0, Upper=44
L2_vocabulary_size Lower=0, Upper=100
vocab.t2.5 Lower=0, Upper=40
L2_lexical_skill Lower=0, Upper=100
TOWRE_word Lower=0, Upper=104
TOWRE_nonword Lower=0, Upper=63


# Load data for native Ru

In [22]:


age_gender = pd.read_excel("../Datasets/MECO/age_gender.xlsx").dropna()


age_gender = age_gender.astype(({
    "SubjectID": str, 
    "Age": int,
    "Sex": int,
    "lang": str,
}))


print(age_gender.shape)

age_gender.head() 


(594, 4)


Unnamed: 0,SubjectID,Age,Sex,lang
0,du_4,19,0,du
1,du_5,21,0,du
2,du_6,27,0,du
3,du_7,23,1,du
4,du_8,27,0,du


In [19]:

demo = pd.read_excel("../Datasets/MECO/demo_alllang.xlsx").dropna()

demo = demo.astype({
    "subid": str,
    "SubjectID": str, 
    "lang": str, 
    "L2_spelling_skill": float,
    "L2_vocabulary_size": float,
    "vocab.t2.5": float,
    "L2_lexical_skill": float,
    "TOWRE_word":float,
    "TOWRE_nonword": float,
    "motiv": float,
    "IQ":float,
})


print(demo.shape)
demo.head()



(559, 11)


Unnamed: 0,subid,SubjectID,lang,L2_spelling_skill,L2_vocabulary_size,vocab.t2.5,L2_lexical_skill,TOWRE_word,TOWRE_nonword,motiv,IQ
0,DU_04,du_4,du,31.0,66.0,37.0,75.0,84.0,56.0,3.8,6.0
1,DU_05,du_5,du,35.0,78.0,36.0,93.75,63.0,54.0,4.0,4.0
2,DU_06,du_6,du,40.0,80.0,39.0,95.0,84.0,54.0,3.8,9.0
3,DU_07,du_7,du,33.0,50.0,36.0,66.25,65.0,45.0,3.5,9.0
4,DU_08,du_8,du,32.0,50.0,37.0,73.75,82.0,28.0,3.9,6.0


In [23]:


ru_data = pd.read_excel("../Datasets/MECO/DATA/ru_data.xlsx").dropna()
ru_data = ru_data.astype({
    "SubjectID":str, 
    "Text_ID": str, 
    "Fix_X": float, 
    "Fix_Y": float,
    "Fix_Duration": float,
    "Word_Number": int,
    "Sentence": str,
    "Language":str,
})


In [24]:


print(ru_data.shape)
ru_data.head()



(233878, 9)


Unnamed: 0,SubjectID,Text_ID,Fix_X,Fix_Y,Fix_Duration,Word_Number,Word,Sentence,Language
0,ru_10,1,138.0,352.0,299.0,77,portrait,"In 1825, Morse was i",ru
1,ru_10,1,436.0,413.0,338.0,97,ill,"In 1825, Morse was i",ru
2,ru_10,1,507.0,417.0,197.0,98,back,"In 1825, Morse was i",ru
3,ru_10,1,1072.0,411.0,111.0,105,message,The message had take,ru
4,ru_10,1,1259.0,408.0,229.0,107,taken,The message had take,ru


In [25]:


ru_data_subject_id = set(ru_data.SubjectID)



# Check ups

In [26]:


age_gender_ru = age_gender.loc[age_gender.lang == "ru"]
age_gender_ru_subject_id = set(age_gender_ru.SubjectID)
age_gender_ru



Unnamed: 0,SubjectID,Age,Sex,lang
357,rum_1,30,0,ru
358,rum_2,20,0,ru
359,rum_3,20,0,ru
360,rum_4,20,0,ru
361,rum_5,29,0,ru
...,...,...,...,...
458,ru_51,30,0,ru
459,ru_52,29,0,ru
460,ru_53,27,1,ru
461,ru_54,28,1,ru


In [27]:


demo_ru = demo.loc[demo.lang == "ru"]
demo_ru_subject_id = set(demo_ru.SubjectID)




In [33]:


print(
    f" age-gender =  {len(age_gender_ru_subject_id)} \n", 
    f"ru_demo    =  {len(demo_ru_subject_id)} \n",
    f"fixation   = {len(ru_data_subject_id)} \n", 
)




 age-gender =  104 
 ru_demo    =  94 
 fixation   = 110 



In [37]:


print(
    
    f" missing in age_gender or demo    :  {age_gender_ru_subject_id.symmetric_difference(demo_ru_subject_id)} \n",
    f"missing in age_gender or fixation: {age_gender_ru_subject_id.symmetric_difference(ru_data_subject_id)} \n",
    f"missing in fixaiton or demo      : {ru_data_subject_id.symmetric_difference(demo_ru_subject_id)} \n",
    
)  



 missing in age_gender or demo    :  {'rum_61', 'ru_33', 'ru_37', 'ru_8', 'rum_25', 'rum_59', 'ru_34', 'rum_57', 'ru_52', 'rum_66', 'rum_10', 'rum_24', 'ru_4', 'rum_47', 'ru_2', 'rum_23', 'rum_43', 'rum_60', 'ru_9', 'rum_55', 'rum_46', 'ru_56', 'rum_34_1', 'ru_32'} 
 missing in age_gender or fixation: {'rum_61', 'ru_8', 'rum_66', 'ru_4', 'ru_2', 'ru_9', 'rum_55', 'ru_56'} 
 missing in fixaiton or demo      : {'ru_33', 'ru_37', 'rum_25', 'rum_59', 'ru_34', 'rum_57', 'ru_52', 'rum_10', 'rum_24', 'rum_47', 'rum_23', 'rum_43', 'rum_60', 'rum_46', 'rum_34_1', 'ru_32'} 



In [38]:



print(
    
    f" missing in demo vs age_gender    :  {demo_ru_subject_id.difference(age_gender_ru_subject_id)} \n",
    f"missing in fixation vs age_gender:  {ru_data_subject_id.difference(age_gender_ru_subject_id)} \n",

)





 missing in demo vs age_gender    :  {'rum_61', 'ru_56', 'ru_9', 'rum_55', 'ru_4', 'ru_8', 'ru_2'} 
 missing in fixation vs age_gender:  {'rum_61', 'ru_56', 'ru_9', 'rum_55', 'ru_4', 'ru_8', 'ru_2'} 



In [39]:

print(
    
    f" missing in age_gender vs demo   :  {age_gender_ru_subject_id.difference(demo_ru_subject_id)} \n",
    f"missing in age_gender vs fixation:  {age_gender_ru_subject_id.difference(ru_data_subject_id)} \n",

)



 missing in age_gender vs demo   :  {'rum_60', 'rum_25', 'rum_57', 'rum_59', 'ru_52', 'rum_47', 'ru_33', 'ru_37', 'rum_66', 'rum_23', 'rum_43', 'rum_10', 'rum_24', 'ru_34', 'rum_34_1', 'ru_32', 'rum_46'} 
 missing in age_gender vs fixation:  {'rum_66'} 



In [40]:



print(
    
    f" missing in fixation vs demo   :  {ru_data_subject_id.difference(demo_ru_subject_id)} \n",
    f"missing in demo vs fixation   :  {demo_ru_subject_id.difference(ru_data_subject_id)} \n",

)



 missing in fixation vs demo   :  {'rum_60', 'rum_25', 'rum_57', 'rum_59', 'ru_52', 'rum_47', 'ru_33', 'ru_37', 'rum_23', 'rum_43', 'rum_10', 'rum_24', 'ru_34', 'rum_34_1', 'ru_32', 'rum_46'} 
 missing in demo vs fixation   :  set() 



## midterm conclusion:



**We will work with this subject in our further analysis**




In [42]:
subject_to_keep = age_gender_ru_subject_id.intersection(demo_ru_subject_id).intersection(ru_data_subject_id)

len(subject_to_keep)


87

In [44]:

age_gender_ru_ = age_gender_ru.loc[age_gender_ru.SubjectID.isin(subject_to_keep)]
demo_ru_ = demo_ru.loc[demo_ru.SubjectID.isin(subject_to_keep)]
ru_data_ = ru_data.loc[ru_data.SubjectID.isin(subject_to_keep)]


age_gender_ru.shape, age_gender_ru_.shape, demo_ru.shape, demo_ru_.shape, ru_data.shape, ru_data_.shape



((104, 4), (87, 4), (94, 11), (87, 11), (233878, 9), (184791, 9))

In [45]:


ru_data_.isnull()



Unnamed: 0,SubjectID,Text_ID,Fix_X,Fix_Y,Fix_Duration,Word_Number,Word,Sentence,Language
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
236684,False,False,False,False,False,False,False,False,False
236685,False,False,False,False,False,False,False,False,False
236686,False,False,False,False,False,False,False,False,False
236687,False,False,False,False,False,False,False,False,False


In [46]:


ru_data_.isnull().sum()




SubjectID       0
Text_ID         0
Fix_X           0
Fix_Y           0
Fix_Duration    0
Word_Number     0
Word            0
Sentence        0
Language        0
dtype: int64

In [47]:



demo_ru_.isnull().sum()




subid                 0
SubjectID             0
lang                  0
L2_spelling_skill     0
L2_vocabulary_size    0
vocab.t2.5            0
L2_lexical_skill      0
TOWRE_word            0
TOWRE_nonword         0
motiv                 0
IQ                    0
dtype: int64

In [48]:


age_gender_ru_.isnull().sum()



SubjectID    0
Age          0
Sex          0
lang         0
dtype: int64

In [49]:

def convert_range(x_old, old_min, old_max, new_min, new_max):
    
    x_new = ((new_max - new_min)*(x_old - old_min))/(old_max - old_min)+ new_min
    
    return x_new 




In [50]:


convert_range(2.5, 0, 5, 2, 20, )




11.0

In [60]:


def concat_dfs(df1, df2, features1, features2):

    """ concatenates df2 to df1, that is, it casts df2's dimensions df1. """

    data = []
    subject_ids = df2.SubjectID
    for subject_id in subject_ids:
#         print("subject_id:", subject_id)
        tmp1 = df1.loc[(df1.SubjectID == subject_id)]
        tmp1 = tmp1.loc[:, features1].reset_index(drop=True)
        tmp2 = df2.loc[df2.SubjectID == subject_id]
        tmp2 = tmp2.loc[:, features2]

        n = tmp1.shape[0]
        if n == tmp2.shape[0]:
            tmp2 = pd.concat([tmp1], ignore_index=True)
        else:
            tmp2 = pd.concat([tmp2] * n, ignore_index=True)  # .reset_index(drop=True)

        tmp3 = pd.concat([tmp1, tmp2], axis=1, )

        if tmp3.shape[0] != tmp1.shape[0] or tmp3.shape[0] != tmp2.shape[0]:
            print(
                subject_id,
                "in consistencies in number of observations (rows)"
            )

        if tmp3.shape[1] != tmp1.shape[1] + tmp2.shape[1]:
            print(
                subject_id,
                "inconsistencies in feature space (columns)"
            )

        data.append(tmp3)

    return pd.concat(data)



In [61]:


df1 = demo_ru
df2 = age_gender_ru

features1 = list(demo_ru.columns[1:])
features2=["SubjectID", "Age", "Sex"]


In [62]:

data = []
subject_ids = df2.SubjectID
for subject_id in subject_ids:
#     print("subject_id:", subject_id)
    tmp1 = df1.loc[(df1.SubjectID == subject_id)]
    tmp1 = tmp1.loc[:, features1].reset_index(drop=True)
    tmp2 = df2.loc[df2.SubjectID == subject_id]
    tmp2 = tmp2.loc[:, features2]

    n = tmp1.shape[0]


In [66]:



demo_ru_concat = concat_dfs(df1=demo_ru_, df2=age_gender_ru_, 
                         features1=features1, 
                         features2=features2
)




In [67]:
demo_ru_concat

Unnamed: 0,SubjectID,lang,L2_spelling_skill,L2_vocabulary_size,vocab.t2.5,L2_lexical_skill,TOWRE_word,TOWRE_nonword,motiv,IQ,SubjectID.1,lang.1,L2_spelling_skill.1,L2_vocabulary_size.1,vocab.t2.5.1,L2_lexical_skill.1,TOWRE_word.1,TOWRE_nonword.1,motiv.1,IQ.1
0,rum_1,ru,39.0,71.0,36.0,81.25,79.0,57.0,3.7,7.0,rum_1,ru,39.0,71.0,36.0,81.25,79.0,57.0,3.7,7.0
0,rum_2,ru,35.0,85.0,40.0,90.00,74.0,44.0,4.0,10.0,rum_2,ru,35.0,85.0,40.0,90.00,74.0,44.0,4.0,10.0
0,rum_3,ru,39.0,85.0,38.0,96.25,83.0,58.0,2.4,7.0,rum_3,ru,39.0,85.0,38.0,96.25,83.0,58.0,2.4,7.0
0,rum_4,ru,38.0,72.0,39.0,90.00,78.0,43.0,3.2,5.0,rum_4,ru,38.0,72.0,39.0,90.00,78.0,43.0,3.2,5.0
0,rum_5,ru,44.0,76.0,38.0,86.25,87.0,59.0,3.6,8.0,rum_5,ru,44.0,76.0,38.0,86.25,87.0,59.0,3.6,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,ru_50,ru,37.0,47.0,33.0,87.50,72.0,37.0,3.7,9.0,ru_50,ru,37.0,47.0,33.0,87.50,72.0,37.0,3.7,9.0
0,ru_51,ru,33.0,39.0,33.0,71.25,83.0,53.0,4.1,9.0,ru_51,ru,33.0,39.0,33.0,71.25,83.0,53.0,4.1,9.0
0,ru_53,ru,36.0,32.0,26.0,77.50,73.0,45.0,4.3,11.0,ru_53,ru,36.0,32.0,26.0,77.50,73.0,45.0,4.3,11.0
0,ru_54,ru,38.0,48.0,35.0,70.00,76.0,52.0,3.8,8.0,ru_54,ru,38.0,48.0,35.0,70.00,76.0,52.0,3.8,8.0


In [73]:


fix_demo = concat_dfs(
    df1=demo_ru_concat, df2=ru_data_, 
    features1=list(demo_ru_concat.columns), 
    features2=list(ru_data_.columns)
)



ValueError: Cannot index with multidimensional key

In [71]:


list(ru_data_.columns)

['SubjectID',
 'Text_ID',
 'Fix_X',
 'Fix_Y',
 'Fix_Duration',
 'Word_Number',
 'Word',
 'Sentence',
 'Language']