In [1]:


import os
import re
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Tuple
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold, train_test_split



In [2]:



def convert_range(x_old, old_min, old_max, new_min, new_max):
    """Linearly map ``x_old`` from [old_min, old_max] onto [new_min, new_max].

    Computes ``(new_max - new_min) * (x_old - old_min) / (old_max - old_min) + new_min``.
    The input is not clamped, and the division by ``old_max - old_min`` is not
    guarded against a zero-width source range.
    """
    new_span = new_max - new_min
    old_span = old_max - old_min
    offset = x_old - old_min
    return (new_span * offset) / old_span + new_min




In [3]:
# Sanity check: 2.5 is the midpoint of [0, 5], so the result should be the midpoint of [3, 20].
convert_range(x_old=2.5, old_min=0, old_max=5, new_min=3, new_max=20)



11.5

In [4]:




def concat_dfs(df1, df2, features1, features2):
    """Attach the per-subject columns of ``df2`` to the rows of ``df1``.

    For every distinct ``SubjectID`` found in ``df2``, the ``features1`` columns
    of the matching ``df1`` rows are placed side by side with the ``features2``
    columns of the matching ``df2`` rows. When the two selections differ in
    length, the ``df2`` selection is repeated once per ``df1`` row, so a single
    ``df2`` row is broadcast over all rows of that subject in ``df1``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Both frames must contain a ``SubjectID`` column.
    features1, features2 : list
        Column labels to keep from ``df1`` and ``df2`` respectively.

    Returns
    -------
    pandas.DataFrame
        The per-subject blocks stacked in order of first appearance in ``df2``.
        Every block keeps its own 0-based index, so index labels repeat across
        subjects. If no subject produced any rows, an empty frame with the
        requested columns is returned.

    Notes
    -----
    Shape mismatches are reported with ``print`` instead of being raised.
    Subjects listed in ``df2`` that have no row in ``df1`` are reported and
    skipped.
    """
    blocks = []

    # Visit each subject once, even when df2 lists it on several rows;
    # otherwise the same block would be emitted once per occurrence.
    for subject_id in df2.SubjectID.drop_duplicates():
        tmp1 = df1.loc[df1.SubjectID == subject_id, features1].reset_index(drop=True)
        tmp2 = df2.loc[df2.SubjectID == subject_id, features2]

        n = tmp1.shape[0]
        if n == 0:
            # pd.concat([tmp2] * 0) would raise "No objects to concatenate".
            print(subject_id, "has no rows in df1; skipped")
            continue

        # Equal lengths: align row by row. Otherwise repeat tmp2 once per df1 row.
        repeats = 1 if n == tmp2.shape[0] else n
        tmp2 = pd.concat([tmp2] * repeats, ignore_index=True)

        tmp3 = pd.concat([tmp1, tmp2], axis=1)

        if tmp3.shape[0] != tmp1.shape[0] or tmp3.shape[0] != tmp2.shape[0]:
            print(
                subject_id,
                "in consistencies in number of observations (rows)"
            )

        if tmp3.shape[1] != tmp1.shape[1] + tmp2.shape[1]:
            print(
                subject_id,
                "inconsistencies in feature space (columns)"
            )

        blocks.append(tmp3)

    if not blocks:
        # pd.concat([]) raises ValueError; return an empty, correctly shaped frame instead.
        return pd.DataFrame(columns=list(features1) + list(features2))

    return pd.concat(blocks)



In [5]:

# [lower, upper] bound of every L2 test score, taken from the dataset description
# and from consulting Olga P. and Victor (1st author of the paper).
L2_scores = {
    "L2_spelling_skill": [0, 44],
    "L2_vocabulary_size": [0, 100],
    # NOTE(review): the true range of this score is still unclear. [0, 40] is the
    # value used here, although an earlier note assumed [0, 100] -- TODO confirm.
    "vocab.t2.5": [0, 40],
    "L2_lexical_skill": [0, 100],
    "TOWRE_word": [0, 104],
    "TOWRE_nonword": [0, 63],
}
L2_scores



{'L2_spelling_skill': [0, 44],
 'L2_vocabulary_size': [0, 100],
 'vocab.t2.5': [0, 40],
 'L2_lexical_skill': [0, 100],
 'TOWRE_word': [0, 104],
 'TOWRE_nonword': [0, 63]}

# Load data for native Ru

In [6]:


# Load the age/sex table and discard every row that contains a missing value.
age_gender = pd.read_excel("../Datasets/MECO/age_gender.xlsx")
age_gender = age_gender.dropna()

print(age_gender.shape)
age_gender.head()


(594, 4)


Unnamed: 0,SubjectID,Age,Sex,lang
0,du_4,19,0.0,du
1,du_5,21,0.0,du
2,du_6,27,0.0,du
3,du_7,23,1.0,du
4,du_8,27,0.0,du


In [7]:

# Sanity check: report every row whose SubjectID / Age / lang value does not have
# the expected Python type, so that dtype inconsistencies surface early.
# NumPy integer scalars are accepted as integers as well: iterrows() does not
# preserve column dtypes, and depending on the frame it may yield np.int64
# values instead of built-in ints.
for index, row in age_gender.iterrows():

    if not (
        isinstance(row['SubjectID'], str) and
        isinstance(row['Age'], (int, np.integer)) and
        isinstance(row['lang'], str)
    ):

        print(
            "Inconsistencies! \n"
            f"index:{index} \t SubjectID:{row['SubjectID'], type(row['SubjectID'])}  "
            f"Age:{row['Age'], type(row['Age'])}   Lang:{row['lang'], type(row['lang'])}"
        )
    

In [8]:

# Make the dtypes explicit: identifiers as str, Age and Sex as int.
age_gender = age_gender.astype({
    **dict.fromkeys(["SubjectID", "lang"], str),
    **dict.fromkeys(["Age", "Sex"], int),
})


In [9]:

# Load the per-subject test scores and discard every row that contains a missing value.
demo = pd.read_excel("../Datasets/MECO/demo_alllang.xlsx")
demo = demo.dropna()

print(demo.shape)
demo.head()



(559, 11)


Unnamed: 0,subid,SubjectID,lang,L2_spelling_skill,L2_vocabulary_size,vocab.t2.5,L2_lexical_skill,TOWRE_word,TOWRE_nonword,motiv,IQ
0,DU_04,du_4,du,31.0,66.0,37.0,75.0,84.0,56.0,3.8,6.0
1,DU_05,du_5,du,35.0,78.0,36.0,93.75,63.0,54.0,4.0,4.0
2,DU_06,du_6,du,40.0,80.0,39.0,95.0,84.0,54.0,3.8,9.0
3,DU_07,du_7,du,33.0,50.0,36.0,66.25,65.0,45.0,3.5,9.0
4,DU_08,du_8,du,32.0,50.0,37.0,73.75,82.0,28.0,3.9,6.0


In [10]:


# Sanity check: report every row of `demo` in which an identifier is not a str
# or one of the score columns is not a float.
for index, row in demo.iterrows():

    if all(isinstance(row[col], str) for col in ('SubjectID', 'lang')) and all(
        isinstance(row[col], float)
        for col in (
            'L2_spelling_skill', 'L2_vocabulary_size', 'vocab.t2.5',
            'L2_lexical_skill', 'TOWRE_word', 'TOWRE_nonword', 'motiv', 'IQ',
        )
    ):
        continue

    print(
        f"Inconsistencies in index:{index} \t SubjectID:{row['SubjectID']}",
        row
    )
    


In [11]:

# Make the dtypes explicit: identifiers as str, every score column as float.
demo = demo.astype({
    **dict.fromkeys(["subid", "SubjectID", "lang"], str),
    **dict.fromkeys(
        [
            "L2_spelling_skill", "L2_vocabulary_size", "vocab.t2.5",
            "L2_lexical_skill", "TOWRE_word", "TOWRE_nonword", "motiv", "IQ",
        ],
        float,
    ),
})


### Rescaling all L2* scores


In [12]:


# Show the [lower, upper] range of every L2 score before mapping them onto a common scale.
for score_name, (lower, upper) in L2_scores.items():
    print(score_name, f"Lower={lower}, Upper={upper}")
    

L2_spelling_skill Lower=0, Upper=44
L2_vocabulary_size Lower=0, Upper=100
vocab.t2.5 Lower=0, Upper=40
L2_lexical_skill Lower=0, Upper=100
TOWRE_word Lower=0, Upper=104
TOWRE_nonword Lower=0, Upper=63


In [13]:


# Map every L2 score from its own [lower, upper] range onto a common 0-5 scale.
# NOTE: the columns are overwritten in place, so running this cell a second time
# rescales the already-rescaled values; reload `demo` before re-running it.
for score_name, (lower, upper) in L2_scores.items():
    demo[score_name] = demo[score_name].apply(convert_range, args=(lower, upper, 0, 5))

    

In [14]:

# Inspect the table after the L2 score columns have been rescaled.
demo



Unnamed: 0,subid,SubjectID,lang,L2_spelling_skill,L2_vocabulary_size,vocab.t2.5,L2_lexical_skill,TOWRE_word,TOWRE_nonword,motiv,IQ
0,DU_04,du_4,du,3.522727,3.30,4.625,3.7500,4.038462,4.444444,3.8,6.0
1,DU_05,du_5,du,3.977273,3.90,4.500,4.6875,3.028846,4.285714,4.0,4.0
2,DU_06,du_6,du,4.545455,4.00,4.875,4.7500,4.038462,4.285714,3.8,9.0
3,DU_07,du_7,du,3.750000,2.50,4.500,3.3125,3.125000,3.571429,3.5,9.0
4,DU_08,du_8,du,3.636364,2.50,4.625,3.6875,3.942308,2.222222,3.9,6.0
...,...,...,...,...,...,...,...,...,...,...,...
601,rum_67,rum_67,ru,4.545455,3.45,4.750,4.6875,4.230769,4.126984,4.0,11.0
602,rum_68,rum_68,ru,4.431818,3.25,4.625,3.8325,3.894231,3.650794,3.8,11.0
603,rum_69,rum_69,ru,4.204545,3.30,4.625,3.4375,4.038462,4.047619,3.6,9.0
604,rum_70,rum_70,ru,4.431818,2.00,4.375,3.6875,3.557692,3.253968,3.5,8.0


In [15]:

# Target value: the per-subject mean of all rescaled L2 scores.
cols = list(L2_scores)
demo["Target_Ave"] = demo.loc[:, cols].apply(np.average, axis=1)

# Class label: the mean rounded to an integer. np.round sends exact .5 ties to
# the nearest even value.
demo["Target_Label"] = demo["Target_Ave"].apply(np.round).values
demo



Unnamed: 0,subid,SubjectID,lang,L2_spelling_skill,L2_vocabulary_size,vocab.t2.5,L2_lexical_skill,TOWRE_word,TOWRE_nonword,motiv,IQ,Target_Ave,Target_Label
0,DU_04,du_4,du,3.522727,3.30,4.625,3.7500,4.038462,4.444444,3.8,6.0,3.946772,4.0
1,DU_05,du_5,du,3.977273,3.90,4.500,4.6875,3.028846,4.285714,4.0,4.0,4.063222,4.0
2,DU_06,du_6,du,4.545455,4.00,4.875,4.7500,4.038462,4.285714,3.8,9.0,4.415772,4.0
3,DU_07,du_7,du,3.750000,2.50,4.500,3.3125,3.125000,3.571429,3.5,9.0,3.459821,3.0
4,DU_08,du_8,du,3.636364,2.50,4.625,3.6875,3.942308,2.222222,3.9,6.0,3.435566,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,rum_67,rum_67,ru,4.545455,3.45,4.750,4.6875,4.230769,4.126984,4.0,11.0,4.298451,4.0
602,rum_68,rum_68,ru,4.431818,3.25,4.625,3.8325,3.894231,3.650794,3.8,11.0,3.947390,4.0
603,rum_69,rum_69,ru,4.204545,3.30,4.625,3.4375,4.038462,4.047619,3.6,9.0,3.942188,4.0
604,rum_70,rum_70,ru,4.431818,2.00,4.375,3.6875,3.557692,3.253968,3.5,8.0,3.550996,4.0


In [40]:


# Load the fixation-level records and discard every row that contains a missing value.
ru_data = pd.read_excel("../Datasets/MECO/DATA/ru_data.xlsx")
ru_data = ru_data.dropna()


In [41]:


# Size of the fixation table after dropping incomplete rows, plus a preview of its first rows.
print(ru_data.shape)
ru_data.head()




(233878, 9)


Unnamed: 0,SubjectID,Text_ID,Fix_X,Fix_Y,Fix_Duration,Word_Number,Word,Sentence,Language
0,ru_10,1,138,352,299,77.0,portrait,"In 1825, Morse was i",ru
1,ru_10,1,436,413,338,97.0,ill,"In 1825, Morse was i",ru
2,ru_10,1,507,417,197,98.0,back,"In 1825, Morse was i",ru
3,ru_10,1,1072,411,111,105.0,message,The message had take,ru
4,ru_10,1,1259,408,229,107.0,taken,The message had take,ru


In [43]:



# Sanity check on the fixation table: report every row in which a column value
# does not have the expected Python type, followed by the outcome of each
# individual check (one per line, in the order of the condition below).
# Integer columns also accept NumPy integer scalars, because iterrows() does not
# preserve column dtypes and may yield np.int64 instead of built-in ints.
# NOTE: iterrows() walks the table row by row, which is slow on a frame of this size.
for index, row in ru_data.iterrows():

    if not (
        isinstance(row['SubjectID'], str) and
        isinstance(row['Text_ID'], (int, np.integer)) and
        isinstance(row['Fix_X'], (int, np.integer)) and
        isinstance(row['Fix_Y'], (int, np.integer)) and
        isinstance(row['Fix_Duration'], (int, np.integer)) and
        isinstance(row['Word'], str) and
        isinstance(row['Sentence'], str) and
        isinstance(row['Language'], str)
    ):

        print(
            "Inconsistencies! \n"
            f"index:{index} \t SubjectID:{row['SubjectID'],} \n",
            isinstance(row['SubjectID'], str), "\n",
            isinstance(row['Text_ID'], (int, np.integer)), "\n",
            isinstance(row['Fix_X'], (int, np.integer)), "\n",
            isinstance(row['Fix_Y'], (int, np.integer)), "\n",
            isinstance(row['Fix_Duration'], (int, np.integer)), "\n",
            isinstance(row['Word'], str), "\n",
            isinstance(row['Sentence'], str), "\n",
            isinstance(row['Language'], str), "\n",
        )
    



In [44]:



# Make the dtypes explicit: text columns as str, coordinates, durations and
# counters as int (Word_Number is cast to int here as well).
ru_data = ru_data.astype({
    **dict.fromkeys(["SubjectID", "Sentence", "Language"], str),
    **dict.fromkeys(
        ["Text_ID", "Fix_X", "Fix_Y", "Fix_Duration", "Word_Number"], int
    ),
})



In [45]:


# Distinct subjects that have at least one fixation record.
ru_data_subject_id = set(ru_data["SubjectID"])



# Check ups

In [46]:


# Restrict the age/sex table to rows whose lang is "ru" and collect their subject IDs.
is_ru_age_gender = age_gender["lang"] == "ru"
age_gender_ru = age_gender.loc[is_ru_age_gender]
age_gender_ru_subject_id = set(age_gender_ru["SubjectID"])
age_gender_ru



Unnamed: 0,SubjectID,Age,Sex,lang
357,rum_1,30,0,ru
358,rum_2,20,0,ru
359,rum_3,20,0,ru
360,rum_4,20,0,ru
361,rum_5,29,0,ru
...,...,...,...,...
458,ru_51,30,0,ru
459,ru_52,29,0,ru
460,ru_53,27,1,ru
461,ru_54,28,1,ru


In [47]:


# Restrict the score table to rows whose lang is "ru" and collect their subject IDs.
demo_ru = demo.loc[demo["lang"] == "ru"]
demo_ru_subject_id = set(demo_ru["SubjectID"])




In [48]:


# Number of distinct "ru" subjects found in each of the three sources.
print(
    f" age-gender =  {len(age_gender_ru_subject_id)} \n", 
    f"ru_demo    =  {len(demo_ru_subject_id)} \n",
    f"fixation   = {len(ru_data_subject_id)} \n", 
)




 age-gender =  104 
 ru_demo    =  94 
 fixation   = 110 



In [49]:


# Symmetric differences: on each line, the IDs that occur in exactly one of the
# two sources named in the label.
print(
    f" missing in age_gender or demo    :  {age_gender_ru_subject_id ^ demo_ru_subject_id} \n",
    f"missing in age_gender or fixation: {age_gender_ru_subject_id ^ ru_data_subject_id} \n",
    f"missing in fixaiton or demo      : {ru_data_subject_id ^ demo_ru_subject_id} \n",
)



 missing in age_gender or demo    :  {'rum_23', 'rum_46', 'rum_66', 'rum_10', 'rum_47', 'ru_8', 'ru_52', 'ru_32', 'ru_37', 'rum_34_1', 'ru_34', 'rum_59', 'rum_25', 'rum_57', 'ru_56', 'ru_2', 'ru_4', 'rum_61', 'rum_24', 'ru_9', 'rum_60', 'rum_43', 'rum_55', 'ru_33'} 
 missing in age_gender or fixation: {'rum_66', 'ru_8', 'ru_56', 'ru_2', 'ru_4', 'rum_61', 'ru_9', 'rum_55'} 
 missing in fixaiton or demo      : {'rum_23', 'rum_46', 'rum_10', 'rum_47', 'ru_52', 'ru_32', 'ru_37', 'rum_34_1', 'ru_34', 'rum_59', 'rum_25', 'rum_57', 'rum_24', 'rum_60', 'rum_43', 'ru_33'} 



In [50]:



# One-sided differences: IDs that are present in demo (first line) or in the
# fixation data (second line) but absent from the age/gender table.
print(
    f" missing in demo vs age_gender    :  {demo_ru_subject_id - age_gender_ru_subject_id} \n",
    f"missing in fixation vs age_gender:  {ru_data_subject_id - age_gender_ru_subject_id} \n",
)





 missing in demo vs age_gender    :  {'rum_61', 'ru_56', 'ru_2', 'rum_55', 'ru_8', 'ru_4', 'ru_9'} 
 missing in fixation vs age_gender:  {'rum_61', 'ru_56', 'ru_2', 'rum_55', 'ru_8', 'ru_4', 'ru_9'} 



In [51]:

# One-sided differences: IDs that are present in the age/gender table but absent
# from demo (first line) or from the fixation data (second line).
print(
    f" missing in age_gender vs demo   :  {age_gender_ru_subject_id - demo_ru_subject_id} \n",
    f"missing in age_gender vs fixation:  {age_gender_ru_subject_id - ru_data_subject_id} \n",
)



 missing in age_gender vs demo   :  {'rum_46', 'rum_25', 'rum_24', 'rum_43', 'rum_66', 'ru_37', 'rum_10', 'ru_33', 'rum_47', 'rum_34_1', 'ru_52', 'ru_32', 'ru_34', 'rum_59', 'rum_23', 'rum_57', 'rum_60'} 
 missing in age_gender vs fixation:  {'rum_66'} 



In [52]:



# One-sided differences between the fixation data and demo: the first line lists
# IDs found only in the fixation data, the second line IDs found only in demo.
print(
    f" missing in fixation vs demo   :  {ru_data_subject_id - demo_ru_subject_id} \n",
    f"missing in demo vs fixation   :  {demo_ru_subject_id - ru_data_subject_id} \n",
)



 missing in fixation vs demo   :  {'rum_46', 'rum_25', 'rum_24', 'rum_43', 'ru_37', 'rum_10', 'ru_33', 'rum_47', 'rum_34_1', 'ru_52', 'ru_32', 'ru_34', 'rum_59', 'rum_23', 'rum_57', 'rum_60'} 
 missing in demo vs fixation   :  set() 



## Interim conclusion



**In the further analysis we will work only with the subjects that are present in all three sources (age/gender, demo and fixation data).**




In [53]:


# Keep only the subjects that appear in all three sources.
subject_to_keep = age_gender_ru_subject_id.intersection(
    demo_ru_subject_id, ru_data_subject_id
)

len(subject_to_keep)


87

In [54]:

# Restrict each table to the subjects shared by all three sources.
ru_age_gender = age_gender_ru.loc[age_gender_ru["SubjectID"].isin(subject_to_keep)]
ru_demo = demo_ru.loc[demo_ru["SubjectID"].isin(subject_to_keep)]
ru_fix_data = ru_data.loc[ru_data["SubjectID"].isin(subject_to_keep)]

# Shapes before / after filtering, pairwise per table.
(
    age_gender_ru.shape, ru_age_gender.shape,
    demo_ru.shape, ru_demo.shape,
    ru_data.shape, ru_fix_data.shape,
)



((104, 4), (87, 4), (94, 13), (87, 13), (233878, 9), (184791, 9))

In [55]:


# Count the missing values per column of the filtered fixation table.
ru_fix_data.isna().sum()




SubjectID       0
Text_ID         0
Fix_X           0
Fix_Y           0
Fix_Duration    0
Word_Number     0
Word            0
Sentence        0
Language        0
dtype: int64

In [56]:



# Count the missing values per column of the filtered score table.
ru_demo.isna().sum()




subid                 0
SubjectID             0
lang                  0
L2_spelling_skill     0
L2_vocabulary_size    0
vocab.t2.5            0
L2_lexical_skill      0
TOWRE_word            0
TOWRE_nonword         0
motiv                 0
IQ                    0
Target_Ave            0
Target_Label          0
dtype: int64

In [57]:


# Count the missing values per column of the filtered age/sex table.
ru_age_gender.isna().sum()



SubjectID    0
Age          0
Sex          0
lang         0
dtype: int64

In [60]:



# Add Age and Sex to the per-subject score table.
demo_ru_concat = concat_dfs(
    df1=ru_demo,
    df2=ru_age_gender,
    features1=[
        'SubjectID',
        'lang',
        'L2_spelling_skill',
        'L2_vocabulary_size',
        'vocab.t2.5',
        'L2_lexical_skill',
        'TOWRE_word',
        'TOWRE_nonword',
        'motiv',
        'IQ',
        'Target_Ave',
        'Target_Label',
    ],
    features2=["Age", "Sex"],
)




In [61]:
# Inspect the per-subject table that now carries the Age and Sex columns.
demo_ru_concat

Unnamed: 0,SubjectID,lang,L2_spelling_skill,L2_vocabulary_size,vocab.t2.5,L2_lexical_skill,TOWRE_word,TOWRE_nonword,motiv,IQ,Target_Ave,Target_Label,Age,Sex
0,rum_1,ru,4.431818,3.55,4.500,4.0625,3.798077,4.523810,3.7,7.0,4.144367,4.0,30,0
0,rum_2,ru,3.977273,4.25,5.000,4.5000,3.557692,3.492063,4.0,10.0,4.129505,4.0,20,0
0,rum_3,ru,4.431818,4.25,4.750,4.8125,3.990385,4.603175,2.4,7.0,4.472980,4.0,20,0
0,rum_4,ru,4.318182,3.60,4.875,4.5000,3.750000,3.412698,3.2,5.0,4.075980,4.0,20,0
0,rum_5,ru,5.000000,3.80,4.750,4.3125,4.182692,4.682540,3.6,8.0,4.454622,4.0,29,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,ru_50,ru,4.204545,2.35,4.125,4.3750,3.461538,2.936508,3.7,9.0,3.575432,4.0,21,0
0,ru_51,ru,3.750000,1.95,4.125,3.5625,3.990385,4.206349,4.1,9.0,3.597372,4.0,30,0
0,ru_53,ru,4.090909,1.60,3.250,3.8750,3.509615,3.571429,4.3,11.0,3.316159,3.0,27,1
0,ru_54,ru,4.318182,2.40,4.375,3.5000,3.653846,4.126984,3.8,8.0,3.729002,4.0,28,1


In [64]:


# Attach the per-subject scores, demographics and targets to the fixation rows.
ru_fix_demo = concat_dfs(
    df1=ru_fix_data,
    df2=demo_ru_concat,
    features1=[
        'Text_ID',
        'Fix_X',
        'Fix_Y',
        'Fix_Duration',
        'Word_Number',
        'Word',
        'Sentence',
        'Language',
    ],
    features2=[
        'SubjectID',
        'L2_spelling_skill',
        'L2_vocabulary_size',
        'vocab.t2.5',
        'L2_lexical_skill',
        'TOWRE_word',
        'TOWRE_nonword',
        'motiv',
        'IQ',
        'Age',
        'Sex',
        'Target_Ave',
        'Target_Label',
    ],
)



In [67]:


# Persist the combined fixation + demographics table; index=False leaves the row index out of the CSV.
ru_fix_demo.to_csv("../Datasets/DataToUse/Ru_Fix_Demo.csv", index=False)

