In [1]:


import os
import re
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Tuple
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold, train_test_split



In [16]:


L2_scores = {
    "L2_spelling_skill": [0, 44], 
    "L2_vocabulary_size":[0, 100], 
    "vocab.t2.5": [0, 40],  # still not clear but assume it is 0, 100
    "L2_lexical_skill": [0, 100],
    "TOWRE_word": [0, 104],
    "TOWRE_nonword": [0, 63],
}
L2_scores


{'L2_spelling_skill': [0, 44],
 'L2_vocabulary_size': [0, 100],
 'vocab.t2.5': [0, 40],
 'L2_lexical_skill': [0, 100],
 'TOWRE_word': [0, 104],
 'TOWRE_nonword': [0, 63]}

In [3]:
for k, v in L2_scores.items():
    print(k, f"Lower={v[0]}, Upper={v[1]}")

L2_spelling_skill Lower=0, Upper=44
L2_vocabulary_size Lower=0, Upper=100
vocab.t2.5 Lower=0, Upper=100
L2_lexical_skill Lower=0, Upper=100
TOWRE_word Lower=0, Upper=104
TOWRE_nonword Lower=0, Upper=63


# Load data for native Ru

In [4]:

age_gender = pd.read_excel("../Datasets/MECO/age_gender.xlsx")
age_gender



Unnamed: 0,SubjectID,Age,Sex,lang
0,du_4,19,0.0,du
1,du_5,21,0.0,du
2,du_6,27,0.0,du
3,du_7,23,1.0,du
4,du_8,27,0.0,du
...,...,...,...,...
592,tr_48,20,0.0,tr
593,tr_49,22,1.0,tr
594,tr_50,21,0.0,tr
595,tr_51,25,0.0,tr


In [5]:

demo = pd.read_excel("../Datasets/MECO/demo_alllang.xlsx")
demo



Unnamed: 0,subid,SubjectID,lang,L2_spelling_skill,L2_vocabulary_size,vocab.t2.5,L2_lexical_skill,TOWRE_word,TOWRE_nonword,motiv,IQ
0,DU_04,du_4,du,31.0,66.0,37.0,75.00,84.0,56.0,3.8,6.0
1,DU_05,du_5,du,35.0,78.0,36.0,93.75,63.0,54.0,4.0,4.0
2,DU_06,du_6,du,40.0,80.0,39.0,95.00,84.0,54.0,3.8,9.0
3,DU_07,du_7,du,33.0,50.0,36.0,66.25,65.0,45.0,3.5,9.0
4,DU_08,du_8,du,32.0,50.0,37.0,73.75,82.0,28.0,3.9,6.0
...,...,...,...,...,...,...,...,...,...,...,...
601,rum_67,rum_67,ru,40.0,69.0,38.0,93.75,88.0,52.0,4.0,11.0
602,rum_68,rum_68,ru,39.0,65.0,37.0,76.65,81.0,46.0,3.8,11.0
603,rum_69,rum_69,ru,37.0,66.0,37.0,68.75,84.0,51.0,3.6,9.0
604,rum_70,rum_70,ru,39.0,40.0,35.0,73.75,74.0,41.0,3.5,8.0


In [6]:


demo.describe()



Unnamed: 0,L2_spelling_skill,L2_vocabulary_size,vocab.t2.5,L2_lexical_skill,TOWRE_word,TOWRE_nonword,motiv,IQ
count,599.0,592.0,592.0,593.0,603.0,603.0,594.0,584.0
mean,35.262104,46.971284,30.445946,76.445093,80.671642,50.344942,3.634007,8.640411
std,5.195806,23.67506,9.714039,11.856747,11.791586,8.924401,0.348287,1.948957
min,21.0,3.0,3.0,46.25,36.0,20.0,2.2,1.0
25%,32.0,24.0,24.0,67.5,73.0,45.0,3.4,7.0
50%,36.0,46.5,35.0,75.0,82.0,53.0,3.7,9.0
75%,39.0,69.0,37.0,86.25,89.0,57.0,3.9,10.0
max,76.0,88.0,40.0,100.0,104.0,69.0,4.4,12.0


In [7]:


ru_data = pd.read_excel("../Datasets/MECO/DATA/ru_data.xlsx")
ru_data


Unnamed: 0,SubjectID,Text_ID,Fix_X,Fix_Y,Fix_Duration,Word_Number,Word,Sentence,Language
0,ru_10,1,138,352,299,77.0,portrait,"In 1825, Morse was i",ru
1,ru_10,1,436,413,338,97.0,ill,"In 1825, Morse was i",ru
2,ru_10,1,507,417,197,98.0,back,"In 1825, Morse was i",ru
3,ru_10,1,1072,411,111,105.0,message,The message had take,ru
4,ru_10,1,1259,408,229,107.0,taken,The message had take,ru
...,...,...,...,...,...,...,...,...,...
236684,rum_9,12,324,150,177,20.0,smart,Technology is rapidl,ru
236685,rum_9,12,126,123,269,1.0,Technology,Technology is rapidl,ru
236686,rum_9,12,355,138,209,20.0,smart,Technology is rapidl,ru
236687,rum_9,12,464,122,102,4.0,expanding,Technology is rapidl,ru


In [8]:


ru_data_subject_id = set(ru_data.SubjectID)



# Check ups

In [9]:


age_gender_ru = age_gender.loc[age_gender.lang == "ru"]
age_gender_ru_subject_id = set(age_gender_ru.SubjectID)
age_gender_ru



Unnamed: 0,SubjectID,Age,Sex,lang
357,rum_1,30,0.0,ru
358,rum_2,20,0.0,ru
359,rum_3,20,0.0,ru
360,rum_4,20,0.0,ru
361,rum_5,29,0.0,ru
...,...,...,...,...
459,ru_52,29,0.0,ru
460,ru_53,27,1.0,ru
461,ru_54,28,1.0,ru
462,ru_55,29,1.0,ru


In [10]:


demo_ru = demo.loc[demo.lang == "ru"]
demo_ru_subject_id = set(demo_ru.SubjectID)
demo_ru



Unnamed: 0,subid,SubjectID,lang,L2_spelling_skill,L2_vocabulary_size,vocab.t2.5,L2_lexical_skill,TOWRE_word,TOWRE_nonword,motiv,IQ
430,ru_10,ru_10,ru,37.0,46.0,33.0,68.75,77.0,45.0,3.4,7.0
431,ru_11,ru_11,ru,40.0,67.0,36.0,97.50,90.0,55.0,3.4,9.0
432,ru_12,ru_12,ru,30.0,26.0,26.0,70.00,78.0,47.0,3.4,10.0
433,ru_13,ru_13,ru,37.0,69.0,39.0,81.25,89.0,54.0,3.6,10.0
434,ru_14,ru_14,ru,30.0,6.0,6.0,70.00,82.0,39.0,3.8,7.0
...,...,...,...,...,...,...,...,...,...,...,...
601,rum_67,rum_67,ru,40.0,69.0,38.0,93.75,88.0,52.0,4.0,11.0
602,rum_68,rum_68,ru,39.0,65.0,37.0,76.65,81.0,46.0,3.8,11.0
603,rum_69,rum_69,ru,37.0,66.0,37.0,68.75,84.0,51.0,3.6,9.0
604,rum_70,rum_70,ru,39.0,40.0,35.0,73.75,74.0,41.0,3.5,8.0


In [11]:

print(
    f" age-gender =  {len(age_gender_ru_subject_id)} \n", 
    f"ru_demo    =  {len(demo_ru_subject_id)} \n",
    f"fixation   = {len(ru_data_subject_id)} \n", 
)


 age-gender =  107 
 ru_demo    =  110 
 fixation   = 110 



In [12]:


print(
    
    f" missing in age_gender or demo    :  {age_gender_ru_subject_id.symmetric_difference(demo_ru_subject_id)} \n",
    f"missing in age_gender or fixation: {age_gender_ru_subject_id.symmetric_difference(ru_data_subject_id)} \n",
    f"missing in fixaiton or demo      : {ru_data_subject_id.symmetric_difference(demo_ru_subject_id)} \n",
    
)  



 missing in age_gender or demo    :  {'ru_2', 'rum_66', 'ru_4', 'ru_8', 'ru_9'} 
 missing in age_gender or fixation: {'ru_2', 'rum_66', 'ru_4', 'ru_8', 'ru_9'} 
 missing in fixaiton or demo      : set() 



In [13]:



print(
    
    f" missing in demo vs age_gender    :  {demo_ru_subject_id.difference(age_gender_ru_subject_id)} \n",
    f"missing in fixation vs age_gender:  {ru_data_subject_id.difference(age_gender_ru_subject_id)} \n",

)





 missing in demo vs age_gender    :  {'ru_2', 'ru_8', 'ru_4', 'ru_9'} 
 missing in fixation vs age_gender:  {'ru_2', 'ru_8', 'ru_4', 'ru_9'} 



In [14]:

print(
    
    f" missing in age_gender vs demo   :  {age_gender_ru_subject_id.difference(demo_ru_subject_id)} \n",
    f"missing in age_gender vs fixation:  {age_gender_ru_subject_id.difference(ru_data_subject_id)} \n",

)



 missing in age_gender vs demo   :  {'rum_66'} 
 missing in age_gender vs fixation:  {'rum_66'} 



In [15]:



print(
    
    f" missing in fixation vs demo   :  {ru_data_subject_id.difference(demo_ru_subject_id)} \n",
    f"missing in demo vs fixation   :  {demo_ru_subject_id.difference(ru_data_subject_id)} \n",

)



 missing in fixation vs demo   :  set() 
 missing in demo vs fixation   :  set() 



## midterm conclusion:

- We should drop out "{'ru_9', 'rum_66', 'ru_4', 'ru_8', 'ru_2'}" subjects from our experiments.



In [20]:


ru_data.isnull()



Unnamed: 0,SubjectID,Text_ID,Fix_X,Fix_Y,Fix_Duration,Word_Number,Word,Sentence,Language
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
236684,False,False,False,False,False,False,False,False,False
236685,False,False,False,False,False,False,False,False,False
236686,False,False,False,False,False,False,False,False,False
236687,False,False,False,False,False,False,False,False,False


In [21]:


ru_data.isnull().sum()




SubjectID          0
Text_ID            0
Fix_X              0
Fix_Y              0
Fix_Duration       0
Word_Number       26
Word              26
Sentence        2811
Language           0
dtype: int64

In [22]:



demo_ru.isnull().sum()




subid                  0
SubjectID              0
lang                   0
L2_spelling_skill      3
L2_vocabulary_size     7
vocab.t2.5             7
L2_lexical_skill      11
TOWRE_word             1
TOWRE_nonword          1
motiv                  9
IQ                     0
dtype: int64

In [23]:


age_gender_ru.isnull().sum()



SubjectID    0
Age          0
Sex          3
lang         0
dtype: int64

In [24]:

demo_ru



Unnamed: 0,subid,SubjectID,lang,L2_spelling_skill,L2_vocabulary_size,vocab.t2.5,L2_lexical_skill,TOWRE_word,TOWRE_nonword,motiv,IQ
430,ru_10,ru_10,ru,37.0,46.0,33.0,68.75,77.0,45.0,3.4,7.0
431,ru_11,ru_11,ru,40.0,67.0,36.0,97.50,90.0,55.0,3.4,9.0
432,ru_12,ru_12,ru,30.0,26.0,26.0,70.00,78.0,47.0,3.4,10.0
433,ru_13,ru_13,ru,37.0,69.0,39.0,81.25,89.0,54.0,3.6,10.0
434,ru_14,ru_14,ru,30.0,6.0,6.0,70.00,82.0,39.0,3.8,7.0
...,...,...,...,...,...,...,...,...,...,...,...
601,rum_67,rum_67,ru,40.0,69.0,38.0,93.75,88.0,52.0,4.0,11.0
602,rum_68,rum_68,ru,39.0,65.0,37.0,76.65,81.0,46.0,3.8,11.0
603,rum_69,rum_69,ru,37.0,66.0,37.0,68.75,84.0,51.0,3.6,9.0
604,rum_70,rum_70,ru,39.0,40.0,35.0,73.75,74.0,41.0,3.5,8.0


In [25]:

def convert_range(x_old, old_min, old_max, new_min, new_max):
    
    x_new = ((new_max - new_min)*(x_old - old_min))/(old_max - old_min)+ new_min
    
    return x_new 




In [26]:


convert_range(2.5, 0, 5, 2, 20, )




11.0

In [27]:

df = pd.DataFrame(
    np.arange(0, 9).reshape(-1, 3), 
    index=list("ABC"), 
    columns=["f1", "f2", "f3"]
)
df



Unnamed: 0,f1,f2,f3
A,0,1,2
B,3,4,5
C,6,7,8


In [28]:


k = "f3"
tmp = df[k].apply(convert_range, args=(df[k].min(), df[k].max(), 0, 20))
df[k] = tmp

    

In [29]:

df


Unnamed: 0,f1,f2,f3
A,0,1,0.0
B,3,4,10.0
C,6,7,20.0


In [30]:

df

Unnamed: 0,f1,f2,f3
A,0,1,0.0
B,3,4,10.0
C,6,7,20.0


In [39]:


def concat_dfs(df1, df2, features1, features2):

    """ concatenates df2 to df1, that is, it casts df2's dimensions df1. """

    data = []
    subject_ids = df2.SubjectID
    for subject_id in subject_ids:
        print("subject_id:", subject_id)
        tmp1 = df1.loc[(df1.SubjectID == subject_id)]
        tmp1 = tmp1.loc[:, features1].reset_index(drop=True)
        tmp2 = df2.loc[df2.SubjectID == subject_id]
        tmp2 = tmp2.loc[:, features2]

        n = tmp1.shape[0]
        if n == tmp2.shape[0]:
            tmp2 = pd.concat([tmp1], ignore_index=True)
        else:
            tmp2 = pd.concat([tmp2] * n, ignore_index=True)  # .reset_index(drop=True)

        tmp3 = pd.concat([tmp1, tmp2], axis=1, )

        if tmp3.shape[0] != tmp1.shape[0] or tmp3.shape[0] != tmp2.shape[0]:
            print(
                subject_id,
                "in consistencies in number of observations (rows)"
            )

        if tmp3.shape[1] != tmp1.shape[1] + tmp2.shape[1]:
            print(
                subject_id,
                "inconsistencies in feature space (columns)"
            )

        data.append(tmp3)

    return pd.concat(data)


In [40]:


df1 = demo_ru
df2 = age_gender_ru

features1 = list(demo_ru.columns[1:])
features2=["SubjectID", "Age", "Sex"]


In [35]:

data = []
subject_ids = df2.SubjectID
for subject_id in subject_ids:
    print("subject_id:", subject_id)
    tmp1 = df1.loc[(df1.SubjectID == subject_id)]
    tmp1 = tmp1.loc[:, features1].reset_index(drop=True)
    tmp2 = df2.loc[df2.SubjectID == subject_id]
    tmp2 = tmp2.loc[:, features2]

    n = tmp1.shape[0]


subject_id: rum_1
subject_id: rum_2
subject_id: rum_3
subject_id: rum_4
subject_id: rum_5
subject_id: rum_6
subject_id: rum_7
subject_id: rum_8
subject_id: rum_9
subject_id: rum_10
subject_id: rum_11
subject_id: rum_12
subject_id: rum_13
subject_id: rum_14
subject_id: rum_15
subject_id: rum_16
subject_id: rum_17
subject_id: rum_18
subject_id: rum_19
subject_id: rum_20
subject_id: rum_21
subject_id: rum_22
subject_id: rum_23
subject_id: rum_24
subject_id: rum_25
subject_id: rum_26
subject_id: rum_27
subject_id: rum_28
subject_id: rum_29
subject_id: rum_31
subject_id: rum_33
subject_id: rum_34
subject_id: rum_34_1
subject_id: rum_39
subject_id: rum_41
subject_id: rum_42
subject_id: rum_43
subject_id: rum_44
subject_id: rum_45
subject_id: rum_46
subject_id: rum_47
subject_id: rum_49
subject_id: rum_50
subject_id: rum_51
subject_id: rum_52
subject_id: rum_53
subject_id: rum_54
subject_id: rum_55
subject_id: rum_56
subject_id: rum_57
subject_id: rum_58
subject_id: rum_59
subject_id: rum_60


In [41]:



demo_concat = concat_dfs(df1=demo_ru, df2=age_gender_ru, 
                         features1=features1, 
                         features2=features2
)




subject_id: rum_1
subject_id: rum_2
subject_id: rum_3
subject_id: rum_4
subject_id: rum_5
subject_id: rum_6
subject_id: rum_7
subject_id: rum_8
subject_id: rum_9
subject_id: rum_10
subject_id: rum_11
subject_id: rum_12
subject_id: rum_13
subject_id: rum_14
subject_id: rum_15
subject_id: rum_16
subject_id: rum_17
subject_id: rum_18
subject_id: rum_19
subject_id: rum_20
subject_id: rum_21
subject_id: rum_22
subject_id: rum_23
subject_id: rum_24
subject_id: rum_25
subject_id: rum_26
subject_id: rum_27
subject_id: rum_28
subject_id: rum_29
subject_id: rum_31
subject_id: rum_33
subject_id: rum_34
subject_id: rum_34_1
subject_id: rum_39
subject_id: rum_41
subject_id: rum_42
subject_id: rum_43
subject_id: rum_44
subject_id: rum_45
subject_id: rum_46
subject_id: rum_47
subject_id: rum_49
subject_id: rum_50
subject_id: rum_51
subject_id: rum_52
subject_id: rum_53
subject_id: rum_54
subject_id: rum_55
subject_id: rum_56
subject_id: rum_57
subject_id: rum_58
subject_id: rum_59
subject_id: rum_60


ValueError: No objects to concatenate

In [None]:
list(age_gender_ru.columns)