In [1]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path;
# presumably this is what lets the `utils.paths` import below resolve.
PROJECT_ROOT = Path.cwd().resolve().parents[0]
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

import os

import numpy as np
import pandas as pd

from utils.paths import RAW_DATA_DIR, RAN_DIR

## 1. Data Prep

### Prep RAN Scores

#### DysCover Children

In [2]:
# Load the raw DysCover RAN sheet from the raw-data folder and display it for inspection.
dyscover_ran = pd.read_csv(RAW_DATA_DIR / 'RAN_DysCover.csv')
dyscover_ran

Unnamed: 0,Child ID,Age,RAN time (s),Nbr of mistakes,Nbr of forgotten words,TOTAL,objectPerSecond,HASH
0,12D_before_1,6.0,97.0,2,5,132.0,0.151515,2024-06-05_10-01-57
1,12D_before_2,6.0,68.0,0,3,83.0,0.240964,2024-06-05_10-01-57
2,12F_before_1,6.0,47.0,0,2,57.0,0.350877,2024-06-05_10-01-57
3,12F_before_2,6.0,47.0,0,3,62.0,0.322581,2024-06-05_10-01-57
4,12F_after_1,6.0,58.0,0,2,68.0,0.294118,2024-06-05_10-01-57
...,...,...,...,...,...,...,...,...
73,18D_before_1,5.0,48.5,0,0,48.5,0.412371,2024-06-05_10-34-55
74,18D_before_2,5.0,43.0,0,0,43.0,0.465116,2024-06-05_10-34-55
75,18F_before_1,5.0,79.0,2,0,89.0,0.224719,2024-06-05_10-34-55
76,18F_after_1,5.0,57.0,1,0,62.0,0.322581,2024-06-05_10-34-55


In [3]:
# Derive the language code and numeric child id from the composite
# 'Child ID', tag the cohort, and keep only the columns used downstream.
# NOTE(review): positional slicing assumes every 'Child ID' starts with two
# id characters followed by one language letter (e.g. '12D_before_1') — confirm.
dyscover_ran = (
    dyscover_ran
    .assign(
        Language=lambda frame: frame['Child ID'].str[2],
        ID=lambda frame: frame['Child ID'].str[:2],
        Group='DysCover',
    )
    .drop(columns=['RAN time (s)', 'Nbr of mistakes', 'Nbr of forgotten words', 'TOTAL', 'Child ID'])
)
dyscover_ran

Unnamed: 0,Age,objectPerSecond,HASH,Language,ID,Group
0,6.0,0.151515,2024-06-05_10-01-57,D,12,DysCover
1,6.0,0.240964,2024-06-05_10-01-57,D,12,DysCover
2,6.0,0.350877,2024-06-05_10-01-57,F,12,DysCover
3,6.0,0.322581,2024-06-05_10-01-57,F,12,DysCover
4,6.0,0.294118,2024-06-05_10-01-57,F,12,DysCover
...,...,...,...,...,...,...
73,5.0,0.412371,2024-06-05_10-34-55,D,18,DysCover
74,5.0,0.465116,2024-06-05_10-34-55,D,18,DysCover
75,5.0,0.224719,2024-06-05_10-34-55,F,18,DysCover
76,5.0,0.322581,2024-06-05_10-34-55,F,18,DysCover


#### Fruit Ninja Adults

In [4]:
# Load the raw Fruit Ninja RAN sheet from the raw-data folder.
FN_ran = pd.read_csv(RAW_DATA_DIR / 'RAN_FruitNinja.csv')

In [5]:
# Bring the Fruit Ninja sheet onto the shared column names
# (ID, objectPerSecond, Group, Language) used by the other cohorts.
FN_ran['ID'] = FN_ran['Participant number']
FN_ran['objectPerSecond'] = FN_ran['RAN(obj/sec)']
FN_ran['Group'] = 'FruitNinja'

# check what labels are in the column
print(FN_ran['Primary language'].unique())

# Map the free-text primary language onto one-letter codes
language_map = {
    'French': 'F',
    'Swissgerman': 'D',
    'Luxembourgish / German': 'D',
    'German': 'D',
    'Chinese / English': 'E',
}

# Apply the mapping and store in a new column, e.g. 'Language'
FN_ran['Language'] = FN_ran['Primary language'].map(language_map)

# Series.map yields NaN for any label that is not a key of the dict. Such rows
# would silently fall out of the per-language selections later on, so report
# them here instead of relying on a manual read of the printed labels.
unmapped_labels = FN_ran.loc[FN_ran['Language'].isna(), 'Primary language'].unique()
if len(unmapped_labels) > 0:
    print(f"WARNING: no language code defined for 'Primary language' values: {list(unmapped_labels)}")

# Drop everything that is not part of the shared schema.
FN_ran = FN_ran.drop(columns=['Participant number', 'Date', 'Name', 'Sex', 'Primary language',
       'Hand dominance', 'Dyslexia diagnosis', 'Dyslexia in family',
       'Visual impairment, contacts', 'Inter-eye distance [mm]', 'Notes',
       'RAN(obj/sec)', 'FN'])

FN_ran


['Swissgerman' 'Luxembourgish / German' 'German' 'French'
 'Chinese / English']


Unnamed: 0,Age,HASH,ID,objectPerSecond,Group,Language
0,21,2024-03-08_13-36-24,1,1.2,FruitNinja,D
1,26,2024-03-08_14-14-27,2,2.22,FruitNinja,D
2,22,2024-03-08_14-38-57,3,1.79,FruitNinja,D
3,26,2024-03-08_15-41-30,4,1.86,FruitNinja,D
4,29,2024-03-08_16-18-12,5,1.95,FruitNinja,D
5,34,2024-03-08_16-40-11,6,1.42,FruitNinja,D
6,27,2024-03-21_11-19-30,7,1.72,FruitNinja,D
7,23,2024-03-21_14-03-47,8,1.88,FruitNinja,F
8,21,2024-03-21_14-29-21,9,1.93,FruitNinja,F
9,20,2024-03-21_17-41-08,10,1.51,FruitNinja,F


#### Adults Spring

In [6]:
# Load the Adult Spring sheet, tag the cohort, and add an all-NaN Age
# column as a placeholder so the frame lines up with the other cohorts.
as_ran = (
    pd.read_csv(RAW_DATA_DIR / 'RAN_AdultSpring.csv')
    .assign(Group='AdultSpring', Age=np.nan)
)
as_ran.head()

Unnamed: 0,HASH,Merge,Language,ID,number of error 1,number of error 2,number of error 3,number of error 4,time added/error,RAN1 t,...,objectPerSecond 2,objectPerSecond 3,objectPerSecond 4,Mean 1-2 score RAN,Mean score RAN,Notes,number of FN rounds,highest FN score,Group,Age
0,2025-04-15_09-22-00,2025-04-15_09-25-00,F,SUB 311,0,0,0.0,0.0,0,11.209,...,1.349892,2.201673,1.571092,1.567,1.73,Manually renamed the files,3,808,AdultSpring,
1,2025-04-15_15-13-11,2025-04-15_15-21-59,F,SUB 600,0,0,0.0,0.0,0,14.726,...,1.054185,1.456664,1.440611,1.206,1.33,,5,662,AdultSpring,
2,2025-04-15_16-33-38,,F,SUB 262,0,0,0.0,0.0,0,16.311,...,1.02923,1.497903,1.184764,1.128,1.23,,5,762,AdultSpring,
3,2025-04-16_11-29-18,,F,SUB 685,0,0,0.0,0.0,0,12.49,...,1.574927,1.797268,1.703578,1.588,1.67,,5,806,AdultSpring,
4,2025-04-17_08-06-40,,F,SUB 743,0,0,,,0,13.419,...,1.946093,,,1.718,1.72,,3,903,AdultSpring,


In [7]:
# Columns of the Adult Spring sheet that are not carried into the long table.
# NOTE(review): 'Mean score RAN ' carries a trailing space — presumably it
# mirrors the CSV header exactly; confirm before "fixing" it.
unused_columns = [
    'Merge',
    'number of error 1', 'number of error 2', 'number of error 3', 'number of error 4',
    'time added/error',
    'RAN1 t', 'RAN2 t', 'RAN3 t', 'RAN4 t',
    'Score RAN 1', 'Score RAN 2', 'Score RAN 3', 'Score RAN 4',
    'Mean 1-2 score RAN',
    'Mean score RAN ',
    'Notes',
    'number of FN rounds',
    'highest FN score',
]
as_ran = as_ran.drop(columns=unused_columns)


In [8]:
# Reshape the four per-run speed columns into a single 'objectPerSecond'
# column (one row per participant and run) to match the other cohorts.
# Age is still all-NaN at this point and has to be filled in later.
# NOTE(review): some ID values in this sheet look like first names rather
# than 'SUB nnn' codes — confirm they are safe to export.
as_ran_long = (
    as_ran
    .melt(
        id_vars=['HASH', 'Language', 'ID', 'Group', 'Age'],
        value_vars=[
            'objectPerSecond 1',
            'objectPerSecond 2',
            'objectPerSecond 3',
            'objectPerSecond 4',
        ],
        var_name='Run',
        value_name='objectPerSecond',
    )
    .sort_values(by=['HASH', 'Run'])  # order the rows by HASH, then run label
    .drop(columns=['Run'])            # the run label is only needed for ordering
    .reset_index(drop=True)
)
as_ran_long


Unnamed: 0,HASH,Language,ID,Group,Age,objectPerSecond
0,2025-03-31_09-07-17,F,Ali,AdultSpring,,1.149425
1,2025-03-31_09-07-17,F,Ali,AdultSpring,,1.069748
2,2025-03-31_09-07-17,F,Ali,AdultSpring,,1.540120
3,2025-03-31_09-07-17,F,Ali,AdultSpring,,1.433897
4,2025-03-31_09-49-48,D,Lisa,AdultSpring,,0.725953
...,...,...,...,...,...,...
111,2025-05-05_15-33-21,F,SUB 990,AdultSpring,,0.649287
112,2025-05-08_09-09-51,F,SUB 390,AdultSpring,,0.848644
113,2025-05-08_09-09-51,F,SUB 390,AdultSpring,,0.864304
114,2025-05-08_09-09-51,F,SUB 390,AdultSpring,,0.872182


## 2. Make HASH Table

In [9]:
# Stack the three cohorts into one table (columns sorted alphabetically by
# `sort=True`) and discard rows that have no RAN speed.
cohort_frames = [dyscover_ran, FN_ran, as_ran_long]
combined_df = (
    pd.concat(cohort_frames, ignore_index=True, sort=True)
    .dropna(subset=['objectPerSecond'])
)
combined_df

Unnamed: 0,Age,Group,HASH,ID,Language,objectPerSecond
0,6.0,DysCover,2024-06-05_10-01-57,12,D,0.151515
1,6.0,DysCover,2024-06-05_10-01-57,12,D,0.240964
2,6.0,DysCover,2024-06-05_10-01-57,12,F,0.350877
3,6.0,DysCover,2024-06-05_10-01-57,12,F,0.322581
4,6.0,DysCover,2024-06-05_10-01-57,12,F,0.294118
...,...,...,...,...,...,...
204,,AdultSpring,2025-05-05_15-33-21,SUB 990,F,0.649287
205,,AdultSpring,2025-05-08_09-09-51,SUB 390,F,0.848644
206,,AdultSpring,2025-05-08_09-09-51,SUB 390,F,0.864304
207,,AdultSpring,2025-05-08_09-09-51,SUB 390,F,0.872182


In [10]:
# For every HASH, find the index label of its fastest run ...
fastest_row_labels = combined_df.groupby('HASH')['objectPerSecond'].idxmax()

# ... and pull exactly those rows, renumbered from zero.
hash_table = combined_df.loc[fastest_row_labels].reset_index(drop=True)

# This is the best RAN score per HASH irrespective of the test language.
hash_table

Unnamed: 0,Age,Group,HASH,ID,Language,objectPerSecond
0,21.0,FruitNinja,2024-03-08_13-36-24,1,D,1.200000
1,26.0,FruitNinja,2024-03-08_14-14-27,2,D,2.220000
2,22.0,FruitNinja,2024-03-08_14-38-57,3,D,1.790000
3,26.0,FruitNinja,2024-03-08_15-41-30,4,D,1.860000
4,29.0,FruitNinja,2024-03-08_16-18-12,5,D,1.950000
...,...,...,...,...,...,...
56,,AdultSpring,2025-04-29_14-39-36,SUB 799,D,2.157730
57,,AdultSpring,2025-04-29_15-55-26,SUB 848,D,1.916627
58,,AdultSpring,2025-05-02_13-02-10,SUB 531,F,2.473411
59,,AdultSpring,2025-05-05_15-33-21,SUB 990,F,0.649287


In [11]:
# Summary statistics of the per-HASH best scores. Rows whose Age is NaN
# (e.g. AdultSpring, set to NaN on load) are left out of the Age statistics.
hash_table.describe()

Unnamed: 0,Age,objectPerSecond
count,31.0,61.0
mean,13.870968,1.394779
std,9.745912,0.62267
min,4.0,0.31746
25%,6.0,0.769231
50%,6.0,1.578283
75%,22.5,1.88
max,34.0,2.473411


In [12]:
# -- SAVE AS CSV ---
# Write the per-HASH best-score table once; an existing file is kept unless
# `overwrite` is switched on, so a stale CSV is never replaced by accident.
output_csv_path = RAN_DIR / 'RAN_HashTable.csv'
overwrite = False  # Set to True if you want to overwrite existing files

if not os.path.exists(output_csv_path) or overwrite:
    # Make sure the target folder exists so the write does not fail on a
    # fresh checkout where the processed-data directory has not been created.
    Path(output_csv_path).parent.mkdir(parents=True, exist_ok=True)
    hash_table.to_csv(output_csv_path, index=False)
    print(f"Features saved to {output_csv_path}")

else:
    print(f"File already exists: {output_csv_path}. Not overwriting.")


File already exists: /HOME/lecomteo/thesis/master_thesis/data/processed/RAN/RAN_HashTable.csv. Not overwriting.


### Determine Best Scores for DE RAN

In [13]:
# Restrict to runs coded 'D' (German), then keep each HASH's fastest run.
df_de = combined_df.loc[combined_df['Language'] == 'D']

fastest_de_labels = df_de.groupby('HASH')['objectPerSecond'].idxmax()
best_scores_de = df_de.loc[fastest_de_labels].reset_index(drop=True)

best_scores_de

Unnamed: 0,Age,Group,HASH,ID,Language,objectPerSecond
0,21.0,FruitNinja,2024-03-08_13-36-24,1,D,1.2
1,26.0,FruitNinja,2024-03-08_14-14-27,2,D,2.22
2,22.0,FruitNinja,2024-03-08_14-38-57,3,D,1.79
3,26.0,FruitNinja,2024-03-08_15-41-30,4,D,1.86
4,29.0,FruitNinja,2024-03-08_16-18-12,5,D,1.95
5,34.0,FruitNinja,2024-03-08_16-40-11,6,D,1.42
6,27.0,FruitNinja,2024-03-21_11-19-30,7,D,1.72
7,19.0,FruitNinja,2024-03-25_13-21-43,12,D,1.59
8,20.0,FruitNinja,2024-03-28_12-25-50,15,D,2.16
9,6.0,DysCover,2024-06-05_10-01-57,12,D,0.240964


In [14]:
# Summary statistics of the best 'D' (German) score per HASH; NaN ages are not counted.
best_scores_de.describe()

Unnamed: 0,Age,objectPerSecond
count,23.0,37.0
mean,13.086957,1.254602
std,10.121593,0.684044
min,4.0,0.240964
25%,5.5,0.666667
50%,6.0,1.42
75%,21.5,1.79
max,34.0,2.401537


### Determine Best Scores for FR RAN

In [15]:
# Same selection as for the other language, here for runs coded 'F' (French):
# filter first, then take the highest objectPerSecond row within each HASH.
df_fr = combined_df[combined_df['Language'] == 'F']

best_scores_fr = (
    df_fr
    .loc[df_fr.groupby('HASH')['objectPerSecond'].idxmax()]
    .reset_index(drop=True)  # clean 0..n-1 index for display
)

best_scores_fr

Unnamed: 0,Age,Group,HASH,ID,Language,objectPerSecond
0,23.0,FruitNinja,2024-03-21_14-03-47,8,F,1.88
1,21.0,FruitNinja,2024-03-21_14-29-21,9,F,1.93
2,20.0,FruitNinja,2024-03-21_17-41-08,10,F,1.51
3,23.0,FruitNinja,2024-03-21_18-07-22,11,F,1.56
4,25.0,FruitNinja,2024-03-26_14-16-11,13,F,1.86
5,6.0,DysCover,2024-06-05_10-01-57,12,F,0.350877
6,5.0,DysCover,2024-06-05_10-34-55,18,F,0.322581
7,5.0,DysCover,2024-06-05_10-56-25,21,F,0.350877
8,6.0,DysCover,2024-06-06_10-02-38,16,F,0.217391
9,6.0,DysCover,2024-06-06_10-32-17,17,F,0.20202


In [16]:
# Summary statistics of the best 'F' (French) score per HASH; NaN ages are not counted.
best_scores_fr.describe()

Unnamed: 0,Age,objectPerSecond
count,16.0,32.0
mean,10.75,1.287085
std,8.193493,0.689401
min,4.0,0.20202
25%,5.0,0.662322
50%,6.0,1.503951
75%,20.25,1.881876
max,25.0,2.473411
