In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().resolve().parents[0]
sys.path.append(str(PROJECT_ROOT))

import pandas as pd

from utils.paths import EXTRACTED_FEATURES_DIR, RAN_DIR

In [2]:
# --- CONFIGURATION ---
# Load RAN scores
ran_scores_df = pd.read_csv(RAN_DIR / "RAN_HashTable.csv")
merged_df = pd.read_csv(EXTRACTED_FEATURES_DIR / "merged_features.csv")

# Filter to keep only participants present in merged_df
final_hash = merged_df["HASH"].tolist()
filtered_ran_df = ran_scores_df[ran_scores_df["HASH"].isin(final_hash)]

In [3]:
# Summary statistics of Age per Group
summary = filtered_ran_df.groupby("Group")["Age"].describe()

# Add participant count per group
summary["Participant_Count"] = filtered_ran_df.groupby("Group").size()

# Display group-level age summary
print("Group-wise Age Summary:")
print(summary)

print("Total Participants:", len(final_hash))

# Language distribution within each group
language_counts = filtered_ran_df.groupby("Group")["Language"].value_counts()

print("\nLanguage Counts per Group:")
print(language_counts)

# Percent of participants with Language 'D'
percent_D = (filtered_ran_df["Language"] == "D").sum() / len(filtered_ran_df) * 100
print(f"\nPercentage of participants with Language 'D' for best RAN: {percent_D:.2f}%")

Group-wise Age Summary:
             count     mean       std   min   25%   50%   75%   max  \
Group                                                                 
AdultSpring    0.0      NaN       NaN   NaN   NaN   NaN   NaN   NaN   
DysCover      16.0   5.5625  0.629153   4.0   5.0   6.0   6.0   6.0   
FruitNinja    14.0  24.0000  4.150996  19.0  21.0  23.0  26.0  34.0   

             Participant_Count  
Group                           
AdultSpring                 26  
DysCover                    17  
FruitNinja                  14  
Total Participants: 57

Language Counts per Group:
Group        Language
AdultSpring  F           14
             D           12
DysCover     D           11
             F            6
FruitNinja   D            9
             F            5
Name: count, dtype: int64

Percentage of participants with Language 'D' for best RAN: 56.14%


In [4]:
dyscover_ran = pd.read_csv(RAN_DIR / "RAN_DysCover.csv")
dyscover_ran

Unnamed: 0,Child ID,Age,RAN time (s),Nbr of mistakes,Nbr of forgotten words,TOTAL,objectPerSecond,HASH
0,12D_before_1,6.0,97.0,2,5,132.0,0.151515,2024-06-05_10-01-57
1,12D_before_2,6.0,68.0,0,3,83.0,0.240964,2024-06-05_10-01-57
2,12F_before_1,6.0,47.0,0,2,57.0,0.350877,2024-06-05_10-01-57
3,12F_before_2,6.0,47.0,0,3,62.0,0.322581,2024-06-05_10-01-57
4,12F_after_1,6.0,58.0,0,2,68.0,0.294118,2024-06-05_10-01-57
...,...,...,...,...,...,...,...,...
73,18D_before_1,5.0,48.5,0,0,48.5,0.412371,2024-06-05_10-34-55
74,18D_before_2,5.0,43.0,0,0,43.0,0.465116,2024-06-05_10-34-55
75,18F_before_1,5.0,79.0,2,0,89.0,0.224719,2024-06-05_10-34-55
76,18F_after_1,5.0,57.0,1,0,62.0,0.322581,2024-06-05_10-34-55


In [5]:
# Extract relevant info from 'Child ID'
lowest_per_child = dyscover_ran[dyscover_ran["HASH"].isin(final_hash)]
lowest_per_child["Participant"] = lowest_per_child["Child ID"].str[:2]

# Find the row with the lowest objectPerSecond
idx_min_per_child = lowest_per_child.groupby("Participant")["objectPerSecond"].idxmin()
lowest_per_child = lowest_per_child.loc[idx_min_per_child]
lowest_per_child

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lowest_per_child["Participant"] = lowest_per_child["Child ID"].str[:2]


Unnamed: 0,Child ID,Age,RAN time (s),Nbr of mistakes,Nbr of forgotten words,TOTAL,objectPerSecond,HASH,Participant
0,12D_before_1,6.0,97.0,2,5,132.0,0.151515,2024-06-05_10-01-57,12
35,13F_after,6.0,43.0,0,0,43.0,0.465116,2024-06-06_11-09-27,13
10,14D_before_2,6.0,91.0,0,0,91.0,0.21978,2024-06-06_08-02-47,14
24,16D_after_1,6.0,63.0,1,5,93.0,0.215054,2024-06-06_10-02-38,16
28,17F_before_2,6.0,87.0,2,1,102.0,0.196078,2024-06-06_10-32-17,17
75,18F_before_1,5.0,79.0,2,0,89.0,0.224719,2024-06-05_10-34-55,18
14,19D_before_2,6.0,25.0,1,0,30.0,0.666667,2024-06-06_08-28-26,19
49,20D_before_1,6.0,30.0,1,0,35.0,0.571429,2024-06-11_13-47-59,20
6,21D_after_1,5.0,152.0,1,2,177.0,0.112994,2024-06-05_10-56-25,21
56,22F_after_2,6.0,36.0,0,8,76.0,0.263158,2024-06-11_14-21-08,22


In [6]:
before_count_dys = (
    lowest_per_child["Child ID"].str.contains("before", case=False, na=False).sum()
)
before_count_dys

np.int64(11)

In [7]:
as_ran = pd.read_csv(RAN_DIR / "RAN_AdultSpring.csv")

In [8]:
# Check for patterns in lowest RAN score
object_per_sec_cols = [col for col in as_ran.columns if "objectPerSecond" in col]
as_ran["min_objectPerSecond"] = as_ran[object_per_sec_cols].idxmin(axis=1)
lowest_per_as = as_ran[as_ran["HASH"].isin(final_hash)]
summary = lowest_per_as.groupby("min_objectPerSecond").size()
summary

min_objectPerSecond
objectPerSecond 1     8
objectPerSecond 2    15
objectPerSecond 3     1
objectPerSecond 4     2
dtype: int64

In [9]:
before_count_as = (
    lowest_per_as["min_objectPerSecond"].str.contains("1|2", case=False, na=False).sum()
)
before_count_as

np.int64(23)

In [10]:
tot = len(lowest_per_child) + len(lowest_per_as)
sum_before = before_count_dys + before_count_as
percent_before = sum_before / tot * 100
print(
    f"Percentage of particpants with a better score before FN: {percent_before:.2f}% ({sum_before}/{tot})"
)

Percentage of particpants with a better score before FN: 79.07% (34/43)
