In [None]:
# Had issues installing pyreadstat so used magic command in notebook instead
# %pip install pyreadstat

In [1]:
import glob
import pyreadstat
import numpy as np
import pandas as pd
from datetime import datetime
from pathlib import Path

In [2]:
# Establish path to data folder
DATA_DIR = Path('/Volumes/WRKGRP/STD-FSW-BSI-SD-Movement_Tracking/dsp/data')

#todo make cross-platform safe, or just add your own absolute path to the data folder 

# Tracking data

## List of available tracking data

Determine available tracking data per date, school, and class by subfolder name.

⚠️ Currently only 2023 tracking data has been loaded since we do not have the survey responses to 2022 data.  

In [3]:
filename = 'tracking-data-all'
path = DATA_DIR / '02_interim' / f"{filename}.csv"

summary_filename = 'tracking-data-summary'
summary_path = DATA_DIR / '02_interim' / f"{summary_filename}.csv"

if path.exists() and summary_path.exists():
    print(f'Found {path}')
    tracking_data = pd.read_csv(path).astype(str)
    tracking_data['date'] = pd.to_datetime(tracking_data['date'])
    
    print(f'Found {summary_path}')
    tracking_summary = pd.read_csv(summary_path).astype(str)
    tracking_summary['date'] = pd.to_datetime(tracking_summary['date'])    
else:
    # Get 2023 school/class info from tracking data subfolder names
    folder = DATA_DIR / '01_tracking' / '2023'
    subfolders = [f.name for f in folder.iterdir() if f.is_dir()]
    
    # Initialize list to store all tracking data entries
    all_data = []
    
    # Process each subfolder
    for subfolder in subfolders:
        date, school_num = subfolder.split('_s')
        school_num, class_num = school_num.split('_c')
        
        # Get the full path to the subfolder
        subfolder_path = folder / subfolder
        
        # Find all CSV files in the subfolder
        csv_files = list(subfolder_path.glob('*.csv'))
        
        if csv_files:
            # Process each CSV file in the subfolder
            for csv_file in csv_files:
                file_name = csv_file.name
                
                # Extract tracklab_id from filename by splitting on '] ' and taking the part after
                if '] ' in file_name:
                    tracklab_id = file_name.split('] ', 1)[1]
                    # Remove .csv extension if present
                    tracklab_id = tracklab_id.replace('.csv', '')
                else:
                    # Handle case where the expected delimiter isn't found
                    tracklab_id = file_name.replace('.csv', '')
                
                # Add entry for this specific CSV file
                all_data.append([date, school_num, class_num, tracklab_id])
        else:
            # If no CSV files found, still add the folder info without tracklab_id
            all_data.append([date, school_num, class_num, None])
    
    # Create df with date, school, class, and tracklab_id
    tracking_data = pd.DataFrame(all_data, columns=['date', 'school', 'class', 'tracklab_id'])
    tracking_data = tracking_data.astype(str)
    tracking_data['date'] = pd.to_datetime(tracking_data['date'], format='%Y_%m_%d')
    
    # Create summary df of unique date, school, and class combinations
    tracking_summary= tracking_data[['date', 'school', 'class']].drop_duplicates().reset_index(drop=True)
    
    # Save dfs as csv
    summary_filename = 'tracking-data-summary'
    summary_path = DATA_DIR / '02_interim' / f"{summary_filename}.csv"
    summary_path.parent.mkdir(parents=True, exist_ok=True)
    tracking_summary.to_csv(summary_path, index=False)
    print("Summary data saved to", summary_path)
    
    path.parent.mkdir(parents=True, exist_ok=True)
    tracking_data.to_csv(path, index=False)
    print("All data saved to", path)
    
    display(tracking_data)

tracking_data.groupby(['date', 'school', 'class'])['tracklab_id'].nunique()

Found /Volumes/WRKGRP/STD-FSW-BSI-SD-Movement_Tracking/dsp/data/02_interim/tracking-data-all.csv
Found /Volumes/WRKGRP/STD-FSW-BSI-SD-Movement_Tracking/dsp/data/02_interim/tracking-data-summary.csv


date        school  class
2023-04-11  42      102      16
2023-05-11  43      103      30
2023-05-23  46      107      20
2023-05-24  47      108      15
2023-05-31  1       104      22
2023-06-08  45      105      22
2023-06-09  45      106      16
Name: tracklab_id, dtype: int64

## Load TrackLabID keyfiles

In [4]:
filename = 'keyfile_tracklab_id'
path = DATA_DIR / '02_interim' / f"{filename}.csv"

if path.exists():
    # Load formatted keyfile
    print(f'Found {path}')
    keyfile_tagID = pd.read_csv(path).astype(str)
    display(keyfile_tagID)
else:
    # Load and format raw keyfile + save 
    path_raw = DATA_DIR / 'keyfiles' / 'Keyfile_csv.csv'
    keyfile_tagID = pd.read_csv(path_raw, delimiter=';')
    
    keyfile_tagID = keyfile_tagID.astype(str)
    keyfile_tagID = keyfile_tagID.rename(columns={'Tagnumber': 'tagnumber', 'TrackLabID': 'tracklab_id'})
    
    path.parent.mkdir(parents=True, exist_ok=True)
    keyfile_tagID.to_csv(path, index=False)
    print("Data saved to", path)

Found /Volumes/WRKGRP/STD-FSW-BSI-SD-Movement_Tracking/dsp/data/02_interim/keyfile_tracklab_id.csv


Unnamed: 0,tagnumber,tracklab_id,subject_id
0,1,0x24025F48A3E6,
1,2,0x24025F48A133,
2,3,0x24025F44F8D7,
3,4,0x24046130B076,
4,5,0x24046131F437,
...,...,...,...
65,131,,
66,132,,
67,133,,
68,134,,


# Keyfiles

In [14]:
folder = DATA_DIR / 'keyfiles'
# file_path = folder.glob('*.xlsx')

keyfiles = pd.DataFrame()

for file in folder.glob('*.xlsx'):
    try:
        df = pd.read_excel(file, engine='openpyxl')
        df['source'] = file.stem
        keyfiles = pd.concat([keyfiles, df], ignore_index=True)
        print(f'Loaded: {file.name}')
    except Exception as e:
        print(f'Could not read {file.name}: {e}')

Loaded: keyfile school 1 class 104.xlsx
Loaded: keyfile school 41 class 100.xlsx
Loaded: keyfile school 41 class 101.xlsx
Loaded: keyfile school 42 class 102.xlsx
Loaded: keyfile school 43 class 103.xlsx
Loaded: keyfile school 45 class 105.xlsx
Could not read ~$keyfile school 43 class 103.xlsx: File is not a zip file
Loaded: keyfile school 46 class 107.xlsx
Loaded: keyfile school 45 class 106.xlsx
Loaded: keyfile school 47 class 108.xlsx


In [15]:
# Convert all entries to string
keyfiles = keyfiles.apply(lambda x: x.apply(lambda y: str(int(y)) if pd.notna(y) and isinstance(y, (float, int)) else y))

Keyfiles **do not** share the same structure. Columns containing tag numbers are called tagnummer, tagnummer , tagnr, etc. Teachers are not entered according to an ID number but mostly denoted as 'leerkracht'. 

I'm merging the columns containing tag numbers into one column and assigning school and class numbers to techers, incl. ID number '9999'. 

In [16]:
# Create a new column 'tagnumber' that combines all the tag number columns
keyfiles['tagnumber'] = keyfiles['tagnummer'].copy()

# Inspect column names
print(keyfiles.columns)

Index(['school ID', 'klas ID', 'subject ID', 'id', 'voornaam', 'achternaam',
       'consent', 'tagnummer ', 'sID_survey', 'ID_survey', 'source', 'tagnr.',
       'comment', 'tagnr', 'trackingnnumer', 'Unnamed: 11', 'tagnummer',
       'tagnumber'],
      dtype='object')


In [17]:
# List of tag number columns
tag_columns = ['tagnummer', 'tagnummer ', 'tagnr.', 'tagnr', 'trackingnnumer']

# Fill NaN values in 'tagnumber' with values from other tag columns
for col in tag_columns:
    if col != 'tagnummer':  # Skip the first column as we already copied it
        keyfiles['tagnumber'] = keyfiles['tagnumber'].fillna(keyfiles[col])

keyfiles = keyfiles.drop(columns=tag_columns)

In [18]:
keyfiles.columns

Index(['school ID', 'klas ID', 'subject ID', 'id', 'voornaam', 'achternaam',
       'consent', 'sID_survey', 'ID_survey', 'source', 'comment',
       'Unnamed: 11', 'tagnumber'],
      dtype='object')

In [23]:
# Get the list of all columns in the DataFrame
all_columns = keyfiles.columns.tolist()

# Find the indices of 'source' and 'tagnumber' columns
source_index = all_columns.index('source')
tagnumber_index = all_columns.index('tagnumber')

columns_between = all_columns[source_index+1:tagnumber_index]

# Stack their entries in one series
fill_values = keyfiles[columns_between].stack().groupby(level=0).first()

# Unify them in new column 'comment', then remove individual columns
keyfiles['comments'] = np.nan
keyfiles['comments'] = keyfiles['comments'].fillna(fill_values)
keyfiles = keyfiles.drop(columns = columns_between)

### Clean tag numbers
1. Remove rows where tag numbers are empty or not a number

In [326]:
# Inspect values in tag number column
# keyfiles['tagnumber'].unique()

array(['9', '19', '14', '29', '28', '31', '33', '22', '5', '20', '2',
       '30', '32', '18', '11', '1', '3', '34', '10', nan, '27', '25', '-',
       '24', '15', '7', '17', '6', '21', '13', '26', '35',
       'niet aanwezig, uit vragenlijst gehaald', 'niet aanwezig', '12',
       'x '], dtype=object)

In [327]:
# # Keep only rows containing digits in tag number column
# keyfiles = keyfiles[
#     keyfiles['tagnumber'].notna() & 
#     keyfiles['tagnumber'].astype(str).str.isdigit()
# ]
# 
# # Inspect values in tag number again -> OK
# keyfiles['tagnumber'].unique()

In [25]:
# Rename columns for alignment
keyfiles = keyfiles.rename(columns={'school ID': 'school', 'klas ID': 'class', 'id': 'person_id', 'subject ID': 'subject_id'})

# Add dates to keyfile by mapping
keyfiles['date'] = np.nan
date_map = tracking_data.set_index(['school', 'class'])['date'].to_dict()

# # Update 'date' in keyfiles where keys match
keyfiles['date'] = keyfiles.apply(
    lambda row: date_map.get((row['school'], row['class']), row['date']),
    axis=1
)

In [30]:
# Rename columns for alignment (fixed typo in variable name)
# keyfiles = keyfiles.rename(columns={'tagnummer': 'tagnumber'})
# keyfile_tagID = keyfile_tagID.rename(columns={'Tagnumber': 'tagnumber', 'TrackLabID': 'tracklabID'}).astype(str)

# Create mapping dictionary - convert keys to same type as keyfiles['tagnumber']
tracklab_id_map = keyfile_tagID.dropna(subset=['tagnumber']).set_index('tagnumber')['tracklab_id'].to_dict()

# Map values more efficiently using map() instead of apply
keyfiles['tracklab_id'] = keyfiles['tagnumber'].map(tracklab_id_map)

In [31]:
# keyfiles = keyfiles.rename(columns={'subject ID': 'subject_id'})

# Inspect consent entries
print('Entries in CONSENT:')
print(keyfiles['consent'].unique())

# Get position of 'source' column
source_position = list(keyfiles.columns).index('source')

# Identify columns before 'source'
cols_before_source = keyfiles.columns[:source_position]

# Check if any of these columns contain 'leerkracht'
has_leerkracht = keyfiles[cols_before_source].apply(
    lambda col: col.astype(str).str.contains('leerkracht', case=False, na=False)
).any(axis=1)

# Assign '9999' to 'subject_ID' where 'leerkracht' was found
keyfiles.loc[has_leerkracht, 'subject_id'] = '9999'

# Replace positive non-digit consent entries with '1'
keyfiles['consent'] = keyfiles['consent'].replace(
    {'ja': '1', 'leerkracht': '1'}
)

# Inspect consent entries
print('Entries in CONSENT:')
print(keyfiles['consent'].unique())

Entries in CONSENT:
['1' '4' '5' 'ja, geen tracking' nan 'Ja, geen tracking' '35'
 'received oral permission from parent on day of data collection']
Entries in CONSENT:
['1' '4' '5' 'ja, geen tracking' nan 'Ja, geen tracking' '35'
 'received oral permission from parent on day of data collection']


In [32]:
#todo find out why school 44 has been (inconsistently) renamed school 1, and how we should call it
keyfiles['school'] = keyfiles['school'].astype(str)
# keyfiles.loc[keyfiles['school']=='44', 'school'] = '1'

# Teacher responses

Teacher survey responses look like they were collected through an online form. Delivered raw as wide-format SPSS files. 

# IOP scores

Confirmed with Nathalie that IOP responses were optional.
If IOP response was given, variable name contains 'Q68' and student number. It's then followed by Q70 and Q71 with matching student number.

School and class data has been added to each dataframe from source filename. This data is surely already present in the questionnaire, but I cannot decipher under which variable it's present. Hence, a workaround.

In [37]:
# Determine path to raw SPSS files
folder = DATA_DIR / '01_survey' / 'teacher_raw_2023'

# Initiate empty dict to store teacher questionnaire dfs
tq_all = {}

for file in folder.glob('*.sav'):
    var_name = file.stem
    df = pd.read_spss(file)
    
    # Add school/class as columns to each df from filename
    school_num, class_num = var_name.split('_')[1:]
    df['school'] = school_num
    df['class'] = class_num
    
    # Store df in dict with filename as key
    tq_all[var_name] = df
    print(f'Loaded dataframe: {var_name}')
    
print(f'Total dataframes: {len(tq_all)}')

Loaded dataframe: tq_49_113
Loaded dataframe: tq_1_104
Loaded dataframe: tq_41_100
Loaded dataframe: tq_41_101
Loaded dataframe: tq_42_102
Loaded dataframe: tq_43_103
Loaded dataframe: tq_45_105
Loaded dataframe: tq_45_106
Loaded dataframe: tq_46_107
Loaded dataframe: tq_47_108
Loaded dataframe: tq_49_110
Loaded dataframe: tq_49_111
Loaded dataframe: tq_49_112
Total dataframes: 13


In [172]:
# Uncomment to inspect example of available columns
# print(tq_all['tq_1_104'].columns.tolist())

In [38]:
# Initiate empty dict to store relevant tq only
tq_relevant = {}

for df in tq_all:
    school_num = str(tq_all[df]['school'].iloc[0])
    class_num = str(tq_all[df]['class'].iloc[0])
    
    tq_match = tracking_data[
        (tracking_data['school'] == school_num) &
        (tracking_data['class'] == class_num)
    ]
    
    if not tq_match.empty:
        tq_relevant[df] = tq_all[df]
        print(f"Matching dataframe: {df}")

#todo concatenate AFTER the columns have been equalized
# tq = pd.concat(tq_relevant, ignore_index=True)

print(f"Total matching: {len(tq_relevant)}")

Matching dataframe: tq_1_104
Matching dataframe: tq_42_102
Matching dataframe: tq_43_103
Matching dataframe: tq_45_105
Matching dataframe: tq_45_106
Matching dataframe: tq_46_107
Matching dataframe: tq_47_108
Total matching: 7


Based on reading the codebook and inspecting the answers in the raw files, I've determined the following:
* Q30 = school
* Q31 = class
* Q32 = ?
* Q27 = T_gender
* Q28 = T_age
* Q29 = T_dutch
* Q30.0 = T_exp1
* Q31.0 = T_exp2
* Q32.0 = T_time_teaching
* Q33 = T_class_comp

In [39]:
# Uncomment to inspect dataset
# tq_relevant['tq_1_104']

Q68: "In vergelijking met andere leerlingen bezoek ik [naam kind]"

Q68 responses: Minder vaak, Gemiddeld, Vaker

IOP response Q68 is given per student. Variable name format is 'Q68_N', where N should match an entry in keyfiles['subject ID']. By matching the subject ID, Q68 can then be matched to the 4-digit 'ID' in the file containing student survey responses (once these have been fixed). For an initial analysis, the matching to the 'subject ID' and thus to tracking tag numbers should be enough. 

In [40]:
# Eliminating irrelevant columns in tq dataframes

# Lists of relevant questions
descriptives = ['Q27', 'Q28', 'Q29', 'Q30', 'Q31', 'Q32', 'Q33']
iop_id = ['Q68']  # Add 'Q70', 'Q71' for detailed IOP responses

tq_filtered = {}

for key, df in tq_relevant.items():
    # Create a mask for columns to keep
    cols_to_keep = []
    
    for col in df.columns:
        # Check if column matches any descriptive column
        if any(q_id in col for q_id in descriptives):
            cols_to_keep.append(col)
        # Check if column contains any of the specified question IDs
        elif any(q_id in col for q_id in iop_id):
            cols_to_keep.append(col)
    
    # Create a new dataframe with only the columns to keep
    tq_filtered[key] = df[cols_to_keep]

Load IOP Q68 values into keyfiles df

In [41]:
# Create a new column 'iop' in keyfiles if it doesn't exist
if 'iop' not in keyfiles.columns:
    keyfiles['iop'] = None  # Initialize with None values

# Iterate through each dataframe in the dictionary
for df_name, df in tq_filtered.items():
    # Make a copy of the dataframe to ensure we're working with a clean copy
    df_copy = df.copy()
    
    # Convert columns Q30 and Q31 to string type using .loc to avoid the warning
    df_copy.loc[:, 'Q30'] = df_copy['Q30'].astype(str)
    df_copy.loc[:, 'Q31'] = df_copy['Q31'].astype(str)
    
    # Identify the Q68_N columns (those that start with 'Q68_')
    q68_cols = [col for col in df_copy.columns if col.startswith('Q68_')]
    
    # Iterate through each row in the dataframe
    for idx, row in df_copy.iterrows():
        school = row['Q30']
        class_val = row['Q31']
        
        # Check each Q68_N column for matching subjects
        for q68_col in q68_cols:
            # Extract just the number part from Q68_N column name
            subject_id = q68_col.split('_')[1]  # Extract the N from Q68_N
            
            # Get the value from this Q68_N cell
            q68_value = row[q68_col]
            
            # Only proceed if the cell has a valid value
            if pd.notna(q68_value) and str(q68_value) != "0" and str(q68_value) != "":
                # Find matching rows in keyfiles where all three conditions are met
                matching_rows = keyfiles[(keyfiles['school'] == school) & 
                                        (keyfiles['class'] == class_val) & 
                                        (keyfiles['sID_survey'] == subject_id)]
                
                # If matches found, update the 'iop' column with the actual value from Q68_N
                if not matching_rows.empty:
                    keyfiles.loc[matching_rows.index, 'iop'] = q68_value

# TEMP: Export merged file

File includes connection school & class -> subject_id -> tagnumber -> tracklab_id + iop

In [42]:
temp = tracking_data.copy().astype(str)
temp.loc[temp['school']=='1', 'school'] = '44'

In [43]:
temp[temp['school']=='44']

Unnamed: 0,date,school,class,tracklab_id
101,2023-05-31,44,104,0x24025F44B7A5
102,2023-05-31,44,104,0x24025F44DBFA
103,2023-05-31,44,104,0x24025F44E6FB
104,2023-05-31,44,104,0x24025F44ECCF
105,2023-05-31,44,104,0x24025F44F682
106,2023-05-31,44,104,0x24025F465724
107,2023-05-31,44,104,0x240461308FB5
108,2023-05-31,44,104,0x24046130B6FA
109,2023-05-31,44,104,0x24046130B9B6
110,2023-05-31,44,104,0x24046130BA41


In [44]:
# First, let's find the missing tracklab_ids for each school-class combination
missing_entries = []

# Get unique school-class combinations from keyfiles
school_class_combinations = keyfiles[['school', 'class']].drop_duplicates()

# For each school-class combination
for _, row in school_class_combinations.iterrows():
    school = row['school']
    class_val = row['class']
    
    # Get all tracklab_ids for this school-class in tracking_data
    tracking_ids = temp[(temp['school'] == school) & 
                        (temp['class'] == class_val)]['tracklab_id'].unique()
    
    # Get all tracklab_ids for this school-class already in keyfiles
    keyfiles_ids = keyfiles[(keyfiles['school'] == school) & 
                           (keyfiles['class'] == class_val)]['tracklab_id'].unique()
    
    # Find tracklab_ids in tracking_data but not in keyfiles
    missing_ids = set(tracking_ids) - set(keyfiles_ids)
    
    # Create new rows for each missing tracklab_id
    for missing_id in missing_ids:
        # Create a new row with school, class, and tracklab_id
        new_row = {
            'school': school,
            'class': class_val,
            'tracklab_id': missing_id
        }
        missing_entries.append(new_row)

# Create DataFrame from the missing entries
if missing_entries:
    missing_df = pd.DataFrame(missing_entries)
    
    # Append the missing entries to keyfiles
    keyfiles = pd.concat([keyfiles, missing_df], ignore_index=True)
    
    print(f"Added {len(missing_entries)} new rows to keyfiles for missing tracklab_ids")
else:
    print("No missing tracklab_ids found")

# Display the updated keyfiles DataFrame
keyfiles.sort_values(by=['school', 'class', 'tracklab_id'])[['school', 'class', 'tracklab_id']]

No missing tracklab_ids found


Unnamed: 0,school,class,tracklab_id
22,41,100,0x24025F449A89
21,41,100,0x24025F44AD21
28,41,100,0x24025F44DBFA
33,41,100,0x24025F44E6FB
35,41,100,0x24025F44ECCF
...,...,...,...
141,,,
163,,,
185,,,
206,,,


In [45]:
# temp = tracking_data.copy().astype(str)
# temp.loc[temp['school']=='1'] = '44'

export = keyfiles.loc[keyfiles['school'].isin(temp['school'].unique())]

filename = 'merged-data-WIP'
today = pd.to_datetime('today').strftime('%Y-%m-%d_%H-%M')
savepath = DATA_DIR / '02_interim' / f"{filename}_{today}.xlsx"
export.to_excel(savepath, index=False, engine='openpyxl')

# Student responses

In [332]:
filename = 'TotalData_T1_all_cbs_ethnicity_gender'
path = DATA_DIR / '01_survey' / f"{filename}.xlsx"

students_raw = pd.read_excel(path)

students = students_raw.copy()

# Rename columns to match keyfiles
students = students.rename(columns={'School_ID': 'school', 'Class_ID': 'class', 'sID': 'subject_id', 'ID': 'person_id'}).astype(str)

Turn all columns before 'age' to string.

This is totally arbitrary typecasting; I just need select columns in this range to be string. 

In [333]:
# # Get position of 'age' column
# age_position = list(students.columns).index('age')
# 
# # Identify columns before 'age'
# cols_before_age = students.columns[:age_position]
# 
# # Typecast to string
# students[cols_before_age] = students[cols_before_age].astype('str')

In [334]:
# # Create a set of valid (school, class) pairs from tracking_data
# valid_pairs = set(zip(tracking_data['school'], tracking_data['class']))
# 
# # Filter students DataFrame
# students = students[
#     students.apply(lambda row: (row['school'], row['class']) in valid_pairs, axis=1)
# ]

In [335]:
print(students_raw.shape)
print(students.shape)

(310, 534)
(310, 534)


In [336]:
# Create list of SPARTS score columns
st_rel = [col for col in students_raw.columns if col.startswith('st_rel')]

# Get index of 'gender_sr'
gender_sr_idx = students.columns.get_loc('gender_sr')

# Get all columns up to and including 'gender_sr'
base_cols = students.columns[:gender_sr_idx+1]

# Combine with st_rel columns
students = students[list(base_cols) + st_rel]

In [337]:
# Discovered duplicate subject_id in raw student responses because my merge would not work, removing it now
duplicate = students.loc[
    (students.duplicated(['school', 'class', 'subject_id'], keep=False)) &
    (students['consent'].astype(str)!='1')
]

duplicate

Unnamed: 0,time,school,class,cohort,tracking,condition_seating,condition_game,nPupils,nAbsent,dataPresent,...,st_rel4,st_rel5,st_rel6,st_rel7,st_rel8,st_rel9,st_rel10,st_rel11,st_rel13,st_rel12
114,1,41,100,2223,0,0,0,25,,0,...,,,,,,,,,,
128,1,41,100,2223,0,0,0,25,,0,...,,,,,,,,,,
238,1,45,105,2223,0,0,0,22,,0,...,,,,,,,,,,


In [338]:
students = students.drop(duplicate.index)

In [202]:
students_short = students[['school','class','tracking','nPupils','dataPresent','person_id','consent','subject_id','informed','assent']]

## Align student responses with keyfiles

According to Nathalie, the order of the subject IDs ('subjectID') in the keyfiles are correct and can be matched to the student responses in the column 'sID'. The 4-digit person ID numbers ('id') were noted down incorrectly in the keyfiles. They can be copied from the student responses, according to 'sID'. -- MC, 01-05-2025   

In [308]:
# First, ensure the columns are standardized for proper matching
for col in ['school', 'class', 'subject_id']:
    # Convert to string, strip whitespace, and make lowercase to ensure matching
    keyfiles[col] = keyfiles[col].astype(str).str.strip().str.lower()
    students[col] = students[col].astype(str).str.strip().str.lower()

# Create a composite key from the matching columns
keyfiles['match_key'] = keyfiles['school'] + '|' + keyfiles['class'] + '|' + keyfiles['subject_id']
students['match_key'] = students['school'] + '|' + students['class'] + '|' + students['subject_id']

# Create a dictionary mapping from match_key to person_id
person_id_map = students.set_index('match_key')['person_id'].to_dict()

# Apply the mapping to keyfiles based on match_key
keyfiles['person_id'] = keyfiles['match_key'].map(person_id_map)

# Remove the temporary match_key column
# keyfiles.drop('match_key', axis=1, inplace=True)
# students.drop('match_key', axis=1, inplace=True)

In [None]:
key

In [257]:
# Change school 44 to 1 to match student response file
keyfiles.loc[keyfiles['school']=='44', 'school'] = '1'

In [290]:
# Prep for merge: typecast cols as str just in case
# # Convert relevant columns to string in both DataFrames
for col in ['school', 'class', 'subject_id', 'person_id']:
    keyfiles.loc[:, col] = keyfiles[col].astype(str).str.strip()
    students.loc[:, col] = students[col].astype(str).str.strip()
    
# merge_cols = ['school', 'class', 'subject_id', 'person_id']
# keyfiles[merge_cols] = keyfiles[merge_cols].astype('str')
# students[merge_cols] = students[merge_cols].astype('str')

In [291]:

# Merge keyfiles and student response file
matched = keyfiles.merge(
    students[['school', 'class', 'subject_id', 'person_id']],
    on=['school', 'class', 'subject_id'],
    how='left',
    suffixes=('', '_new')
)

# Overwrite keyfiles['person_id'] with matches and assign NaN if no match
# keyfiles['person_id'] = matched['person_id_new']

In [310]:
temp = keyfiles[['school','class','subject_id','person_id', 'consent']]
temp
# temp[temp['person_id_new'].isna()]
# temp.loc[temp['person_id']!=temp['person_id_new']]

Unnamed: 0,school,class,subject_id,person_id,consent
0,47,108,1,2285,1
1,47,108,2,2286,1
2,47,108,3,2287,1
3,47,108,4,2288,1
4,47,108,5,2289,1
...,...,...,...,...,...
203,46,107,16,2278,1
204,46,107,17,2279,1
205,46,107,18,2280,1
206,46,107,19,2281,4


In [311]:
# Debug information to check matching problems
def debug_matching():
    """Function to help debug matching issues"""
    # Check where matches should happen but don't
    keyfiles_sample = keyfiles.head(10)
    print("Sample keyfiles data:")
    print(keyfiles_sample[['school', 'class', 'subject_id', 'person_id']])
    
    # For each row in sample, check if it should have matched
    for idx, row in keyfiles_sample.iterrows():
        match_key = f"{row['school']}|{row['class']}|{row['subject_id']}"
        matches = students_short[
            (students_short['school'] == row['school']) & 
            (students_short['class'] == row['class']) & 
            (students_short['subject_id'] == row['subject_id'])
        ]
        print(f"\nChecking row {idx}, match_key: {match_key}")
        print(f"Found {len(matches)} matching rows in students_short")
        if len(matches) > 0:
            print("Matching student data:")
            print(matches[['school', 'class', 'subject_id', 'person_id']])

# Uncomment to run the debug function
debug_matching()

Sample keyfiles data:
  school class subject_id person_id
0     47   108          1      2285
1     47   108          2      2286
2     47   108          3      2287
3     47   108          4      2288
4     47   108          5      2289
5     47   108          6      2290
6     47   108          7      2291
7     47   108          8      2292
8     47   108          9      2293
9     47   108         10      2294

Checking row 0, match_key: 47|108|1
Found 1 matching rows in students_short
Matching student data:
    school class subject_id person_id
291     47   108          1      2285

Checking row 1, match_key: 47|108|2
Found 1 matching rows in students_short
Matching student data:
    school class subject_id person_id
292     47   108          2      2286

Checking row 2, match_key: 47|108|3
Found 1 matching rows in students_short
Matching student data:
    school class subject_id person_id
293     47   108          3      2287

Checking row 3, match_key: 47|108|4
Found 1 matching 

In [320]:

# First reset the person_id column in keyfiles (if it already exists)
if 'person_id' in keyfiles.columns:
    keyfiles['person_id'] = np.nan

# Perform the merge
merged = keyfiles.merge(
    students[['school', 'class', 'subject_id', 'person_id']],
    on=['school', 'class', 'subject_id'],
    how='left',
    indicator=True
)

# Only use person_id values from the matches
keyfiles['person_id'] = np.where(
    merged['_merge'] == 'both',
    merged['person_id'],
    np.nan
)


KeyError: 'person_id'

In [316]:
merged

Unnamed: 0,school,class,subject_id,person_id_x,voornaam,achternaam,consent,source,tagnumber,comment,date,tracklab_id,match_key,person_id_y,_merge
0,47,108,1,,,,1,keyfile school 47 class 108,9,,2023-05-24,0x24046130C8AB,47|108|1,2285,both
1,47,108,2,,,,1,keyfile school 47 class 108,19,,2023-05-24,0x24025F44AD21,47|108|2,2286,both
2,47,108,3,,,,1,keyfile school 47 class 108,14,,2023-05-24,0x24025F449A89,47|108|3,2287,both
3,47,108,4,,,,1,keyfile school 47 class 108,29,,2023-05-24,0x24025F44CDE1,47|108|4,2288,both
4,47,108,5,,,,1,keyfile school 47 class 108,28,,2023-05-24,0x24025F4603F8,47|108|5,2289,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,46,107,16,,,,1,keyfile school 46 class 107,25,,2023-05-23,0x240461308FB5,46|107|16,2278,both
206,46,107,17,,,,1,keyfile school 46 class 107,17,,2023-05-23,0x24046131F062,46|107|17,2279,both
207,46,107,18,,,,1,keyfile school 46 class 107,5,,2023-05-23,0x24046131F437,46|107|18,2280,both
208,46,107,19,,,,4,keyfile school 46 class 107,,,2023-05-23,,46|107|19,2281,both


In [115]:
keyfiles[['school', 'class','subject_id', 'person_id']]

Unnamed: 0,school,class,subject_id,person_id
0,47,108,1,2285
1,47,108,2,2286
2,47,108,3,2287
3,47,108,4,2288
4,47,108,5,2289
...,...,...,...,...
203,46,107,16,2277
204,46,107,17,2278
205,46,107,18,2279
206,46,107,19,2280


## Last attempt at aligning before switching to manual

In [340]:
# Create deep copies of the dataframes to avoid modifying the originals
keyfiles_copy = keyfiles.copy()
students_copy = students.copy()

# Function to thoroughly standardize data for matching
def standardize_column(df, column):
    """Apply comprehensive cleaning to ensure matching works"""
    if column in df.columns:
        # Handle different data types appropriately
        if df[column].dtype.kind in 'ifc':  # numeric columns
            # Convert numeric to string without scientific notation
            df[column] = df[column].apply(lambda x: str(int(x)) if pd.notnull(x) and float(x).is_integer() else 
                                          (str(x) if pd.notnull(x) else np.nan))
        else:
            # For string/object columns
            df[column] = df[column].astype(str)
        
        # Apply thorough cleaning
        df[column] = df[column].str.strip()
        df[column] = df[column].str.lower()
        # Remove any non-visible characters
        df[column] = df[column].str.replace(r'\s+', ' ', regex=True)
        # Normalize unicode characters
        import unicodedata
        df[column] = df[column].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ASCII', 'ignore').decode() if isinstance(x, str) else x)
    return df

# Apply standardization to both dataframes
for col in ['school', 'class', 'subject_id']:
    # keyfiles_copy = standardize_column(keyfiles_copy, col)
    students_copy = standardize_column(students_copy, col)

In [None]:
# Alternative explicit approach using direct comparison and assignment
def direct_matching_approach(keyfiles_df, students_df):
    """Direct matching approach that ensures complete control over the matching process"""
    # Make sure person_id column exists in keyfiles
    if 'person_id' not in keyfiles_df.columns:
        keyfiles_df['person_id'] = np.nan
    
    # Iterate through each row in keyfiles
    for idx, keyfile_row in keyfiles_df.iterrows():
        # Find matching rows in students_df
        matches = students_df[
            (students_df['school'] == keyfile_row['school']) & 
            (students_df['class'] == keyfile_row['class']) & 
            (students_df['subject_id'] == keyfile_row['subject_id'])
        ]
        
        # If there's a match, use the person_id
        if len(matches) > 0:
            keyfiles_df.at[idx, 'person_id'] = matches.iloc[0]['person_id']
    
    return keyfiles_df

# If the previous methods still don't work, try this direct approach
direct_result = direct_matching_approach(keyfiles_copy, students_copy)
# keyfiles['person_id'] = direct_result['person_id']

In [342]:
temp = direct_result[['school','class','subject_id','person_id', 'consent']]
temp

Unnamed: 0,school,class,subject_id,person_id,consent
0,47,108,1,2285,1
1,47,108,2,2286,1
2,47,108,3,2287,1
3,47,108,4,2288,1
4,47,108,5,2289,1
...,...,...,...,...,...
203,46,107,16,2278,1
204,46,107,17,2279,1
205,46,107,18,2280,1
206,46,107,19,2281,4


## SPARTS scores

Source:  https://doi.org/10.1111/bjep.12094

Relevant variables named 'SPARTSN' (e.g. 'SPARTS1') in the codebook, but this name is not present in the data. Instead, variables named **'st_relN'** have been identified as SPARTS scores. As explained in the codebook, the questionnaire contained 13 items, but Q13 was not presented to all students. After filtering the dataset for relevant data only (i.e., responses of students whose tracking data we have available), only responses 1-12 were available anyway.  

Q12 is not part of the original scale, but developed for this study.

I cannot find a score sheet for this test that is not behind a paywall. The COTAN entry for the SPARTS lists a 25-item test instead of the 13-item test used. 

#todo ask Yvonne & Nathalie for scoring sheet 