In [3]:
import glob
import pyreadstat
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
from pathlib import Path

In [4]:
# Establish path to data in RU workgroup folder
DATA_DIR = Path('/Volumes/WRKGRP/STD-FSW-BSI-SD-Movement_Tracking/dsp/data')

# Tracking data

❗️ Tracking data is **not processed here**. The files are only loaded to collect TrackLab IDs, school and class numbers, and dates. This information is added to the merged file export (see #Exports).

## Determine available data

Only 2023 tracking data has been loaded since we **do not have the survey responses to 2022 data**.

In [6]:
filename = 'tracking-data-all'
path = DATA_DIR / '02_interim' / f"{filename}.csv"

summary_filename = 'tracking-data-summary'
summary_path = DATA_DIR / '02_interim' / f"{summary_filename}.csv"

# If files already exist, load them
if path.exists() and summary_path.exists():
    print(f'Loaded file: {filename}.csv')
    tracking_data = pd.read_csv(path).astype(str)
    tracking_data['date'] = pd.to_datetime(tracking_data['date'])
    print(f'Loaded file: {summary_filename}.csv')
    tracking_summary = pd.read_csv(summary_path).astype(str)
    tracking_summary['date'] = pd.to_datetime(tracking_summary['date'])    
# If not, create them from raw files
else:
    # Get 2023 school/class info from tracking data subfolder names
    folder = DATA_DIR / '01_tracking' / '2023'
    subfolders = [f.name for f in folder.iterdir() if f.is_dir()]
    
    # Initialize list to store all tracking data entries
    all_data = []
    
    # Work per subfolder
    for subfolder in subfolders:
        date, school_num = subfolder.split('_s')
        school_num, class_num = school_num.split('_c')
        
        # Get full path to subfolder
        subfolder_path = folder / subfolder
        
        # Find CSV files in subfolder
        csv_files = list(subfolder_path.glob('*.csv'))
        
        if csv_files:
            for csv_file in csv_files:
                file_name = csv_file.name
                
                # Extract tracklab_id from filename by splitting on '] '
                if '] ' in file_name:
                    tracklab_id = file_name.split('] ', 1)[1]
                    # Remove .csv extension if present
                    tracklab_id = tracklab_id.replace('.csv', '')
                else:
                    # If no ']' delimiter, take filename
                    tracklab_id = file_name.replace('.csv', '')
                
                # Append tracklab_id to all_data
                all_data.append([date, school_num, class_num, tracklab_id])
        else:
            # If no CSV files found, still add the folder info without tracklab_id
            all_data.append([date, school_num, class_num, None])
    
    # Create df with date, school, class, and tracklab_id
    tracking_data = pd.DataFrame(all_data, columns=['date', 'school', 'class', 'tracklab_id'])
    tracking_data = tracking_data.astype(str)
    tracking_data['date'] = pd.to_datetime(tracking_data['date'], format='%Y_%m_%d')
    
    # Replace school 1 with 44 to align data; it's the same school but different across files
    tracking_data.loc[tracking_data['school']=='1', 'school'] = '44'
    
    # Create summary df of unique date, school, and class combinations
    tracking_summary= tracking_data[['date', 'school', 'class']].drop_duplicates().reset_index(drop=True)
    
    # Save dfs as csv
    summary_filename = 'tracking-data-summary'
    summary_path = DATA_DIR / '02_interim' / f"{summary_filename}.csv"
    summary_path.parent.mkdir(parents=True, exist_ok=True)
    tracking_summary.to_csv(summary_path, index=False)
    print("Summary data saved to", summary_path)
    
    path.parent.mkdir(parents=True, exist_ok=True)
    tracking_data.to_csv(path, index=False)
    print("All data saved to", path)

print("---\nAvailable tracking data:")
display(tracking_data.groupby(['date', 'school', 'class'])['tracklab_id'].nunique().reset_index(name='N tracklab_id'))

Loaded file: tracking-data-all.csv
Loaded file: tracking-data-summary.csv
---
Available tracking data:


Unnamed: 0,date,school,class,N tracklab_id
0,2023-04-11,42,102,16
1,2023-05-11,43,103,30
2,2023-05-23,46,107,20
3,2023-05-24,47,108,15
4,2023-05-31,44,104,22
5,2023-06-08,45,105,22
6,2023-06-09,45,106,16


## Inspect TrackLab files

### Raw file load
Inspection of a single file because min-max figures looked off.

In [7]:
# Establish data folder and inspect subfolders
filepath = DATA_DIR / '01_tracking' / '2023'
subfolders = [p for p in filepath.iterdir() if p.is_dir()]
# print(subfolders)

# Choose one subfolder (tracking day) for random check
subfolder_path = filepath / '2023_04_11_s42_c102'
csv_files = list(subfolder_path.glob('*.csv'))
print(f"\nInspecting {subfolder_path}")
print(f"N of TrackLab files in subfolder: {len(csv_files)}")

# Load all csv files separately into one df, then concatenate
# Takes almost 30sec on my laptop...
tracklab = [pd.read_csv(file, sep=';') for file in csv_files]
print(f"N of TrackLab files loaded into dataframe: {len(tracklab)}")
tracklab = pd.concat(tracklab, ignore_index=True) 


Inspecting /Volumes/WRKGRP/STD-FSW-BSI-SD-Movement_Tracking/dsp/data/01_tracking/2023/2023_04_11_s42_c102
N of TrackLab files in subfolder: 16
N of TrackLab files loaded into dataframe: 16


In [8]:
# Search for example of crazy value like 940721624896047 in this df -> literal int did not exist, checking for decimal
tracklab[tracklab['X']>=9.4].sort_values(by='X').head()  # Think I found it (idx 102265)

Unnamed: 0,TimeStamp,X,Y,Z,TagId
102265,04/11/2023 13:07:05.618,9.407216,4.041735,0.54761,0x24025F465724
81548,04/11/2023 13:27:27.207,9.419891,3.696139,1.171148,0x24025F44F682
437817,04/11/2023 10:45:45.546,9.42225,3.40748,0.719096,0x24046131F1EE
347814,04/11/2023 13:40:57.632,9.433517,3.48701,0.56626,0x24046131EF0E
181256,04/11/2023 11:26:15.948,9.435173,1.555048,0.164448,0x24046130BB1E


In [13]:
# Check min-max XYZ values
pivot = pd.pivot_table(
    tracklab,
    index='TagId',
    values=['X', 'Y', 'Z'],
    aggfunc={'X': ['min', 'mean', 'max'], 'Y': ['min', 'max'], 'Z': ['min', 'max']}
)

# Rename cols for clarity
pivot.columns = [f"{col}_{func}" for col, func in pivot.columns]

print('Min-max XYZ coordinate values for 2023_04_11_s42_c102.csv:')
display(pivot)

Min-max XYZ coordinate values for 2023_04_11_s42_c102.csv:


Unnamed: 0_level_0,X_max,X_mean,X_min,Y_max,Y_min,Z_max,Z_min
TagId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0x24025F44CDE1,15.092955,1.342349,-24.435585,3.015607,-1.206362,1.230103,0.067725
0x24025F44E6FB,39.5526,4.093858,-6.155361,78.032991,-18.094372,1.396872,-0.042799
0x24025F44F682,97.628709,4.238005,-335.403276,37.529389,-103.28534,1.490345,-0.058508
0x24025F44F8D7,37.243773,4.102761,-34.900087,115.983518,-29.696081,1.30498,0.006909
0x24025F465724,12.482998,4.57982,0.453502,121.239072,-32.836124,1.118668,0.016974
0x24025F48A133,120.117363,3.935432,-163.657265,14343.316801,-45463.081799,1.457794,-0.091884
0x24025F48A3E6,403.406702,2.316815,-124.974161,7.814908,-3.401151,1.146962,-0.016349
0x24046130BA41,366.907745,2.687998,-349.554803,2351.393537,-7430.81482,1.518447,-0.120033
0x24046130BB1E,225.664067,2.817178,-222.462319,64.452108,-38.500365,1.295222,-0.042725
0x24046130C8AB,21.519536,3.338224,-44.904469,9.646207,-2.698372,1.11834,-0.001482


## Animate movement trajectories

**14-04-2025:** A mini .py attempt. Not successful because timestamps need to be in sequence, and they are currently not. Code left here for future attempts.

In [474]:
# import matplotlib.animation as animation
# from IPython.display import HTML
# 
# # Load the CSV file (adjust the path accordingly)
# # file_path = "your_filename.csv"  # Replace with the correct file path
# # df = pd.read_csv(file_path, delimiter=';')
# 
# # Filter data for a single TagId
# tag_id = tracklab['TagId'].unique()[0]
# tag_data = tracklab[tracklab['TagId'] == tag_id].copy()
# 
# # Convert TimeStamp to datetime and sort
# tag_data['TimeStamp'] = pd.to_datetime(tag_data['TimeStamp'], format="%m/%d/%Y %H:%M:%S.%f")
# tag_data.sort_values('TimeStamp', inplace=True)
# 
# # Set up the plot
# fig, ax = plt.subplots(figsize=(8, 6))
# ax.set_xlim(tag_data['X'].min() - 1, tag_data['X'].max() + 1)
# ax.set_ylim(tag_data['Y'].min() - 1, tag_data['Y'].max() + 1)
# ax.set_title(f'Movement Path Animation for Tag {tag_id}')
# ax.set_xlabel('X (meters)')
# ax.set_ylabel('Y (meters)')
# 
# # Line and moving point
# line, = ax.plot([], [], lw=2)
# point, = ax.plot([], [], 'ro')
# 
# # Initialization function
# def init():
#     line.set_data([], [])
#     point.set_data([], [])
#     return line, point
# 
# # Animation function
# def animate(i):
#     x = tag_data['X'].iloc[:i]
#     y = tag_data['Y'].iloc[:i]
#     line.set_data(x, y)
#     if i > 0:
#         point.set_data(x.iloc[-1], y.iloc[-1])
#     return line, point
# 
# # Create the animation
# ani = animation.FuncAnimation(
#     fig, animate, init_func=init,
#     frames=len(tag_data), interval=20, blit=True
# )
# 
# # Display the animation inline
# HTML(ani.to_jshtml())

# Keyfiles

## TrackLabID keyfile

One CSV file connects TrackLabID numbers to tag numbers. Each tag was placed on a student, and connected to them using an ID number noted down in separate keyfiles (processed below).

In [35]:
filename = 'keyfile_tracklab_id'
path = DATA_DIR / '02_interim' / f"{filename}.csv"

if path.exists():
    # Load formatted keyfile
    print(f'Loaded file: {filename}.csv')
    keyfile_tagID = pd.read_csv(path).astype(str)
else:
    # Load and format raw keyfile + save 
    path_raw = DATA_DIR / 'keyfiles' / 'Keyfile_csv.csv'
    keyfile_tagID = pd.read_csv(path_raw, delimiter=';')
    
    keyfile_tagID = keyfile_tagID.astype(str)
    keyfile_tagID = keyfile_tagID.rename(columns={'Tagnumber': 'tag_id', 'TrackLabID': 'tracklab_id'})
    
    path.parent.mkdir(parents=True, exist_ok=True)
    keyfile_tagID.to_csv(path, index=False)
    print("Data saved to", path)
    
print("---\nDisplaying first five rows of keyfile_tagID:")
display(keyfile_tagID.head())

Loaded file: keyfile_tracklab_id.csv
---
Displaying first five rows of keyfile_tagID:


Unnamed: 0,tag_id,tracklab_id,SubjectID
0,1,0x24025F48A3E6,
1,2,0x24025F48A133,
2,3,0x24025F44F8D7,
3,4,0x24046130B076,
4,5,0x24046131F437,


## Classroom keyfiles

Keyfiles **do not** share the same structure. Columns containing tag numbers are called `tagnummer`, `tagnummer `, `tagnr`, etc. Teachers have not been added to the keyfiles according to an ID number; mostly they are denoted as 'leerkracht' in various columns. 

Columns `sID_survey` and `ID_survey` have been added **manually** from the student response master sheet `TotalData_T1_all_cbs_ethnicity_gender.xlsx` after determining that the 4-digit ID numbers do not match across the individual keyfiles and master sheet, and aligning the data through code proved more time-consuming than copy/pasting. Project owners provided instruction to paste the ID numbers from student response master sheet to keyfiles in the order they are presented.

Columns containing tag numbers have been merged into one column `tagnumber`.

Column `source` has been added to denote source file. 

Column `comment` has been added as a container for any observations made in the classroom.

Teacher have been assigned their school, class, and placeholder ID numbers (9999).  

In [36]:
def process_excel_columns(df):
    
    # Process first two columns
    for col_idx in [0, 1]:  # First and second columns (0-indexed)
        if len(df.columns) > col_idx:  # Check if column exists
            col_name = df.columns[col_idx]
            
            # Extract unique digits from column
            unique_digits = set()
            for val in df.iloc[:, col_idx]:
                try:
                    # Convert to int
                    num = int(float(val))
                    unique_digits.add(num)
                except (ValueError, TypeError):
                    # Skip non-numeric values
                    continue
            
            # If exactly one unique digit found, fill entire column
            if len(unique_digits) == 1:
                digit = unique_digits.pop()
                df.iloc[:, col_idx] = digit
                print(f"Column {col_idx+1} ('{col_name}') filled with value: {digit}")
            else:
                print(f"Column {col_idx+1} ('{col_name}') has {len(unique_digits)} unique digits - no action taken")
    
    return df


### LOAD KEYFILES
folder = DATA_DIR / 'keyfiles'

# Initiate empty df
keyfiles = pd.DataFrame()

print('Loading keyfiles...\n-----')

for file in folder.glob('*.xlsx'):
    try:
        df = pd.read_excel(file, engine='openpyxl')
        df['source'] = file.stem  # Add filename as source in df
        print(f'Loading: {file.name}')
        df = process_excel_columns(df)
        keyfiles = pd.concat([keyfiles, df], ignore_index=True)
        print('-----')
    except:
        # Hidden Excel temp files can mess up the loop
        continue

# Convert all entries to string
keyfiles = keyfiles.apply(lambda x: x.apply(lambda y: str(int(y)) if pd.notna(y) and isinstance(y, (float, int)) else y))


### RENAME COLS & ADD DATES
# Rename columns for alignment
keyfiles = keyfiles.rename(columns={'school ID': 'school', 'klas ID': 'class', 'id': 'person_id', 'subject ID': 'subject_id'})

# Add dates to keyfile by mapping
keyfiles['date'] = np.nan
date_map = tracking_summary.set_index(['school', 'class'])['date'].to_dict()

# # Update 'date' in keyfiles where keys match
keyfiles['date'] = keyfiles.apply(
    lambda row: date_map.get((row['school'], row['class']), row['date']),
    axis=1
)

# Uncomment the lines below to see output
print('Example keyfile entries:')
display(keyfiles.head())

Loading keyfiles...
-----
Loading: keyfile school 45 class 105.xlsx
Column 1 ('school ID') filled with value: 45
Column 2 ('klas ID') filled with value: 105
-----
Loading: keyfile school 1 class 104.xlsx
Column 1 ('school ID') filled with value: 44
Column 2 ('klas ID') filled with value: 104
-----
Loading: keyfile school 41 class 100.xlsx
Column 1 ('school ID') filled with value: 41
Column 2 ('klas ID') filled with value: 100
-----
Loading: keyfile school 41 class 101.xlsx
Column 1 ('school ID') filled with value: 41
Column 2 ('klas ID') filled with value: 101
-----
Loading: keyfile school 42 class 102.xlsx
Column 1 ('school ID') filled with value: 42
Column 2 ('klas ID') filled with value: 102
-----
Loading: keyfile school 43 class 103.xlsx
Column 1 ('school ID') filled with value: 43
Column 2 ('klas ID') filled with value: 103
-----
Loading: keyfile school 46 class 107.xlsx
Column 1 ('school ID') filled with value: 46
Column 2 ('klas ID') filled with value: 107
-----
Loading: keyfile

Unnamed: 0,school,class,subject_id,person_id,voornaam,achternaam,consent,trackingnnumer,comment,sID_survey,ID_survey,source,tagnummer,tagnr.,tagnr,Unnamed: 11,tagnummer.1,date
0,45,105,1,,,,1,22,,20,2221,keyfile school 45 class 105,,,,,,2023-06-08
1,45,105,2,,,,1,20,,21,2222,keyfile school 45 class 105,,,,,,2023-06-08
2,45,105,3,,,,1,11,,22,2223,keyfile school 45 class 105,,,,,,2023-06-08
3,45,105,4,,,,1,5,,23,2224,keyfile school 45 class 105,,,,,,2023-06-08
4,45,105,5,,,,1,2,,24,2225,keyfile school 45 class 105,,,,,,2023-06-08


### Drop irrelevant entries

If school and class were not found in tracking data, drop the corresponding entries from `keyfiles`.  

In [37]:
keyfiles = keyfiles.loc[(keyfiles['school'].isin(tracking_summary['school'])) & (keyfiles['class'].isin(tracking_summary['class']))]

### Comments

Comments left in multiple columns of the keyfiles by the observers are merged into one column `comment`.

In [38]:
# for col in keyfiles.columns[4:]:
#     print(f'Column {col}:\n{keyfiles[col].unique()}\n-----')

cols_keyfiles = keyfiles.columns.tolist() 
idx_source = cols_keyfiles.index('source')
comment_vals = keyfiles[cols_keyfiles[idx_source+1:-1]].stack().groupby(level=0).first()

# Fill keyfiles['comment'] with comment_vals where empty
for idx in keyfiles.index:
    if pd.isna(keyfiles.at[idx, 'comment']) or keyfiles.at[idx, 'comment'] == '':
        if idx in comment_vals.index and not pd.isna(comment_vals[idx]):
            keyfiles.at[idx, 'comment'] = comment_vals[idx]

keyfiles.loc[keyfiles['comment'].astype(str).str.isdigit(), 'comment'] = np.nan
keyfiles.loc[keyfiles['comment']=='-', 'comment'] = np.nan

# Drop old comment columns
keyfiles = keyfiles.drop(columns='Unnamed: 11')

print('Keyfile rows containing comments:')
display(keyfiles.loc[~keyfiles['comment'].isna(), ['school', 'class', 'subject_id', 'comment']])

Keyfile rows containing comments:


Unnamed: 0,school,class,subject_id,comment
9,45,105,10,it could be that not tag 12 but 30 was used fo...
112,43,103,4,"niet aanwezig, uit vragenlijst gehaald"
118,43,103,10,Vult vragenlijst niet in
134,43,103,26,niet aanwezig
143,46,107,1,Ja
162,46,107,20,"since student had no written consent, their na..."


### Tag numbers

In [39]:
# Create a new column 'tag_id' that combines all the tag number columns
keyfiles['tag_id'] = np.nan

# Inspect column names
# print(keyfiles.columns)

# Create list of tag number columns
tag_columns = ['tagnummer', 'tagnummer ', 'tagnr.', 'tagnr', 'trackingnnumer']

# Fill NaN values in 'tag_id' with values from other tag columns
for col in tag_columns:
    keyfiles['tag_id'] = keyfiles['tag_id'].fillna(keyfiles[col])

# Drop old tag number columns
keyfiles = keyfiles.drop(columns=tag_columns)

### REPLACE EMPTY TAGNUMBERS WITH 9999
keyfiles.loc[~keyfiles['tag_id'].astype(str).str.isdigit(), 'tag_id'] = '9999'

#### Teacher tags and subject ID

In [40]:
# Fill empty tag_id for teachers with '35' 
for idx, row in keyfiles.iterrows():
    found_in_row = False
    for col in keyfiles.columns:
        cell = row[col]
        if isinstance(cell, str) and 'leerkracht' in cell.lower():
            print(f"Row {idx}: found 'leerkracht' in column '{col}'")
            found_in_row = True
            break  # Stop checking that row
    
    if found_in_row:
        # Check if tag_id is empty or missing
        tag = row['tag_id']
        if pd.isna(tag) or tag == '' or tag is None or tag == '9999':
            keyfiles.at[idx, 'tag_id'] = '35'
            print("'tag_id' was missing or invalid — changed to '35'")
        else:
            print(f"'tag_id' exists: {tag}")
        
        # Always update consent to '1' if found
        keyfiles.at[idx, 'consent'] = '1'
        # Always update subject_id to '9999' if found
        keyfiles.at[idx, 'subject_id'] = '9999'
        
# Drop name columns - not useful anymore
keyfiles = keyfiles.drop(columns=['voornaam', 'achternaam'])

Row 23: found 'leerkracht' in column 'voornaam'
'tag_id' exists: 35
Row 142: found 'leerkracht' in column 'voornaam'
'tag_id' was missing or invalid — changed to '35'
Row 186: found 'leerkracht' in column 'achternaam'
'tag_id' exists: 35
Row 208: found 'leerkracht' in column 'voornaam'
'tag_id' exists: 27


### Consent

Column `consent` contains digits and commentary. Clear commentary such as 'ja' has been transformed into 1, denoting yes. Any entries *not* containing 1 in this column will not be taken into analysis. 

In [41]:
### REPLACE CONSENT ENTRIES WITH DIGITS -> 1 = YES
# Replace positive non-digit consent entries with '1'
keyfiles['consent'] = keyfiles['consent'].replace({'ja': '1'})

### REPLACE NON-CONSENT ENTRIES WITH '9999' -> ENTRIES TO EXCLUDE ARE THEN ['4','5','9']
keyfiles.loc[~keyfiles['consent'].isin(['1','4','5']), 'consent'] = '9999'

# Inspect consent entries
print('Entries in CONSENT:')
print(keyfiles['consent'].unique())

Entries in CONSENT:
['1' '4' '9999']


## Match TrackLab IDs

In [42]:
# Create mapping dictionary - convert keys to same type as keyfiles['tagnumber']
tracklab_id_map = keyfile_tagID.dropna(subset=['tag_id']).set_index('tag_id')['tracklab_id'].to_dict()

# Map values accordingly
keyfiles['tracklab_id'] = keyfiles['tag_id'].map(tracklab_id_map)

# Sort df
keyfiles.sort_values(by=['school', 'class', 'ID_survey'])

print('Example keyfile with tracklab ids included:')
display(keyfiles.head())

Example keyfile with tracklab ids included:


Unnamed: 0,school,class,subject_id,person_id,consent,comment,sID_survey,ID_survey,source,date,tag_id,tracklab_id
0,45,105,1,,1,,20,2221,keyfile school 45 class 105,2023-06-08,22,0x24046130B6FA
1,45,105,2,,1,,21,2222,keyfile school 45 class 105,2023-06-08,20,0x24046130CCF9
2,45,105,3,,1,,22,2223,keyfile school 45 class 105,2023-06-08,11,0x24025F44F682
3,45,105,4,,1,,23,2224,keyfile school 45 class 105,2023-06-08,5,0x24046131F437
4,45,105,5,,1,,24,2225,keyfile school 45 class 105,2023-06-08,2,0x24025F48A133


# Survey scores
## Teacher responses
### Retired code

❗️**The code blocks under this header were created in April 2025. The code is kept for posterity, but teacher responses were ultimately loaded from CSV files delivered to us in May.**

Teacher survey responses look like they were collected through an online form. Delivered raw as wide-format SPSS files.

School and class data has been added to each dataframe from source filename as a workaround because I was not able to determine which columns contained this data at first. 

Based on reading the codebook and inspecting the answers in the raw files, I've determined the following:

* Q30 = school
* Q31 = class
* Q32 = ?
* Q27 = T_gender
* Q28 = T_age
* Q29 = T_dutch
* Q30.0 = T_exp1
* Q31.0 = T_exp2
* Q32.0 = T_time_teaching
* Q33 = T_class_comp

In [483]:
# # Determine path to raw SPSS files
# folder = DATA_DIR / '01_survey' / 'teacher_raw_2023'
# 
# # Initiate empty dict to store teacher questionnaire dfs
# tq_all = {}
# 
# print('Loading teacher responses...')
# for file in folder.glob('*.sav'):
#     var_name = file.stem
#     df = pd.read_spss(file)
#     
#     # Add school/class as columns to each df from filename
#     school_num, class_num = var_name.split('_')[1:]
#     df['class'] = class_num
#     
#     # Change school 1 to 44
#     if school_num != '1':
#         df['school'] = school_num
#     else:
#         df['school'] = '44'
#         
#     # Store df in dict with filename as key
#     tq_all[var_name] = df
#     print(f'{var_name}')
#     
# print(f'Total dataframes: {len(tq_all)}')
# 
# # Uncomment to inspect example of available columns
# # print(tq_all['tq_1_104'].columns.tolist())
# 
# # Initiate empty dict to store relevant tq only
# tq_relevant = {}
# 
# print('Finding teacher responses matching available tracking data...')
# for df in tq_all:
#     school_num = str(tq_all[df]['school'].iloc[0])
#     class_num = str(tq_all[df]['class'].iloc[0])
#     
#     tq_match = tracking_summary[
#         (tracking_summary['school'] == school_num) &
#         (tracking_summary['class'] == class_num)
#     ]
#     
#     if not tq_match.empty:
#         tq_relevant[df] = tq_all[df]
#         print(f"{df}")
# 
# print(f"Total matching: {len(tq_relevant)}")
# 
# # Uncomment to inspect dataset
# # tq_relevant['tq_1_104']

#### IOP scores

IOP response `Q68` is given per student. Variable name format is `Q68_N`, where N should match an entry in `keyfiles['subject ID']`. By matching the subject ID, `Q68` can then be matched to the 4-digit `ID` in the file containing student survey responses (once these have been fixed). For an initial analysis, the matching to the `subject ID` and thus to tracking tag numbers should be enough. 

Confirmed with Nathalie that IOP responses were optional. If IOP response was given, variable `Q68` is followed by Q70 and Q71 with matching student number. **We are only interested in Q68.**

* Q68 question: "In vergelijking met andere leerlingen bezoek ik [naam kind]"
* Q68 responses: Minder vaak, Gemiddeld, Vaker

**14-05-2025:** ⚠️ After multiple attempts, school 45-105 could not be matched for IOP scores since the student IDs cannot be properly matched. 

In [485]:
# ### Eliminate irrelevant columns in tq dataframes
# 
# # Lists of relevant questions
# descriptives = ['Q27', 'Q28', 'Q29', 'Q30', 'Q31', 'Q32', 'Q33']
# iop_id = ['Q68']  # Add 'Q70', 'Q71' for detailed IOP responses
# 
# tq_filtered = {}
# 
# for key, df in tq_relevant.items():
#     # Create a mask for columns to keep
#     cols_to_keep = []
#     
#     for col in df.columns:
#         # Check if column matches any descriptive column
#         if any(q_id in col for q_id in descriptives):
#             cols_to_keep.append(col)
#         # Check if column contains any of the specified question IDs
#         elif any(q_id in col for q_id in iop_id):
#             cols_to_keep.append(col)
#     
#     # Create a new dataframe with only the columns to keep
#     tq_filtered[key] = df[cols_to_keep]
# 
# 
# ### Load IOP Q68 values into keyfiles df
# 
# # Create empty column 'iop' in keyfiles
# if 'iop' not in keyfiles.columns:
#     keyfiles['iop'] = None
# 
# # Iterate through each df in the dict
# for df_name, df in tq_filtered.items():
#     # Copy df to avoid pandas errors...
#     df_copy = df.copy()
#     
#     # Extract valid school and class values
#     school_num = df_copy['Q30'].unique().astype(str)
#     class_num = df_copy['Q31'].unique().astype(str)
#     
#     for s, c in zip(school_num, class_num):
#         if s in tracking_summary['school'].astype(str).unique():
#             school_num = s
#             class_num = c
#             break
#     else:
#         school_num = np.nan
#         class_num = np.nan
#     
#     # Identify Q68_N columns
#     q68_cols = [col for col in df_copy.columns if col.startswith('Q68_')]
#     
#     # Iterate through df rows
#     for idx, row in df_copy.iterrows():
#         # school = row['Q30']
#         # class_val = row['Q31']
#         # 
#         # Check each Q68_N column for matching subjects
#         for q68_col in q68_cols:
#             # Extract N from Q68_N column name
#             subject_id = q68_col.split('_')[1]
#             
#             # Get value from this Q68_N cell
#             q68_value = row[q68_col]
#             
#             # Only proceed if the cell has a valid value
#             if pd.notna(q68_value) and str(q68_value) != "0" and str(q68_value) != "":
#                 # Find matching rows in keyfiles where all three conditions are met
#                 matching_rows = keyfiles[(keyfiles['school'] == school_num) & 
#                                         (keyfiles['class'] == class_num) & 
#                                         (keyfiles['sID_survey'] == subject_id)]
#                 
#                 # If matches found, update the 'iop' column with the actual value from Q68_N
#                 if not matching_rows.empty:
#                     keyfiles.loc[matching_rows.index, 'iop'] = q68_value

#### STRS scores

**14-05-2025:** I cannot find the STRS scores anywhere in the data. Requested help from Yvonne and Nathalie. 

In [486]:
# temp = tq_relevant['tq_45_105']
# # strs_cols = [col for col in temp.columns if col.str.contains('strs')]
# # print(temp.columns.tolist())

### Final file load (CSV)

**16-05-2025: Update from Nathalie**
>Hannah just let me know she added some more processed versions of the 2023 to the folder. In these Excel files the variables are renamed into STRS8_c2165 for example, which means item 8 from the STRS for child 2165 (so here the IDs are used instead of the subject IDs). These are preliminary files Hannah said, but they do answer your question. You can find them in the folder postpilot_teacher_questionnaire_renamed_preliminary

**Attempting to reload teacher responses from these CSV files.**

In [487]:
# Loading this one locally
folder = Path('/Users/majaculjak/PycharmProjects/dsp/data/01_raw/01_survey/teacher_processed_2023')

# Initiate empty dict to store teacher questionnaire dfs
tq = {}

print('Loading teacher responses (CSV)...')
for file in folder.glob('*.csv'):
    var_name = file.stem
    df = pd.read_csv(file, sep=',')
    tq[var_name] = df
    print(f'{var_name}')
    
print(f'Total dataframes: {len(tq)}')


# Rename school and class related variables to match the rest
for df_name, df in tq.items():
    if 'SchoolID' in df.columns:
        df = df.rename(columns={'SchoolID': 'school'})
    if 'classID' in df.columns:
        df = df.rename(columns={'classID': 'class'})
    tq[df_name] = df  # Save change to dict
    
    
# Rename school 1 to 44 in file 'school 1 class 104' to match data
tq['school 1 class 104']['school'] = '44'


### Eliminate files not in tracking data

# Initiate empty dict to store relevant tq only
tq2= {}

print('\nFinding teacher responses matching available tracking data...')
for df in tq:
    school_num = str(tq[df]['school'].iloc[0])
    class_num = str(tq[df]['class'].iloc[0])
    
    tq_match = tracking_summary[(tracking_summary['school'] == school_num) & (tracking_summary['class'] == class_num)]
    
    if not tq_match.empty:
        tq2[df] = tq[df]
        print(f"{df}")

print(f"Total matching: {len(tq2)}")


### Remove second data row from school 42
# It was the only file with two rows; second row is empty and messing up my merge
for df_name, df in tq2.items():
    if len(df) > 1:
        tq2[df_name] = df.iloc[[0]]  # Keep only the first row


# Create empty IOP col to receive values
keyfiles['IOP'] = 9999


### Create STRS cols in keyfiles

for df_name, df in tq2.items():
    strs_cols = [col for col in df_copy.columns if re.match(r'^STRS\d+_c\d+', col)]

strs_cols_unique = []

for col in strs_cols:
    col = col.split('_c')[0]
    strs_cols_unique.append(col)

strs_cols_unique = sorted(
    set(strs_cols_unique),
    key=lambda x: int(x.replace('STRS', ''))
)

keyfiles[strs_cols_unique] = 9999

# Set up ID_survey as int; missing values become 9999
keyfiles['ID_survey'] = keyfiles['ID_survey'].fillna(9999).astype(int)


### Load IOP values into keyfiles

# Iterate through each df in the dict
for df_name, df in tq2.items():
    # Copy df to avoid pandas errors...
    df_copy = df.copy()
    
    # Extract valid school and class values
    school_num = df_copy['school'].unique().astype(str)
    class_num = df_copy['class'].unique().astype(str)
    
    for s, c in zip(school_num, class_num):
        if s in tracking_summary['school'].astype(str).unique():
            school_num = s
            class_num = c
            break
    else:
        school_num = np.nan
        class_num = np.nan
    
    # Identify IOP_cNNNN cols
    iop_cols = [col for col in df_copy.columns if col.startswith('IOP_c')]
    strs_cols = [col for col in df_copy.columns if re.match(r'^STRS\d+_c\d+', col)]
    
    for idx, row in df_copy.iterrows():
        for col in iop_cols:
            student_id = int(col.split('_c')[1])
            
            # Get value from this cell
            iop_value = row[col]
            
            # Only proceed if the cell has a valid value
            if pd.notna(iop_value) and str(iop_value) != "0" and str(iop_value) != "":
                # Find matching rows in keyfiles where all three conditions are met
                matching_rows = keyfiles[(keyfiles['school'] == school_num) & 
                                        (keyfiles['class'] == class_num) & 
                                        (keyfiles['ID_survey'] == student_id)]
                
                # If matches found, update the 'iop' column with the actual value from Q68_N
                if not matching_rows.empty:
                    keyfiles.loc[matching_rows.index, 'IOP'] = iop_value
        
        for col in strs_cols:
            col_name = col.split('_c')[0]
            student_id = int(col.split('_c')[1])
            
            strs_value = row[col]
            
            matching_rows = keyfiles[(keyfiles['school'] == school_num) & 
                                     (keyfiles['class'] == class_num) & 
                                     (keyfiles['ID_survey'] == student_id)]
            
            if not matching_rows.empty:
                keyfiles.loc[matching_rows.index, col_name] = strs_value

Loading teacher responses (CSV)...
school 1 class 104
school 45 class 105
school 45 class 106
school 42 class 102
school 46 class 107
school 47 class 108
school 41 class 100
school 41 class 101
school 43 class 103
school 49 class 111
school 49 class 110
school 49 class 112
school 49 class 113
Total dataframes: 13

Finding teacher responses matching available tracking data...
school 1 class 104
school 45 class 105
school 45 class 106
school 42 class 102
school 46 class 107
school 47 class 108
school 43 class 103
Total matching: 7


#### Compute STRS score aggregates

In [488]:
strs_close = [1, 3, 8, 9, 10]
strs_conflict = [2, 4, 5, 6, 7]

close_cols = []
conflict_cols = []

for col in strs_cols_unique:
    num = int(col.split('STRS')[1])
    if num in strs_close:
        close_cols.append(col)
    if num in strs_conflict:
        conflict_cols.append(col)

# Compute STRS scores
strs = keyfiles.copy()

strs[strs_cols_unique] = strs[strs_cols_unique].replace(9999, np.nan)
strs = strs.dropna(subset=strs_cols_unique, how='any')
strs[strs_cols_unique] = strs[strs_cols_unique].astype(float).astype(int)

strs['STRS_close'] = strs[close_cols].sum(axis=1)
strs['STRS_conflict'] = strs[conflict_cols].sum(axis=1)

# Reverse code conflict_cols to compute SPARTS_total
for col in conflict_cols:
    reversed_col = col + '_R'
    # Reverse coding: 1→5, 2→4, 3→3, 4→2, 5→1
    strs[reversed_col] = 6 - strs[col]

# Create list of reversed conflict column names
conflict_reversed = [col + '_R' for col in conflict_cols]

# Calculate total score (sum of closeness items and reversed conflict items)
strs['STRS_total'] = strs[close_cols + conflict_reversed].sum(axis=1)

# Update keyfiles with STRS aggregates
keyfiles.loc[strs.index, ['STRS_close', 'STRS_conflict', 'STRS_total']] = strs[['STRS_close', 'STRS_conflict', 'STRS_total']]

**18-05-2025:** Check which unique ID_survey values are present in the STRS reponses of the teachers. Done because the STRS responses for school 42 weren't merging properly with the keyfile. Problem turned out to be an unexpected second row of data in that teacher's response.

In [489]:
rows = []

for df_name, df in tq2.items():
    strs_cols = [col for col in df.columns if re.match(r'^STRS\d+_c\d+', col)]
    
    school_id = df['school'].iloc[0]
    class_id = df['class'].iloc[0]
    
    for col in strs_cols:
        ID_survey = col.split('_c')[1]
        rows.append({'school': school_id, 'class': class_id, 'ID_survey': ID_survey})

IDs = pd.DataFrame(rows)
IDs = IDs.drop_duplicates()

# Display
# IDs

## Student responses

### Aligning with keyfiles

**01-05-2025:** According to Nathalie, the order of the subject IDs ('subjectID') in the keyfiles are correct and can be matched to the student responses in the column 'sID'. The 4-digit person ID numbers ('id') were noted down incorrectly in the keyfiles. They can be copied from the student responses, according to 'sID'.

**08-05-2025:** After several hours spent trying to match student responses to keyfiles, it wasn't working. The data was matched **manually** with Maja copying student survey IDs into the separate keyfiles. 

In [43]:
# Load raw file
filename = 'TotalData_T1_all_cbs_ethnicity_gender'
path = DATA_DIR / '01_survey' / f"{filename}.xlsx"

students_raw = pd.read_excel(path)

students = students_raw.copy()

# Rename columns to match keyfiles
students = students.rename(columns={'School_ID': 'school', 'Class_ID': 'class', 'sID': 'subject_id', 'ID': 'person_id'}).astype(str)

students['person_id'] = students['person_id'].fillna(9999).astype(int) 

# Change school '1' to '44' to match other datasets
students.loc[students['school']=='1', 'school'] = '44'

# Uncomment to inspect beginning of students df
display(students.head())

### Inspect number of absent students
# Tried to see if it matches the amount of student data we have, but the variables turned out not to be filled in most of the time -> useless
# 
# temp = students
# temp['absent'] = temp['absent'].astype(int)
# temp.groupby(['school','class','nPupils','nAbsent'])['absent'].sum()

Unnamed: 0,time,school,class,cohort,tracking,condition_seating,condition_game,nPupils,nAbsent,dataPresent,...,SB13_sit2,SB14_sit2,SB15_sit2,SB16_sit2,SB17_sit2,SB18_sit2,SB19_sit2,SB20_sit2,SB21_sit2,SB22_sit2
0,1,31,79,2122,1,1,0,19,8.0,1,...,,,,,,,,,,
1,1,31,79,2122,1,1,0,19,8.0,1,...,,,,,,,,,,
2,1,31,79,2122,1,1,0,19,8.0,1,...,,,,,,,,,,
3,1,31,79,2122,1,1,0,19,8.0,1,...,,,,,,,,,,
4,1,31,79,2122,1,1,0,19,8.0,1,...,,,,,,,,,,


### Cleaning

In [44]:
# Count of all students per class 
s_count = pd.pivot_table(
    students,
    index=['school', 'class','nPupils'],
    values=['person_id'],
    aggfunc=pd.Series.nunique,
    margins = True,
    margins_name='Total'
)

print(f"Total students in raw dataset: {students['person_id'].nunique()}")
display(s_count)

Total students in raw dataset: 310


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,person_id
school,class,nPupils,Unnamed: 3_level_1
31,79.0,19.0,19
31,80.0,22.0,22
32,73.0,24.0,24
32,74.0,23.0,23
32,75.0,26.0,26
41,100.0,25.0,25
41,101.0,25.0,25
42,102.0,15.0,15
43,103.0,33.0,31
44,104.0,17.0,17


In [45]:
students = students.loc[
    (students['school']).isin(tracking_summary['school'].unique()) &
    (students['class']).isin(tracking_summary['class'].unique())
]

# Count of all students per class 
s_count = pd.pivot_table(
    students,
    index=['school', 'class'],
    values=['person_id'],
    aggfunc=pd.Series.nunique,
    margins = True,
    margins_name='Total'
)

print(f"Total students in dataset after removing schools/classes not available in tracking data: {students['person_id'].nunique()}")
display(s_count)

Total students in dataset after removing schools/classes not available in tracking data: 146


Unnamed: 0_level_0,Unnamed: 1_level_0,person_id
school,class,Unnamed: 2_level_1
42,102.0,15
43,103.0,31
44,104.0,17
45,105.0,22
45,106.0,21
46,107.0,21
47,108.0,19
Total,,146


Students from 45/105 are all marked as `tracking = 0`, but all except one student were tracked. Variable is unreliable and ultimately not necessary - **ignore it.**

In [46]:
s_tracking = pd.pivot_table(
    students,
    index=['school', 'class','nPupils'],
    columns=['tracking'],
    values='person_id',
    aggfunc=pd.Series.nunique,
    margins = True,
    margins_name='Total'
).fillna(0)

print(f"Total students tracked according to student master file: {students.loc[students['tracking']=='1', 'tracking'].count()}\n")
display(s_tracking)

Total students tracked according to student master file: 124



Unnamed: 0_level_0,Unnamed: 1_level_0,tracking,0,1,Total
school,class,nPupils,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,102.0,15.0,0.0,15.0,15
43,103.0,33.0,0.0,31.0,31
44,104.0,17.0,0.0,17.0,17
45,105.0,22.0,22.0,0.0,22
45,106.0,21.0,0.0,21.0,21
46,107.0,21.0,0.0,21.0,21
47,108.0,19.0,0.0,19.0,19
Total,,,22.0,124.0,146


In [None]:
# Count the kids in keyfiles ID_survey equals ID in students
# Where tagnumber != 1111 or 35

keyfile_filtered = keyfiles.copy()
# keyfile_filtered.loc[keyfile_filtered['school']=='44', 'school'] = '1'
keyfile_filtered = keyfile_filtered.loc[
    (keyfile_filtered['school'].isin(tracking_summary['school'].unique())) &
    (keyfile_filtered['class'].isin(tracking_summary['class'].unique())) &
    (keyfile_filtered['ID_survey'].isin(students['person_id'].unique())) &
    (keyfile_filtered['tag_id'] != '9999') &
    (keyfile_filtered['subject_id'] != '9999')
    ]

s_trackers = pd.pivot_table(
    keyfile_filtered,
    index=['school', 'class'],
    values=['ID_survey', 'tag_id', 'tracklab_id'],
    aggfunc = pd.Series.nunique,
)

# s_trackers.loc['Total'] = s_trackers.sum()

# temp.loc[temp[['school','class','tracklab_id']].duplicated(keep=False)] -> NO TRACKLAB IDS ARE DUPLICATED PER CLASS

# Limit student survey df to entries that have matching tracking tags
# students = students.loc[students['person_id'].astype(str).isin(keyfile_filtered['ID_survey'].astype(str))]
# students['person_id'].nunique()

# Display data
display(s_trackers)

### SPARTS scores

Source:  https://doi.org/10.1111/bjep.12094

Relevant variables named 'SPARTSN' (e.g. 'SPARTS1') in the codebook, but this name is not present in the data. Instead, variables named **'st_relN'** have been identified as SPARTS scores. As explained in the codebook, the questionnaire contained 13 items, but Q13 was not presented to all students. After filtering the dataset for relevant data only (i.e., responses of students whose tracking data we have available), only responses 1-12 were available anyway.  

Q12 is not part of the original scale, but developed for this study.

I cannot find a score sheet for this test that is not behind a paywall. The COTAN entry for the SPARTS lists a 25-item test instead of the 13-item test used. 

**08-05-2025:** Informed by Nathalie that no scoring sheet exists. Will separate SPARTS scores and compute raw scores. 

In [495]:
# Create the new dataframe with person_id and columns containing 'st_rel'
sparts_cols = [col for col in students.columns if 'st_rel' in col]
sparts_cols = sorted(set(sparts_cols), key = lambda x: int(x.replace('st_rel', '')))
sparts = students[['person_id'] + sparts_cols].copy()
print(f"Total students with SPARTS entries: {sparts['person_id'].nunique()}\n")

Total students with SPARTS entries: 146



In [496]:
# Drop rows containing any NaNs in SPARTS scores
sparts[sparts_cols] = sparts[sparts_cols].replace('nan', np.nan)
sparts = sparts.dropna(subset=sparts_cols, how='any')
sparts[sparts_cols] = sparts[sparts_cols].astype(float).astype(int)
print(f"Total students after removing entries with incomplete SPARTS responses: {sparts['person_id'].nunique()}\n")

Total students after removing entries with incomplete SPARTS responses: 130



In [None]:
# Rename st_rel columns to sparts
sparts = sparts.rename(columns={col: col.replace('st_rel', 'SPARTS') for col in sparts.columns if 'st_rel' in col})

# Determine closeness and conflict vars
sparts_close = [1, 2, 3, 4, 5, 13]
sparts_conflict = [6, 7, 8, 9, 10, 11]

close_cols = []
conflict_cols = []

sparts_cols = [col.replace('st_rel', 'SPARTS') for col in sparts_cols]

for col in sparts_cols:
    num = int(col.split('SPARTS')[1])
    if num in strs_close:
        close_cols.append(col)
    if num in strs_conflict:
        conflict_cols.append(col)

sparts['SPARTS_close'] = sparts[close_cols].sum(axis=1)
sparts['SPARTS_conflict'] = sparts[conflict_cols].sum(axis=1)

# Reverse code conflict_cols to compute SPARTS_total
for col in conflict_cols:
    reversed_col = col + '_R'
    # Reverse coding: 1→5, 2→4, 3→3, 4→2, 5→1
    sparts[reversed_col] = 6 - sparts[col]

# Create list of reversed conflict column names
conflict_reversed = [col + '_R' for col in conflict_cols]

# Calculate total score (sum of closeness items and reversed conflict items)
sparts['SPARTS_total'] = sparts[close_cols + conflict_reversed].sum(axis=1)

# Attach SPARTS scores to keyfiles
keyfiles = keyfiles.merge(
    sparts[sparts_cols + ['person_id', 'SPARTS_close', 'SPARTS_conflict', 'SPARTS_total']],
    left_on='ID_survey',
    right_on='person_id',
    how='left',
    indicator=True
).drop('person_id_y', axis=1).rename(columns={'person_id_x': 'person_id'})


# Change NaNs in raw SPARTS cols to 9999 to keep formatting
keyfiles[sparts_cols] = keyfiles[sparts_cols].fillna(9999).astype(int)

# Remove _merge col to not confuse anyone
keyfiles = keyfiles.drop(columns='_merge')

In [505]:
keyfiles

Unnamed: 0,school,class,subject_id,person_id,consent,comment,sID_survey,ID_survey,source,date,...,SPARTS7,SPARTS8,SPARTS9,SPARTS10,SPARTS11,SPARTS12,SPARTS13,SPARTS_close,SPARTS_conflict,SPARTS_total
0,45,105,1,,1,,20,2221,keyfile school 45 class 105,2023-06-08,...,2,2,2,2,2,3,4,12.0,15.0,27.0
1,45,105,2,,1,,21,2222,keyfile school 45 class 105,2023-06-08,...,2,3,1,2,1,4,4,13.0,16.0,27.0
2,45,105,3,,1,,22,2223,keyfile school 45 class 105,2023-06-08,...,2,3,1,1,1,5,4,13.0,18.0,25.0
3,45,105,4,,1,,23,2224,keyfile school 45 class 105,2023-06-08,...,1,3,2,3,2,3,3,15.0,15.0,30.0
4,45,105,5,,1,,24,2225,keyfile school 45 class 105,2023-06-08,...,1,1,1,1,1,5,5,13.0,17.0,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,47,108,18,2240,1,,18,2302,keyfile school 47 class 108,2023-05-24,...,5,4,1,5,3,4,5,18.0,18.0,30.0
153,47,108,19,2241,1,,19,2303,keyfile school 47 class 108,2023-05-24,...,1,2,1,1,1,5,5,13.0,17.0,26.0
154,47,108,,,9999,,,9999,keyfile school 47 class 108,2023-05-24,...,9999,9999,9999,9999,9999,9999,9999,,,
155,47,108,,,9999,,,9999,keyfile school 47 class 108,2023-05-24,...,9999,9999,9999,9999,9999,9999,9999,,,


# Final dataset

In [507]:
master = keyfiles.copy()

## Descriptives

In [None]:
### SCHOOLS & CLASSES
master.groupby('school')['class'].nunique()

### STUDENT AGE
master['age'].describe()

**Gender:** I tried to match student gender information to the keyfile, but it turns out that **gender information for 2023 entries is not available**. 

In [387]:
# Attach 'gender' column from student survey responses to keyfile
keyfile_filtered = keyfile_filtered.merge(
    students[['person_id', 'Gender', 'age']],
    left_on='ID_survey',
    right_on='person_id',
    how='left'
)

keyfile_filtered['age'] = keyfile_filtered['age'].astype(float).astype(int)

print(f"Unique entries in column 'gender': {keyfile_filtered['Gender'].unique()}")

Unique entries in column 'gender': ['nan']


## Cleaning consent (again)

Student `ID_survey = 2283` had only oral parental consent, so I originally marked them as 9 (no consent). However, their tracking data and SPARTS scores are present, so I will change the entry to 1 (consent) in case we decide to use this data. This student is missing corresponding IOP scores, so only a one-way student-teacher relationship can be examined. 

In [388]:
print('Entries with comments - printed for manual inspection:')
display(keyfile_filtered.loc[~keyfile_filtered['comment'].isna()])

print('Entries with consent != 1 - printed for manual inspection:')
display(keyfile_filtered.loc[keyfile_filtered['consent']!='1'])

keyfile_filtered.loc[keyfile_filtered['ID_survey']=='2283', 'consent'] = '1'

Entries with comments - printed for manual inspection:


Unnamed: 0,school,class,subject_id,person_id_x,consent,comment,sID_survey,ID_survey,source,date,...,SPARTS7_R,SPARTS8_R,SPARTS9_R,SPARTS10_R,SPARTS11_R,SPARTS_total,_merge,person_id_y,Gender,age
20,43,103,10,2181.0,1,Vult vragenlijst niet in,10,2183,keyfile school 43 class 103,2023-05-11,...,5,5,5,5,5,58,both,2183,,12
48,45,105,10,,1,it could be that not tag 12 but 30 was used fo...,29,2230,keyfile school 45 class 105,2023-06-08,...,3,1,5,3,4,47,both,2230,,11
86,46,107,20,2222.0,9999,"since student had no written consent, their na...",20,2283,keyfile school 46 class 107,2023-05-23,...,5,2,5,5,5,52,both,2283,,12


Entries with consent != 1 - printed for manual inspection:


Unnamed: 0,school,class,subject_id,person_id_x,consent,comment,sID_survey,ID_survey,source,date,...,SPARTS7_R,SPARTS8_R,SPARTS9_R,SPARTS10_R,SPARTS11_R,SPARTS_total,_merge,person_id_y,Gender,age
86,46,107,20,2222,9999,"since student had no written consent, their na...",20,2283,keyfile school 46 class 107,2023-05-23,...,5,2,5,5,5,52,both,2283,,12


## Export

Master file is exported to **workgroup folder** *> dsp > data > 02_interim*

In [508]:
filename = 'dataset-total'
today = pd.to_datetime('today').strftime('%Y-%m-%d_%H-%M')
savepath = DATA_DIR / '02_interim' / f"{filename}_{today}.xlsx"
master.to_excel(savepath, index=False, engine='openpyxl')