In [1]:
import glob
import os
import re
import datetime
import pyreadstat
import pandas as pd


# Methods

*I added an RQ and some hypotheses because we need a specific goal instead of just digging around in data. I didn't think about these much so please add to them and refine them.--MC, 11-04-2025*  

**RQ:** How do explicit (IOP) and implicit (XYZ coordinates) measurement of teacher movement relate to student-teacher relationships (STRS, SPARTS)?

**Hypothesis 1:** Higher IOP positively correlates with higher STRS and SPARTS.

**Hypothesis 2:** Higher IOP positively correlates with closer proximity. 
* *But how are we going to define proximity? Maybe determine a range for how many times a day teacher approached student (also need to determine how long they need to stay next to the student to count as approaching), and how much time total the teacher spent close to student?--MC, 11-04-2025*

1. **Import data**
    1. Import tracking data
        * Timestamps, XYZ coordinates, TagId
    2. Import survey data
        * Teachers: IOP, STRS
        * Students: SPARS
    3. Identify matching data
        * School/classroom/TagID -> teacher <-> student
2. **Pre-process data**
    1. Tracking data
        * Remove outliers
        * Fill in missing timestamps: tracker goes to sleep mode, see VU paper for method
        * Determine tracking start and end time
        * Determine baseline distances (e.g. personal space, see VU paper again)
    2. Survey data
        * Calculate scores: check codebook for sources on correct computation
        * Check for weird data (they're reliable scales so I guess it should be fine--MC, 11-04-2025) 
3. **Process data**
    * Calculate Euclidean distances teacher-student
    * Calculate how many times teacher approached student and vice versa
        * *How exactly are we going to determine this?--MC, 11-04-2025*
4. **Analysis** 
    * Regression analysis tracking data vs survey scores
    * ???

# Import data
## Option 1: Load tracking data from single folder

In [None]:
# Load all movement CSV files into dataframes separated per day
folder = '/Volumes/WRKGRP/STD-FSW-BSI-SD-Movement_Tracking/Data export 7 klassen april-juni 23/11 april'

file_path = glob.glob(folder + '/*.csv')

dataframes = [pd.read_csv(file, sep=';') for file in file_path]

## Option 2: Load tracking data from all subfolders

In [11]:
# Path to folder containing all 2023 notebooks
folder = '/Volumes/WRKGRP/STD-FSW-BSI-SD-Movement_Tracking/Data export 7 klassen april-juni 23'

In [12]:
# Create list of subfolders
subfolders = [f.name for f in os.scandir(folder) if f.is_dir()]

# Verify detected subfolders
print(f"Found subfolders: {subfolders}")  

Found subfolders: ['31 mei', '8 juni', '9 juni', '11 april', '11 mei', '23 mei', '24 mei']


In [13]:
# Load CSV files in separate dataframes per subfolder
for subfolder in subfolders:
    # Create variable name (replacing spaces with underscores)
    var_name = f"dfs_{subfolder.replace(' ', '_').lower()}"
    
    # Load CSV files from subfolder
    csv_files = glob.glob(os.path.join(folder, subfolder, '*.csv'))
    
    if not csv_files:
        print(f"No CSV files found in {subfolder}")
        continue
    
    dfs = [pd.read_csv(file, sep=';') for file in csv_files]
    
    # Store in global namespace
    globals()[var_name] = dfs
    print(f"Loaded {len(dfs)} DataFrames into {var_name}")

# Now you can access them like:
# dfs_subfolder1, dfs_subfolder2, etc.

Loaded 35 DataFrames into dfs_31_mei
Loaded 43 DataFrames into dfs_8_juni
Loaded 16 DataFrames into dfs_9_juni
Loaded 16 DataFrames into dfs_11_april
Loaded 30 DataFrames into dfs_11_mei
Loaded 40 DataFrames into dfs_23_mei
Loaded 15 DataFrames into dfs_24_mei


In [24]:
# Inspecting files from April 11

dfs_11_april = pd.concat(dfs_11_april)
dfs_11_april['TimeStamp'] = pd.to_datetime(dfs_11_april['TimeStamp'])

In [25]:
# dfs_11_april

Unnamed: 0,TimeStamp,X,Y,Z,TagId
0,2023-04-11 09:17:34.580,1.107881,0.706208,0.743003,0x24025F44CDE1
1,2023-04-11 09:17:34.751,1.107881,0.706208,0.515614,0x24025F44CDE1
2,2023-04-11 09:17:34.924,1.107881,0.706208,0.400963,0x24025F44CDE1
3,2023-04-11 09:17:35.093,1.107881,0.706208,0.353199,0x24025F44CDE1
4,2023-04-11 09:17:35.222,1.107881,0.706208,0.537513,0x24025F44CDE1
...,...,...,...,...,...
23843,2023-04-11 14:16:50.026,4.294939,4.364043,0.408307,0x24025F44F8D7
23844,2023-04-11 14:16:50.205,4.294939,4.364043,0.210187,0x24025F44F8D7
23845,2023-04-11 14:16:50.375,4.294939,4.364043,0.223637,0x24025F44F8D7
23846,2023-04-11 14:16:50.562,4.294939,4.364043,0.469377,0x24025F44F8D7


## Establish available data

Difficult to establish which data we do have and which data we don't because (1) no proper readme file was given and (2) naming conventions are inconsistent. I compared the folder and file names for the tracking data and established we seem to have the following available.

**Warning: the days for 2022 dates are incorrect.**

In [127]:
# Available data format = (YYYY-MM-DD, (school, class))

available_data= [
    ('2022-06-01', (31, 79)),
    ('2022-06-01', (31, 80)),
    ('2022-06-01', (31, 73)),
    ('2022-06-01', (31, 74)),
    ('2022-06-01', (31, 75)),
    ('2023-06-08', (45, 105)),
    ('2023-06-09', (45, 106)),
    ('2023-04-11', (42, 102)),
    ('2023-05-11', (43, 103)),
    ('2023-05-23', (46, 107)),
    ('2023-05-24', (47, 108)),
    ('2023-05-31', (1, 104))
]

Present in all student data file but no tracking data: (32, 73), (32, 74), (32, 75), (41, 100), (41, 101)

## Load survey answers
### Student data

In [41]:
# Load Excel containing all student data
students_raw = pd.read_excel('/Volumes/WRKGRP/STD-FSW-BSI-SD-Movement_Tracking/TotalData_T1_all_cbs_ethnicity_gender.xlsx')

Because we only need student data pertaining to their ID values and SPARTS scores (vars st_relX), we create a separate dataframe containing only these values.

In [42]:
# Create list of SPARTS score columns
st_rel = [col for col in students_raw.columns if col.startswith('st_rel')]

# Batch-limit student data up to and incl. column 'gender_sr' and add SPARTS scores
students = students_raw.loc[:,:'gender_sr']
students[st_rel] = students_raw[st_rel]

In [97]:
# Create keyfile with student IDs
keyfile_sid = dict(zip(students['ID'].astype(str), students['sID'].astype(str)))

### Teacher data

We received wide-format SPSS (.sav) files containing data for teachers. Of all survey data, only IOP and STRS scores are applicable to our research question. STRS variables seem easy to find. IOP variables are present in raw format and **not** named according to the codebook. -- MC, 11-04-2025

In [3]:
# Load SPSS files
folder = '/Volumes/WRKGRP/STD-FSW-BSI-SD-Movement_Tracking/Teacher Questionnaire Processing/Raw 2021-2022'

file_path = glob.glob(folder + '/*.sav')  # SPSS files

# This loads all the tq files in one mega-dataframe
teachers_raw = [pd.read_spss(file) for file in file_path]

# Check number of files loaded -> N = 75
# len(teachers_raw)

In [4]:
# Check df in long format
temp = teachers_raw[0].melt()
temp

IndexError: list index out of range

In [159]:
# Load CSV files
folder = '/Volumes/WRKGRP/STD-FSW-BSI-SD-Movement_Tracking/Teacher Questionnaire Processing/Raw_t1'

file_path = glob.glob(folder + '/*.csv')

dataframes = [pd.read_csv(file, sep=',') for file in file_path]

In [162]:
# Display a dataframe to check
# dataframes[6]

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration__in_seconds_,Finished,RecordedDate,ResponseId,DistributionChannel,UserLanguage,...,T_MSSP28,T_COV1,T_COV2,T_COV3,T_COV3_remarks,T_COV4,T_COV4_remarks,T_general_remarks,Num,Time
0,2022-05-24 05:11:32,2022-05-24 05:32:44,0,100,1271,1,2022-05-24 05:32:44,R_22zxZ1hKBSNiYMM,anonymous,NL,...,4,Deze regels zijn er niet meer,Ook deze regels zijn er niet meer,5,,5,,Ik ben erg benieuwd naar de uitslag van zowel ...,1,2


In [None]:
# It takes a long time to batch import all the .sav files into dataframes, so first check which files we need
#todo second though, rewrite this if needed because this also takes a long time

files_to_import = []

for file in file_path:
    try:
        dataframe = pd.read_spss(file)
        if any(re.match(r'^x\d+_Q70_\d+$', col) for col in dataframe.columns):  # Contains IOP columns
            files_to_import.append(file)
            print(f"Found file: {file}")
    except Exception as e:
        print(f"Error reading {file}: {e}")


In [None]:
# Code in block above inspired by this
# files_to_import = []
# 
# for file in file_path:
#     try:
#         dataframe = pd.read_spss(file)
#         if any(re.match(r'^x\d+_Q70_\d+$', col) for col in dataframe.columns):  # Contains IOP columns
#             files_to_import.append(file)
#             print(f"Found file: {file}")
#     except Exception as e:
#         print(f"Error reading {file}: {e}")

Below is the first attempt to limit teacher response dataframes and it failed gloriously.--MC, 11-04-2025

In [79]:
# Manually determined teacher ID variables
id_cols_list = [
    'ResponseId', 
    'SchoolID', 
    'classID', 
    'teacherID', 
    'gender', 
    'age', 
    'teaching_week', 
    'same_group', 
    'changes_group'
]

# Separate and format IOP and STRS columns
iop_cols = [col for col in temp.columns if col.lower().startswith('iop')]  #TODO remove this bc it's probably not named like this
strs_cols = [col for col in temp.columns if col.lower().startswith('strs')]
# id_cols = [col for col in temp.columns if col.isin(id_cols_list)]
id_cols = [col for col in temp.columns if col.lower() in [x.lower() for x in id_cols_list]]

all_cols = id_cols + strs_cols + iop_cols

In [86]:
temp = temp[all_cols].melt()  # Wide to long format
temp

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration__in_seconds_,Finished,RecordedDate,ResponseId,DistributionChannel,UserLanguage,...,Q65,Q66,Q67,Q46,Q47,Q53,Q54,Q55,Q56,Num
0,2021-11-17 01:11:41,2021-11-17 09:02:51,IP Address,44.0,28269.0,False,2021-11-24 09:02:51,R_2rMjIqzzgeNrKGp,anonymous,NL,...,,,,,,,,,,1
1,2021-11-24 09:13:09,2021-11-24 09:14:33,IP Address,29.0,84.0,False,2021-12-01 09:14:36,R_R8gwVUAlrPmpkM9,anonymous,NL,...,,,,,,,,,,1
2,2021-12-09 08:36:14,2021-12-10 09:24:58,IP Address,100.0,89324.0,True,2021-12-10 09:24:59,R_3NPUyPlcAxijh3I,anonymous,NL,...,We zetten de kinderen met storend gedrag t.o.v...,Kinderen die extra aandacht nodig hebben vanwe...,"Al wel beschreven, maar ook didactisch. Soms j...",Geen afstandsregels regels voor kinderen uit e...,Geen andere regels dan normaal. Geen afstand h...,Oneens,Mondkapjes in de gang: kinderen zitten de hele...,Nauwelijks,"Mondkapje in de gangen wordt amper gedragen, o...",1


## Translated R code
We were provided R scripts to import and pre-process teacher survey responses. I can't read R code, so I took the 'rename_column_teacher_main.R' and asked Claude to convert it to Python.--MC, 11-04-2025

In [None]:
import pandas as pd
import os
import re
from pathlib import Path

# List files in the directory
# In Jupyter, you may need to change this path to match your environment
files = os.listdir('/Users/majaculjak/Desktop/RawTQ')

# Check files that contain columns matching the pattern "^x\d+_Q70_\d+$"
for file in files:
    data = pd.read_spss(os.path.join('/Users/majaculjak/Desktop/RawTQ', file))
    if any(re.match(r'^x\d+_Q70_\d+$', col) for col in data.columns):
        print(file)

# Processing a specific file (in R this was done manually by changing file index)
# Select a file for processing
file = files[0]  # You would change this index manually to process different files
data = pd.read_spss(os.path.join('/Users/majaculjak/Desktop/RawTQ', file))

# Function to get variable label (mimics attr(data[[col]], "label") in R)
def get_label(data, column):
    try:
        return data[column].attrs['label']
    except (KeyError, AttributeError):
        return ""

# Rename columns based on patterns
new_column_names = {}

for col in data.columns:
    # CPCQ pattern
    if 'CPCQ' in col:
        cpcq_item = col.split('_')[0]
        new_column_names[col] = f'T_{cpcq_item}'
    
    # MSSP pattern
    elif 'MSSP' in col:
        mssp_item = col.split('_')[0]
        new_column_names[col] = f'T_{mssp_item}'
    
    # STRS pattern
    elif 'STRS' in col:
        strs_item = col.split('_')[0]
        new_column_names[col] = f'T_{strs_item}'
    
    # expct pattern
    elif 'expct' in col:
        label = get_label(data, col)
        child_exp = col.split('_')[2] if len(col.split('_')) > 2 else ""
        
        if 'Deze leerling behaalt waarschijnlijk een hoge score op de eindtoets in groep 8' in label:
            new_column_names[col] = f'T_EXP1_{child_exp}'
        elif 'officiele normeringen' in label:
            new_column_names[col] = f'T_EXP1a_{child_exp}'
        elif 'voor zijn/haar doen' in label:
            new_column_names[col] = f'T_EXP1b_{child_exp}'
        elif 'werkhouding' in label:
            new_column_names[col] = f'T_EXP2_{child_exp}'
    
    # ID columns
    elif 'SchoolID' in col:
        new_column_names[col] = 'School_ID'
    elif 'classID' in col:
        new_column_names[col] = 'Class_ID'
    elif 'teacherID' in col:
        new_column_names[col] = 'Teacher_ID'
    
    # Print columns with Q for inspection
    elif 'Q' in col:
        print(col)
        print(get_label(data, col))

# Rename Covid questions
for col in data.columns:
    label = get_label(data, col)
    
    if 'bent u het eens met de huidige regels die gelden in de school' in label:
        new_column_names[col] = 'T_COV3'
    elif 'toelichting' in label:
        if 'Q57' in col:
            new_column_names[col] = 'T_COV3_remarks'
        elif 'Q54' in col:
            new_column_names[col] = 'T_COV3_remarks'
        elif 'Q56' in col:
            new_column_names[col] = 'T_COV4_remarks'
        elif 'Q55' in col:
            new_column_names[col] = 'T_COV4_remarks'
    elif 'om de regels toe te passen in uw klas' in label:
        new_column_names[col] = 'T_COV4'
    elif 'Tot slot' in label:
        new_column_names[col] = 'T_general_remarks'

# Rename teacher considerations seating
for col in data.columns:
    if 'Q62' in col:
        new_column_names[col] = 'TCSA1'
    elif 'Q64' in col:
        new_column_names[col] = 'TCSA2'
    elif 'Q65' in col:
        new_column_names[col] = 'TCSA3'
    elif 'Q66' in col:
        new_column_names[col] = 'TCSA4'
    elif 'Q67' in col:
        new_column_names[col] = 'TCSA5'

# Process IOP questions
# Note: You would need to define keyfile here or load it from somewhere
# keyfile = pd.read_csv("path_to_keyfile.csv")  # Adjust according to your keyfile location

# This is placeholder code - you need to adjust based on your keyfile structure
keyfile = pd.DataFrame({
    'id': [],
    'name': []
})

for i, col in enumerate(data.columns):
    # Process Q68_X type columns (IOP)
    match = re.match(r'^Q68_(\d+)$', col)
    if match:
        child_number = match.group(1)
        label = get_label(data, col)
        child_name = re.sub(r'[^a-zA-Z0-9]', '', re.sub(r'.*\- ', '', label))
        
        if not keyfile.empty:  # Only proceed if keyfile has data
            matching_ids = keyfile.loc[keyfile['name'] == child_name, 'id']
            if not matching_ids.empty:
                child_id = matching_ids.iloc[0]
                new_column_names[col] = f"IOP_c{child_id}"
    
    # Process x1_Q70_X type columns (IOP_more)
    match = re.match(r'^x(\d+)_Q70_(\d+)$', col)
    if match:
        child_number = match.group(1)
        item_number = int(match.group(2))
        
        label = get_label(data, col)
        child_name = re.sub(r'[^a-zA-Z0-9]', '', re.sub(r'.*- (.*?) -.*', r'\1', label))
        
        if not keyfile.empty:  # Only proceed if keyfile has data
            matching_ids = keyfile.loc[keyfile['name'] == child_name, 'id']
            if not matching_ids.empty:
                child_id = matching_ids.iloc[0]
                if item_number == 11:
                    new_column_names[col] = f"IOP_more_Rother1_c{child_id}"
                else:
                    new_column_names[col] = f"IOP_more_R{item_number}_c{child_id}"
    
    # Process xX_Q70_11_TEXT type columns
    match = re.match(r'^x(\d+)_Q70_11_TEXT$', col)
    if match:
        child_number = match.group(1)
        
        label = get_label(data, col)
        child_name = re.sub(r'[^a-zA-Z0-9]', '', re.sub(r'.*- (.*?) -.*', r'\1', label))
        
        if not keyfile.empty:  # Only proceed if keyfile has data
            matching_ids = keyfile.loc[keyfile['name'] == child_name, 'id']
            if not matching_ids.empty:
                child_id = matching_ids.iloc[0]
                new_column_names[col] = f"IOP_more_Rother2_c{child_id}"
    
    # Process x1_Q71_X type columns (IOP_less)
    match = re.match(r'^x(\d+)_Q71_(\d+)$', col)
    if match:
        child_number = match.group(1)
        item_number = int(match.group(2))
        
        label = get_label(data, col)
        child_name = re.sub(r'[^a-zA-Z0-9]', '', re.sub(r'.*- (.*?) -.*', r'\1', label))
        
        if not keyfile.empty:  # Only proceed if keyfile has data
            matching_ids = keyfile.loc[keyfile['name'] == child_name, 'id']
            if not matching_ids.empty:
                child_id = matching_ids.iloc[0]
                if item_number == 8:
                    new_column_names[col] = f"IOP_less_Rother1_c{child_id}"
                else:
                    new_column_names[col] = f"IOP_less_R{item_number}_c{child_id}"
    
    # Process xX_Q71_8_TEXT type columns
    match = re.match(r'^x(\d+)_Q71_8_TEXT$', col)
    if match:
        child_number = match.group(1)
        
        label = get_label(data, col)
        child_name = re.sub(r'[^a-zA-Z0-9]', '', re.sub(r'.*- (.*?) -.*', r'\1', label))
        
        if not keyfile.empty:  # Only proceed if keyfile has data
            matching_ids = keyfile.loc[keyfile['name'] == child_name, 'id']
            if not matching_ids.empty:
                child_id = matching_ids.iloc[0]
                new_column_names[col] = f"IOP_less_Rother2_c{child_id}"

# Rename columns using the mappings
data = data.rename(columns=new_column_names)

# Remove columns matching pattern Q60_[123]
columns_to_keep = [col for col in data.columns if not re.match(r'.*Q60_[123]', col)]
data = data[columns_to_keep]

# Save file
file_name = file.split('.')[0]
output_dir = Path('Raw_t1')
output_dir.mkdir(exist_ok=True)
data.to_csv(output_dir / f"{file_name}.csv", index=False)

## Translated R code (optimized)
Then I asked Claude to optimize the code. -- Maja, 11-04-2025

In [99]:
import pandas as pd
import os
import re
from pathlib import Path

In [100]:
def get_label(data, column):
    """Get the label attribute of a column if it exists."""
    try:
        return data[column].attrs['label']
    except (KeyError, AttributeError):
        return ""

def extract_child_id(label):
    """
    Extract child ID from label text as a string.
    Attempts to find the first numeric ID in the label.
    """
    try:
        # Extract the first sequence of digits from the label
        digits_match = re.search(r'(\d+)', str(label))
        if digits_match:
            return str(digits_match.group(1))
        return None
    except Exception:
        return None

In [101]:
def process_teacher_questionnaire(data_dir, output_dir, keyfile_sid=None):
    """
    Process teacher questionnaire data files by renaming columns according to specific patterns.
    
    Args:
        data_dir: Directory containing SPSS data files
        output_dir: Directory where processed CSV files will be saved
        keyfile_sid: Dictionary mapping student IDs to sIDs
    """
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(exist_ok=True)
    
    # List files in the directory
    files = os.listdir(data_dir)
    
    # Find files with x#_Q70_# pattern
    q70_files = []
    for file in files:
        try:
            data = pd.read_spss(os.path.join(data_dir, file))
            if any(re.match(r'^x\d+_Q70_\d+$', col) for col in data.columns):
                q70_files.append(file)
                print(f"Found Q70 pattern in file: {file}")
        except Exception as e:
            print(f"Error reading {file}: {e}")
    
    # Process each file
    for file in files:
        try:
            print(f"Processing {file}...")
            data = pd.read_spss(os.path.join(data_dir, file))
            
            # Dictionary to store column renamings
            rename_map = {}
            
            # Process columns by pattern
            for col in data.columns:
                # Get column label
                label = get_label(data, col)
                
                # Process by pattern
                if 'CPCQ' in col or 'MSSP' in col or 'STRS' in col:
                    # Standard questionnaire items
                    prefix = col.split('_')[0]
                    rename_map[col] = f'T_{prefix}'
                
                # Handle expct pattern
                elif 'expct' in col:
                    parts = col.split('_')
                    if len(parts) > 2:
                        child_exp = parts[2]
                        if 'Deze leerling behaalt waarschijnlijk een hoge score op de eindtoets in groep 8' in label:
                            rename_map[col] = f'T_EXP1_{child_exp}'
                        elif 'officiele normeringen' in label:
                            rename_map[col] = f'T_EXP1a_{child_exp}'
                        elif 'voor zijn/haar doen' in label:
                            rename_map[col] = f'T_EXP1b_{child_exp}'
                        elif 'werkhouding' in label:
                            rename_map[col] = f'T_EXP2_{child_exp}'
                
                # Handle ID columns
                elif col == 'SchoolID':
                    rename_map[col] = 'School_ID'
                elif col == 'classID':
                    rename_map[col] = 'Class_ID'
                elif col == 'teacherID':
                    rename_map[col] = 'Teacher_ID'
                
                # Handle Covid questions
                elif 'bent u het eens met de huidige regels die gelden in de school' in label:
                    rename_map[col] = 'T_COV3'
                elif 'toelichting' in label and ('Q57' in col or 'Q54' in col):
                    rename_map[col] = 'T_COV3_remarks'
                elif 'toelichting' in label and ('Q56' in col or 'Q55' in col):
                    rename_map[col] = 'T_COV4_remarks'
                elif 'om de regels toe te passen in uw klas' in label:
                    rename_map[col] = 'T_COV4'
                elif 'Tot slot' in label:
                    rename_map[col] = 'T_general_remarks'
                
                # Handle teacher considerations seating
                elif col.startswith('Q6'):
                    q_num = col.split('_')[0]
                    if q_num == 'Q62':
                        rename_map[col] = 'TCSA1'
                    elif q_num == 'Q64':
                        rename_map[col] = 'TCSA2'
                    elif q_num == 'Q65':
                        rename_map[col] = 'TCSA3'
                    elif q_num == 'Q66':
                        rename_map[col] = 'TCSA4'
                    elif q_num == 'Q67':
                        rename_map[col] = 'TCSA5'
            
            # Process IOP questions if keyfile is available
            if keyfile_sid:
                for col in data.columns:
                    # IOP base questions (Q68_X)
                    match = re.match(r'^Q68_(\d+)$', col)
                    if match:
                        child_id = extract_child_id(get_label(data, col))
                        if child_id in keyfile_sid:
                            sid = keyfile_sid[child_id]
                            rename_map[col] = f"IOP_c{sid}"
                    
                    # IOP more questions (x#_Q70_X)
                    match = re.match(r'^x(\d+)_Q70_(\d+)$', col)
                    if match:
                        item_number = int(match.group(2))
                        child_id = extract_child_id(get_label(data, col))
                        if child_id in keyfile_sid:
                            sid = keyfile_sid[child_id]
                            suffix = f"Rother1_c{sid}" if item_number == 11 else f"R{item_number}_c{sid}"
                            rename_map[col] = f"IOP_more_{suffix}"
                    
                    # IOP more other text (x#_Q70_11_TEXT)
                    match = re.match(r'^x(\d+)_Q70_11_TEXT$', col)
                    if match:
                        child_id = extract_child_id(get_label(data, col))
                        if child_id in keyfile_sid:
                            sid = keyfile_sid[child_id]
                            rename_map[col] = f"IOP_more_Rother2_c{sid}"
                    
                    # IOP less questions (x#_Q71_X)
                    match = re.match(r'^x(\d+)_Q71_(\d+)$', col)
                    if match:
                        item_number = int(match.group(2))
                        child_id = extract_child_id(get_label(data, col))
                        if child_id in keyfile_sid:
                            sid = keyfile_sid[child_id]
                            suffix = f"Rother1_c{sid}" if item_number == 8 else f"R{item_number}_c{sid}"
                            rename_map[col] = f"IOP_less_{suffix}"
                    
                    # IOP less other text (x#_Q71_8_TEXT)
                    match = re.match(r'^x(\d+)_Q71_8_TEXT$', col)
                    if match:
                        child_id = extract_child_id(get_label(data, col))
                        if child_id in keyfile_sid:
                            sid = keyfile_sid[child_id]
                            rename_map[col] = f"IOP_less_Rother2_c{sid}"
            
            # Rename columns
            data = data.rename(columns=rename_map)
            
            # Remove columns matching pattern Q60_[123]
            data = data[[col for col in data.columns if not re.match(r'.*Q60_[123]', col)]]
            
            # Save file
            file_name = file.split('.')[0]
            data.to_csv(os.path.join(output_dir, f"{file_name}.csv"), index=False)
            print(f"Saved {file_name}.csv")
            
        except Exception as e:
            print(f"Error processing {file}: {e}")

In [105]:
# Set your directory paths
data_dir = '/Users/majaculjak/Desktop/RawTQ'  # Change this to your input directory
output_dir = '/Users/majaculjak/Desktop/ProcessedTQ'  # Change this to your output directory

# Load or create your keyfile dictionary
# Option 1: Load students data and create ID to sID mapping
# students = pd.read_csv('path_to_your_students_data.csv')  # Change to your actual path
# keyfile_sid = dict(zip(students['ID'], students['sID']))

# Option 2: For testing with a small dictionary
# keyfile_sid = {'123': 'S123', '456': 'S456'}  # Example mapping

# Process the files
process_teacher_questionnaire(data_dir, output_dir, keyfile_sid)

Processing tq1_10_64.sav...
Saved tq1_10_64.csv
Processing tq1_10_65.sav...
Saved tq1_10_65.csv


In [103]:
# temp = pd.read_csv('/Volumes/WRKGRP/STD-FSW-BSI-SD-Movement_Tracking/Teacher Questionnaire Processing/Raw_t1/tq1_32_75.csv')
# temp.melt()

In [106]:
temp = pd.read_csv('/Users/majaculjak/Desktop/ProcessedTQ/tq1_10_64.csv')
temp.melt()

Unnamed: 0,variable,value
0,StartDate,2021-11-17 03:05:41
1,EndDate,2021-11-18 00:02:39
2,Status,IP Address
3,Progress,100.0
4,Duration__in_seconds_,75418.0
...,...,...
6158,Q53,Heel erg eens
6159,Q54,Wij zijn een school die redelijk nuchter met d...
6160,Q55,Volledig
6161,Q56,Omdat er nu niet echt scherpe maatregelen zijn.
