Imports & loading raw data

In [1]:
import pandas as pd
import numpy as np

# Adjust paths/filenames if needed
trials_path = "Processed_Feedback_Data.csv"
questionnaire_path = "Final Questionnaire.csv"

df_trials_raw = pd.read_csv(trials_path)
df_q_raw = pd.read_csv(questionnaire_path)

print(df_trials_raw.head())
print(df_q_raw.head())


                       Timestamp  Participant Condition   Order Difficulty  \
0  2025/10/30 11:47:57 a. m. CET            2     Alone   First          2   
1  2025/10/30 12:00:14 p. m. CET            2  Audience  Second          2   
2  2025/10/30 12:11:56 p. m. CET            1  Audience   First          3   
3  2025/10/30 12:18:53 p. m. CET            1     Alone  Second          2   
4   2025/10/30 3:23:41 p. m. CET            3  Audience   First          3   

  Satisfaction Nervousness  Time_raw  Time_minutes  
0            3           4  00:10:54     10.900000  
1            3           4  00:10:48     10.800000  
2            3           1  00:07:50      7.833333  
3            3           1  00:05:46      5.766667  
4            1           4  00:14:31     14.516667  
                  Marca de temps Participant code (e.g., p01)  \
0  2025/10/30 12:01:46 p. m. CET                           02   
1  2025/10/30 12:20:10 p. m. CET                           03   
2  2025/10/30 12:2

Clean the final questionnaire data

In [3]:
df_q = df_q_raw.copy()

df_q = df_q.rename(columns={
    'Marca de temps': 'timestamp',
    'Participant code (e.g., p01)': 'participant',
    'How many people do they know from the audience': 'known_audience',
    'Age': 'age',
    'Gender': 'gender',
    'Nationality': 'nationality',
    'How often do you solve Sudoku puzzles? ': 'sudoku_frequency',
    'How comfortable do you generally feel performing or being watched by others? ':
        'comfort_watched',
    'Have you regularly performed in front of an audience (e.g., sports, theater, music, presentations)? ':
        'performed_audience',
    'If yes, how often have you performed in front of an audience? ':
        'performance_frequency'
})

df_q.head()

import re
import numpy as np

# Inspect raw codes (optional but useful for sanity check)
print(df_q['participant'].astype(str).unique())

# Extract the numeric part from each code (e.g., "p01" -> "01", "P14" -> "14")
df_q['participant'] = (
    df_q['participant']
    .astype(str)
    .str.strip()
    .str.extract(r'(\d+)')[0]   # take the first group of digits
)

# Now convert to integer (nullable Int64)
df_q['participant'] = df_q['participant'].astype('Int64')

print(df_q['participant'].unique())

# known_audience should be numeric
df_q['known_audience'] = pd.to_numeric(df_q['known_audience'], errors='coerce').astype('Int64')

# age as integer
df_q['age'] = pd.to_numeric(df_q['age'], errors='coerce').astype('Int64')

# gender, nationality as categorical
df_q['gender'] = df_q['gender'].astype('category')
df_q['nationality'] = df_q['nationality'].astype('category')

# Performed in front of audience: yes/no to boolean
df_q['performed_audience'] = df_q['performed_audience'].str.strip().str.capitalize()
df_q['performed_audience'] = df_q['performed_audience'].replace({'Yes': True, 'No': False})

# Optionally set ordered categories for experience/comfort
sudoku_order = ["Never", "Rarely", "Sometimes", "Often", "Very often"]
df_q['sudoku_frequency'] = pd.Categorical(
    df_q['sudoku_frequency'].str.strip(),
    categories=sudoku_order,
    ordered=True
)

comfort_order = [1, 2, 3, 4, 5]  # if your scale is 1–5, adjust if different
df_q['comfort_watched'] = pd.to_numeric(df_q['comfort_watched'], errors='coerce')
df_q['comfort_watched'] = pd.Categorical(
    df_q['comfort_watched'],
    categories=comfort_order,
    ordered=True
)

df_q.head()

df_q.to_csv("clean_questionnaire.csv", index=False)


['02' '03' '01' '04' '05' '06' '07' '08' '09' '10' '12' 'P14' '13' 'P15'
 '16' '18' '17' '19' '20' '21' '22' '23' '11']
<IntegerArray>
[ 2,  3,  1,  4,  5,  6,  7,  8,  9, 10, 12, 14, 13, 15, 16, 18, 17, 19, 20,
 21, 22, 23, 11]
Length: 23, dtype: Int64


  df_q['performed_audience'] = df_q['performed_audience'].replace({'Yes': True, 'No': False})


Clean feedback data

In [4]:
df_trials = df_trials_raw.copy()

df_trials = df_trials.rename(columns={
    'Timestamp': 'timestamp',
    'Participant': 'participant',
    'Condition': 'condition',
    'Order': 'order',
    'Difficulty': 'difficulty',
    'Satisfaction': 'satisfaction',
    'Nervousness': 'nervousness',
    'Time_raw': 'time_raw',
    'Time_minutes': 'time_minutes'
})

# Ensure types are reasonable
df_trials['participant'] = pd.to_numeric(df_trials['participant'], errors='coerce').astype('Int64')
df_trials['difficulty'] = pd.to_numeric(df_trials['difficulty'], errors='coerce').astype('Int64')

df_trials['time_minutes'] = pd.to_numeric(df_trials['time_minutes'], errors='coerce')

# Optional: set categories for condition and order
df_trials['condition'] = pd.Categorical(df_trials['condition'], categories=['Alone', 'Audience'], ordered=False)
df_trials['order'] = pd.Categorical(df_trials['order'], categories=['First', 'Second'], ordered=False)

# satisfaction/nervousness as numeric Likert
df_trials['satisfaction'] = pd.to_numeric(df_trials['satisfaction'], errors='coerce')
df_trials['nervousness'] = pd.to_numeric(df_trials['nervousness'], errors='coerce')

df_trials.head()

df_trials.to_csv("clean_trials.csv", index=False)


Merge into a single dataset

In [5]:
# Check unique participants in each dataset
print("Participants in trials:", sorted(df_trials['participant'].dropna().unique()))
print("Participants in questionnaire:", sorted(df_q['participant'].dropna().unique()))

# Merge: one row per trial, with questionnaire info duplicated per participant
df_master = df_trials.merge(df_q, on='participant', how='left', suffixes=('_trial', '_q'))

df_master.head()

missing_info = df_master[df_master['age'].isna()]['participant'].unique()
print("Participants with missing questionnaire data:", missing_info)


df_master.to_csv("master_dataset.csv", index=False)

Participants in trials: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23)]
Participants in questionnaire: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23)]
Participants with missing questionnaire data: <IntegerArray>
[]
Length: 0, dtype: Int64
