In [1]:
import pandas as pd
from scipy import stats
import glob
import os
import numpy as np
import re

### Import Raw Data .csv

In [2]:
def load_csv_data_from_directory(directory_path):
    csv_files = glob.glob(os.path.join(directory_path, '*.csv'))
    print(f'Found {len(csv_files)} CSV files.') 
    
    dataframes_list = []
    
    for csv_file in csv_files:
        print(f'Reading {csv_file}...')  
        df = pd.read_csv(csv_file)
        dataframes_list.append(df)
    
    if dataframes_list:
        combined_dataframe = pd.concat(dataframes_list, ignore_index=True)
        return combined_dataframe
    else:
        print("No CSV files found in the directory.")
        return pd.DataFrame() 

In [3]:
def load_csv_files(path, prefix=None, contains=None):
    files = os.listdir(path)
    dataframes = {}
    for file in files:
        if file.endswith('.csv'):
            if prefix and not file.startswith(prefix):
                continue
            if contains and contains not in file:
                continue
            df = pd.read_csv(os.path.join(path, file))
            key = os.path.splitext(file)[0]
            dataframes[key] = df
    return dataframes

path = "ParticipantData"
active_time_prefix = 'activeTime_participant'
feedback_contains = 'feedback'
answers_contains = 'answers'

# Load activeTime files
activeTime_dataframes = load_csv_files(path, prefix=active_time_prefix)

# Load feedback files
feedback_files = [f for f in os.listdir(path) if feedback_contains in f]
feedback_dataframes = load_csv_files(path, contains=feedback_contains)

# Load answers files
answers_files = [f for f in os.listdir(path) if answers_contains in f]
answers_dataframes = load_csv_files(path, contains=answers_contains)

print("Active Time Dataframes:", list(activeTime_dataframes.keys()))
print("Feedback Files:", feedback_files)
print("Answers Files:", answers_files)

Active Time Dataframes: ['activeTime_participant10__2023_12_7_19_13_57', 'activeTime_participant15__2023_12_11_14_2_52', 'activeTime_participant16__2023_12_11_16_15_30', 'activeTime_participant20__2023_12_11_18_20_19', 'activeTime_participant21__2023_12_12_11_20_59', 'activeTime_participant22__2023_12_12_15_10_1', 'activeTime_participant23__2023_12_12_18_1_13', 'activeTime_participant24__2023_12_13_13_35_48', 'activeTime_participant25__2023_12_13_18_34_56', 'activeTime_participant26__2023_12_14_14_28_0', 'activeTime_participant27__2023_12_14_16_12_49', 'activeTime_participant28__2023_12_14_17_58_57', 'activeTime_participant29__2023_12_15_10_17_24', 'activeTime_participant2__2023_12_7_12_29_23', 'activeTime_participant30__2023_12_15_13_32_17', 'activeTime_participant31__2023_12_15_16_5_36', 'activeTime_participant32__2023_12_18_10_21_54', 'activeTime_participant33__2023_12_18_12_14_51', 'activeTime_participant34__2023_12_18_14_9_13', 'activeTime_participant35__2023_12_18_16_10_31', 'act

### Reation Time (Active_Time) Features

In [6]:
# Read participant_indexes from participant_indexes.txt as int
participants_indexes = []
with open("participants_indexes.txt") as f:
    for line in f:
        participants_indexes.append(int(line.strip()))

In [7]:
activeTime_data = {}

for key in activeTime_dataframes.keys():
    participant_df = activeTime_dataframes[key] 

    phases = ['Start', 'Calibration', 'Tutorial', 'Rest']
    end_phases = ['EndofStart', 'EndofCalibration', 'EndofTutorial', 'EndofRest']
    
    durations = {}

    for phase, end_phase in zip(phases, end_phases):
        phase_df = participant_df[participant_df['ActivePhase'] == phase]
        end_phase_df = participant_df[participant_df['EndActivePhase'] == end_phase]

        # Calculate the duration of each phase
        duration = end_phase_df['Timestamp'].values - phase_df['Timestamp'].values 
        
        durations[phase] = duration

    # Extract the participant number from the key
    participant_number = int(key.split('participant')[1].split('_')[0])

    activeTime_data[participant_number] = durations

print(activeTime_data)

{10: {'Start': array([256.7826]), 'Calibration': array([5.823]), 'Tutorial': array([590.3329]), 'Rest': array([300.0045])}, 15: {'Start': array([127.475]), 'Calibration': array([8.6062]), 'Tutorial': array([717.8423]), 'Rest': array([300.0025])}, 16: {'Start': array([681.0095]), 'Calibration': array([5.9546]), 'Tutorial': array([389.0769]), 'Rest': array([300.006])}, 20: {'Start': array([443.1895]), 'Calibration': array([36.074]), 'Tutorial': array([519.9725]), 'Rest': array([300.007])}, 21: {'Start': array([738.7303]), 'Calibration': array([5.5453]), 'Tutorial': array([439.4244]), 'Rest': array([300.003])}, 22: {'Start': array([563.3018]), 'Calibration': array([30.0822]), 'Tutorial': array([548.777]), 'Rest': array([300.008])}, 23: {'Start': array([459.7932]), 'Calibration': array([4.9753]), 'Tutorial': array([382.1331]), 'Rest': array([300.0034])}, 24: {'Start': array([397.2629]), 'Calibration': array([55.9613]), 'Tutorial': array([331.3882]), 'Rest': array([300.0106])}, 25: {'Start'

In [8]:
experiment_set_times_dict = {}

for key in activeTime_dataframes.keys():
    participant_df = activeTime_dataframes[key] 

    experiment_times = []
    start_time = None
    count = 0

    for index, row in participant_df.iterrows():
        if row['ActivePhase'] == 'Experimental':
            # If this is the first 'Experimental'entry, store the timestamp
            if count == 0:
                start_time = row['Timestamp']
            count += 1
        elif row['ActivePhase'] == 'Feedback'and count == 5:
            end_time = row['Timestamp']
            experiment_times.append(end_time - start_time)

            # Reset the count and start_time
            count = 0
            start_time = None

    # Extract the participant number from the key
    participant_number = int(key.split('participant')[1].split('_')[0]) #TODO

    experiment_set_times_dict[participant_number] = experiment_times

print(experiment_set_times_dict)

{10: [169.0250000000001, 92.92099999999982, 184.56799999999998, 122.03199999999993, 157.93900000000008, 116.29199999999992, 104.90600000000018, 138.31500000000005], 15: [174.07399999999984, 105.14300000000003, 167.7829999999999, 127.3599999999999, 151.72399999999993, 121.98799999999983, 129.42499999999973, 126.25599999999986], 16: [132.40499999999997, 54.61400000000003, 72.32999999999993, 77.66500000000019, 81.91300000000001, 60.605999999999995, 59.15099999999984, 61.825000000000045], 20: [184.24399999999991, 124.048, 122.94100000000003, 109.7170000000001, 121.19999999999982, 95.93500000000017, 121.57999999999993, 131.64000000000033], 21: [123.9090000000001, 89.47000000000003, 69.54300000000012, 73.78600000000006, 98.08699999999999, 73.02300000000014, 74.52800000000002, 90.41399999999976], 22: [71.298, 63.480999999999995, 74.90000000000009, 56.41599999999994, 74.99099999999999, 65.97300000000018, 73.77700000000004, 87.13099999999986], 23: [112.83200000000011, 54.565000000000055, 67.071

In [9]:
experiment_phase_trials_dict = {}

for key in activeTime_dataframes.keys():
    activeTimedf = activeTime_dataframes[key]

    # Filter the DataFrame to find rows where ActivePhase is 'Experimental'and CurrentTrial is between 0 and 31
    experiment_phase_trials = activeTimedf[
        (activeTimedf['ActivePhase'].str.strip() == 'Experimental') &
        (activeTimedf['CurrentTrial'].between(0, 31))
    ]

    # Extract Timestamp values for these specific trials
    timestamps = experiment_phase_trials['Timestamp'] #TODO
    experiment_phase_trials = experiment_phase_trials.sort_values('CurrentTrial')
    experiment_phase_trials['TrialDuration'] = experiment_phase_trials['Timestamp'].shift(-1) - experiment_phase_trials['Timestamp']

    # Find the last 'Experimental' entry and its timestamp
    last_experimental_timestamp = activeTimedf[
        (activeTimedf['ActivePhase'] == 'Experimental') & 
        (activeTimedf['CurrentTrial'].astype(int) == 31)
    ]['Timestamp'].values[0]

    # Find the timestamp of the last 'Feedback'entry after the last 'Experimental' entry
    last_feedback_timestamp = activeTimedf[
        (activeTimedf['ActivePhase'] == 'Feedback') & 
        (activeTimedf['Timestamp'] > last_experimental_timestamp)
    ]['Timestamp'].values[0]

    # Calculate the duration of the last trial
    last_trial_duration = last_feedback_timestamp - last_experimental_timestamp
    experiment_phase_trials = experiment_phase_trials[:-1]
    
    last_trial_row = pd.DataFrame({
        'CurrentTrial': [31],
        'Timestamp': [last_experimental_timestamp],
        'TrialDuration': [last_trial_duration]
    })

    experiment_phase_trials = experiment_phase_trials.append(last_trial_row, ignore_index=True)
    experiment_phase_trials = experiment_phase_trials.sort_values('CurrentTrial')

    # Extract the participant number from the key
    participant_number = int(key.split('participant')[1].split('_')[0]) #TODO

    experiment_phase_trials_dict[participant_number] = experiment_phase_trials[['CurrentTrial', 'Timestamp', 'TrialDuration']]

experiment_phase_trials_dict

{10:     CurrentTrial  Timestamp  TrialDuration
 0              0   1214.735         42.870
 1              1   1257.605         38.501
 2              2   1296.106         15.808
 3              3   1311.914         23.400
 4              4   1335.314         28.160
 5              5   1363.474         27.248
 6              6   1390.722         25.673
 7              7   1416.395         31.456
 8              8   1447.851         47.656
 9              9   1495.507         41.215
 10            10   1536.722         47.697
 11            11   1584.419         51.280
 12            12   1635.699         23.336
 13            13   1659.035         32.405
 14            14   1691.440         36.050
 15            15   1727.490         35.850
 16            16   1763.340         44.011
 17            17   1807.351         37.563
 18            18   1844.914         45.729
 19            19   1890.643         33.789
 20            20   1924.432         35.674
 21            21   1960.106

### 9-Point Paas Score (Feedback) Features

In [10]:
feedback_keys = []
feedback_dataframes = {}

directory = 'ParticipantData'

for file in feedback_files:
    df = pd.read_csv(os.path.join(directory, file)) 
    key = file 
    
    feedback_keys.append(key)
    feedback_dataframes[key] = df
print(feedback_keys)

['feedback_participant10__2023_12_7_19_13_57.csv', 'feedback_participant15__2023_12_11_14_2_52.csv', 'feedback_participant16__2023_12_11_16_15_30.csv', 'feedback_participant20__2023_12_11_18_20_19.csv', 'feedback_participant21__2023_12_12_11_20_59.csv', 'feedback_participant22__2023_12_12_15_10_1.csv', 'feedback_participant23__2023_12_12_18_1_13.csv', 'feedback_participant24__2023_12_13_13_35_48.csv', 'feedback_participant25__2023_12_13_18_34_56.csv', 'feedback_participant26__2023_12_14_14_28_0.csv', 'feedback_participant27__2023_12_14_16_12_49.csv', 'feedback_participant28__2023_12_14_17_58_57.csv', 'feedback_participant29__2023_12_15_10_17_24.csv', 'feedback_participant2__2023_12_7_12_29_23.csv', 'feedback_participant30__2023_12_15_13_32_17.csv', 'feedback_participant31__2023_12_15_16_5_36.csv', 'feedback_participant32__2023_12_18_10_21_54.csv', 'feedback_participant33__2023_12_18_12_14_51.csv', 'feedback_participant34__2023_12_18_14_9_13.csv', 'feedback_participant35__2023_12_18_16_

In [11]:
feedback = {}

for key in feedback_keys:
    feedbackdf = feedback_dataframes[key]

    # Get Tutorial rows in ActivePhase column
    tutorial_feedback = feedbackdf[feedbackdf['ActivePhase'] == 'Tutorial']
    tutorial_feedback_score = tutorial_feedback['FeedbackScore']

    # Get Rest row in ActivePhase column
    rest_feedback = feedbackdf[feedbackdf['ActivePhase'] == 'Rest']
    rest_feedback_score = rest_feedback['FeedbackScore']

    # If there are multiple 'Rest'entries, discard the first one
    if len(rest_feedback) > 1:
        rest_feedback = rest_feedback.iloc[1:]

    # Get FeedbackScore column for every instance of "Experimental" in ActivePhase column
    experimental_feedback = feedbackdf[feedbackdf['ActivePhase'] == 'Experimental']
    experimental_feedback['Number'] = (experimental_feedback['ActivePhase'] == 'Experimental').cumsum()
    experimental_feedback_score = experimental_feedback['FeedbackScore']

    # Concatenate all phases together and reset the index
    all_phases = pd.concat([tutorial_feedback, rest_feedback, experimental_feedback]) #TODO
    all_phases = all_phases.reset_index(drop=True)

    # Extract the participant number from the key
    participant_number = int(key.split('participant')[1].split('_')[0]) #TODO

    feedback[participant_number] = {
        'tutorial': all_phases[all_phases['ActivePhase'] == 'Tutorial'][['ActivePhase', 'StimulusQuestion', 'FeedbackScore', 'TimeDifference']],
        'rest': all_phases[all_phases['ActivePhase'] == 'Rest'][['ActivePhase', 'FeedbackScore', 'TimeDifference']],
        'experimental': all_phases[all_phases['ActivePhase'] == 'Experimental'][['ActivePhase', 'Number', 'FeedbackScore', 'TimeDifference']]
    }

print(feedback[30]['rest'])

  ActivePhase  FeedbackScore  TimeDifference
3        Rest              1         9.02478


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  experimental_feedback['Number'] = (experimental_feedback['ActivePhase'] == 'Experimental').cumsum()


### Accuracy (Correct/Incorrect) Features

In [13]:
answers_keys = []
answers_dataframes = {}

for file in answers_files:
    df = pd.read_csv(os.path.join(directory, file)) 
    participant_number = int(file.split('participant')[1].split('_')[0])
    key = participant_number 

    answers_keys.append(key)
    answers_dataframes[key] = df
print(answers_keys)

[10, 15, 16, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 2, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 3, 40, 41, 44, 45, 46, 47, 48, 49, 4, 50, 51, 52, 53, 54, 56, 57, 59, 60, 61, 62, 65, 66, 67, 68, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81]


In [14]:
answers_data = {}

for key in answers_keys:
    answersdf = answers_dataframes[key]
    answersdf['IsCorrect'] = answersdf['IsCorrect'].astype(int)

    # Calculate the sum of correct and incorrect answers
    correct_sum = answersdf['IsCorrect'].sum()
    incorrect_sum = len(answersdf) - correct_sum

    # Calculate the overall accuracy
    overall_accuracy = (correct_sum / 32) * 100

    sum_df = pd.DataFrame({
        'Score': ['Correct', 'Incorrect'],
        'Total Trials': [correct_sum, incorrect_sum]
    })

    answers_data[key] = {'Total': sum_df, 'Accuracy': overall_accuracy}

    # Create a new column 'SubExperiment'that assigns a unique sub-experiment number to each set of 4 rows
    answersdf['SubExperiment'] = answersdf.index // 4

    # Group by 'SubExperiment'and calculate the sum of correct and incorrect answers for each group
    subexp_data = answersdf.groupby('SubExperiment')['IsCorrect'].agg(['sum', 'count'])
    subexp_data['Incorrect'] = subexp_data['count'] - subexp_data['sum']

    # Calculate the accuracy for each sub-experiment
    subexp_data['Accuracy'] = (subexp_data['sum'] / 32) * 100
    answers_data[key]['SubExperiments'] = subexp_data

    # Create a new column 'Trial'that assigns a unique trial number to each row
    answersdf['Trial'] = answersdf.index + 1

    # Group by 'Trial'and calculate the sum of correct and incorrect answers for each trial
    trial_data = answersdf.groupby(['SubExperiment', 'Trial'])['IsCorrect'].agg(['sum', 'count'])
    trial_data['Incorrect'] = trial_data['count'] - trial_data['sum']
    answers_data[key]['Trials'] = trial_data

print(answers_data[2]['Accuracy'])

answers_data

46.875


{10: {'Total':        Score  Total Trials
  0    Correct            22
  1  Incorrect            10,
  'Accuracy': 68.75,
  'SubExperiments':                sum  count  Incorrect  Accuracy
  SubExperiment                                 
  0                3      4          1     9.375
  1                3      4          1     9.375
  2                0      4          4     0.000
  3                4      4          0    12.500
  4                4      4          0    12.500
  5                2      4          2     6.250
  6                3      4          1     9.375
  7                3      4          1     9.375,
  'Trials':                      sum  count  Incorrect
  SubExperiment Trial                       
  0             1        1      1          0
                2        1      1          0
                3        0      1          1
                4        1      1          0
  1             5        1      1          0
                6        1      1          0

### Extract Reaction Time, 9-Point Paas Scale, and Accuracy Features into Dataframe for .csv

In [15]:
data = []

for participant_id in participants_indexes:
    participant_activeTime_data = activeTime_data[participant_id]
    participant_experiment_set_times = experiment_set_times_dict[participant_id]
    participant_experiment_phase_trials = experiment_phase_trials_dict[participant_id]['TrialDuration']
    participant_feedback = feedback[participant_id]
    participant_answers_data = answers_data[participant_id]

    for subexperiment in range(1, 9):
        participant_data = {
            'participant_number': participant_id,
            'subexperiment_number': subexperiment,
            #'start_time': participant_activeTime_data['Start'][0],
            #'calibration_time': participant_activeTime_data['Calibration'][0],
            #'tutorial_time': participant_activeTime_data['Tutorial'][0],
            'rest_time': participant_activeTime_data['Rest'][0],
            'time_subexperiment': participant_experiment_set_times[subexperiment-1],
            'accuracy_subexperiment' : participant_answers_data['SubExperiments'].loc[subexperiment-1, 'sum']/4 * 100,
            'accuracy_total': participant_answers_data['Accuracy'],
            'answer_correct_subexperiment': participant_answers_data['SubExperiments'].loc[subexperiment-1, 'sum'],
            'answer_incorrect_subexperiment': participant_answers_data['SubExperiments'].loc[subexperiment-1, 'Incorrect'],
            'feedback_score_subexperiment': participant_feedback['experimental']['FeedbackScore'][subexperiment+3],
        }

        trial_durations = participant_experiment_phase_trials[(subexperiment-1)*4:subexperiment*4]
        for i, duration in enumerate(trial_durations):
            participant_data[f'trial_duration_{i+1}'] = duration
        participant_data['trial_duration_mean'] = np.mean([duration for duration in trial_durations])

        # Calculate  performance for each subexperiment
        participant_data['performance_subexperiment'] = ((participant_data['accuracy_subexperiment'] / 100) / participant_data['trial_duration_mean']) * 100

        # Add feedback scores for tutorial and rest phases
        for i, score in enumerate(participant_feedback['tutorial']['FeedbackScore']):
            participant_data[f'feedback_score_tutorial_{i+1}'] = score
        for i, score in enumerate(participant_feedback['rest']['FeedbackScore']):
            participant_data[f'feedback_score_rest_{i+1}'] = score

        data.append(participant_data)

behavioural_analysis_df = pd.DataFrame(data)

print(behavioural_analysis_df)

     participant_number  subexperiment_number  rest_time  time_subexperiment  \
0                     2                     1   300.0060             121.681   
1                     2                     2   300.0060             109.627   
2                     2                     3   300.0060             140.315   
3                     2                     4   300.0060              89.458   
4                     2                     5   300.0060             144.345   
..                  ...                   ...        ...                 ...   
475                  81                     4   300.0065              48.956   
476                  81                     5   300.0065              70.082   
477                  81                     6   300.0065              66.879   
478                  81                     7   300.0065              63.741   
479                  81                     8   300.0065              73.119   

     accuracy_subexperiment  accuracy_t

### Output Dataframe into .csv

In [16]:
behavioural_analysis_df.to_csv('behavioural_analysis.csv', index=False)