# Preprocessing Questionnaire data to dataFrame #

This script reads each participant's questionnaire data (stored in CSV files) and creates a single CSV file with the most important information, ready for data analysis.

The output CSV will have the following columns:

participantID\
Age: {1: "18-24", 2: "25-34", 3: "35-44", 4: "45-54", 5: "55-64", 6: "65-74"}\
Gender: {0: "Male", 1: "Female", 2: "Diverse"}\
Culture: {Japan, Germany, Jordan}\
Glasses: {0: no eyesight correction, 1: with eyesight correction}\
Handedness: {0: right handed, 1: left handed, 2: both or no preference}\
VR_Proficiency: {1: "Daily", 2: "Weekly", 3: "Few times a month", 4: "Rarely", 5: "First time"}\
Restedness: {1: very tired, 2, 3, 4, 5, 6, 7: fully awake}\
TIPI_Extraversion: Scale 1-7, 1:disagree strongly, 7:agree strongly. Calculated from: TIPI questions 1 (normal) and 6 (reversed)\
TIPI_Agreeableness: Scale 1-7, 1:disagree strongly, 7:agree strongly. Calculated from: TIPI questions 2 (reversed) and 7 (normal)\
TIPI_Conscientiousness: Scale 1-7, 1:disagree strongly, 7:agree strongly. Calculated from: TIPI questions 3 (normal) and 8 (reversed)\
TIPI_Emotional_Stability: Scale 1-7, 1:disagree strongly, 7:agree strongly. Calculated from: TIPI questions 4 (reversed) and 9 (normal)\
TIPI_Openness_to_Experiences: Scale 1-7, 1:disagree strongly, 7:agree strongly. Calculated from: TIPI questions 5 (normal) and 10 (reversed)\
AWE-S_Total_[for each scene]: Mean of the answers to question ids q1 - q30\
IPQ_Total_[for each scene]: Mean of the answers to question ids q31 - q44\
IPQ_Involvement_[for each scene]: Mean of the answers to question ids q37 - q40


In [3]:
import os
import pandas as pd
import re

def calculate_tipi_scores(demo_df):
    """Compute the five TIPI trait scores from the demographic questionnaire.

    The ten TIPI items are read from question ids q9-q18. Every trait is the
    mean of one normally scored item and one reverse-scored item, where a
    reversed answer is ``8 - answer``.

    Args:
        demo_df: DataFrame with 'QuestionID' and 'Answer' columns.

    Returns:
        dict mapping trait name ('Extraversion', 'Agreeableness',
        'Conscientiousness', 'Emotional_Stability',
        'Openness_to_Experiences') to its score.

    Raises:
        IndexError: if one of the question ids q9-q18 is missing.
        ValueError: if an answer cannot be converted with ``int``.
    """
    def answer_of(question_id):
        # Use the first row that carries this question id.
        matches = demo_df.loc[demo_df['QuestionID'] == question_id, 'Answer']
        return int(matches.values[0])

    # trait -> (normally scored item, reverse-scored item)
    item_pairs = {
        'Extraversion': ('q9', 'q14'),
        'Agreeableness': ('q15', 'q10'),
        'Conscientiousness': ('q11', 'q16'),
        'Emotional_Stability': ('q17', 'q12'),
        'Openness_to_Experiences': ('q13', 'q18'),
    }
    return {
        trait: (answer_of(normal_item) + (8 - answer_of(reversed_item))) / 2
        for trait, (normal_item, reversed_item) in item_pairs.items()
    }

def calculate_awes_score(scene_df):
    """Compute the mean AWE-S answer (question ids q1-q30) for every scene.

    Args:
        scene_df: DataFrame with 'condition', 'QuestionID' and 'Answer'
            columns; 'Answer' values must be convertible to float.

    Returns:
        dict mapping each stimulus name to the mean of its q1-q30 answers.
        The value is NaN for a stimulus that has no matching rows.
    """
    stimuli = ['Neutral1', 'Neutral3', 'Waterfall', 'Borealis', 'Underwater', 'AbstractAwe', 'Space', 'Mountain', 'Forest', 'Church']

    # Select items by their numeric id. Comparing the raw ids as strings
    # ('q1' <= id <= 'q30') is lexicographic and silently drops q4-q9.
    question_number = pd.to_numeric(
        scene_df['QuestionID'].astype(str).str.strip().str.extract(r'^q(\d+)$', expand=False),
        errors='coerce',
    )
    # A plain boolean array avoids index alignment on concatenated frames
    # whose row labels repeat.
    awe_rows = scene_df[question_number.between(1, 30).to_numpy()]

    awe_scores = {}
    for stim in stimuli:
        answers = awe_rows.loc[awe_rows['condition'] == stim, 'Answer']
        awe_scores[stim] = answers.astype(float).mean()
    return awe_scores

def calculate_ipq_scores(scene_df):
    """Compute the mean IPQ total and IPQ involvement answer for every scene.

    The total uses question ids q31-q44; involvement uses q37-q40.

    Args:
        scene_df: DataFrame with 'condition', 'QuestionID' and 'Answer'
            columns; 'Answer' values must be convertible to float.

    Returns:
        Tuple ``(ipq_total_scores, ipq_involvement_scores)``; each is a dict
        mapping stimulus name to a mean answer. The value is NaN for a
        stimulus that has no matching rows.
    """
    stimuli = ['Neutral1', 'Neutral3', 'Waterfall', 'Borealis', 'Underwater', 'AbstractAwe', 'Space', 'Mountain', 'Forest', 'Church']

    # Select items by their numeric id. Comparing the raw ids as strings
    # ('q31' <= id <= 'q44') is lexicographic and wrongly lets 'q4' through.
    question_number = pd.to_numeric(
        scene_df['QuestionID'].astype(str).str.strip().str.extract(r'^q(\d+)$', expand=False),
        errors='coerce',
    )
    # Plain boolean arrays avoid index alignment on concatenated frames
    # whose row labels repeat.
    total_rows = scene_df[question_number.between(31, 44).to_numpy()]
    involvement_rows = scene_df[question_number.between(37, 40).to_numpy()]

    ipq_total_scores = {}
    ipq_involvement_scores = {}
    for stim in stimuli:
        total_answers = total_rows.loc[total_rows['condition'] == stim, 'Answer']
        involvement_answers = involvement_rows.loc[involvement_rows['condition'] == stim, 'Answer']
        ipq_total_scores[stim] = total_answers.astype(float).mean()
        ipq_involvement_scores[stim] = involvement_answers.astype(float).mean()
    return ipq_total_scores, ipq_involvement_scores

def parse_csv(filepath):
    """Read a questionnaire CSV whose question text may contain bare commas.

    Every line is stripped and split on ','. A line with more than three
    fields is collapsed to ``[second-to-last field, all earlier fields joined
    with ',', last field]``; lines with three or fewer fields are kept as
    split. The first line (after the same treatment) supplies the column
    names, the remaining lines are the rows.

    Args:
        filepath: Path of the CSV file to read (UTF-8, with or without BOM).

    Returns:
        pandas.DataFrame with all values as strings.

    Raises:
        IndexError: if the file contains no lines.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # 'utf-8-sig' reads plain UTF-8 too and drops a leading byte-order mark.
    # Under plain 'utf-8' a BOM does not raise; it leaks into the first
    # header cell, so a decode-error fallback never removes it.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        rows = [line.strip().split(',') for line in file]

    parsed_data = []
    for row in rows:
        if len(row) > 3:
            # Extra commas are assumed to belong to the question text.
            *question_parts, question_id, answer = row
            parsed_data.append([question_id, ','.join(question_parts), answer])
        else:
            parsed_data.append(row)
    return pd.DataFrame(parsed_data[1:], columns=parsed_data[0])

def _first_answer(demo_df, question_id):
    """Return the 'Answer' of the first row whose 'QuestionID' equals question_id."""
    return demo_df.loc[demo_df['QuestionID'] == question_id, 'Answer'].values[0]


def process_data(base_path, output_path='processed data/combined_questionnaire_dataTest.csv'):
    """Combine every participant's questionnaire files into one CSV file.

    For each country folder ``Questionnaire Data <country>`` below
    ``base_path`` and each participant sub-folder in it, this reads the
    demographic file (name starts with 'questionnaireID_Demo') and all scene
    files (name starts with 'questionnaireID_AWEIPQ'), derives the
    demographic, TIPI, AWE-S and IPQ columns, and writes one row per
    participant to ``output_path``. Each processed participant folder is
    printed.

    Args:
        base_path: Directory containing the per-country folders.
        output_path: Destination CSV file. Its parent directory is created
            when missing.

    Raises:
        FileNotFoundError: if a participant folder has no demographic file.
        ValueError: if a scene file name lacks the
            'condition_<name>_answers' part, or a participant has no scene
            files (raised by ``pd.concat``).
    """
    # Output column -> question id in the demographic questionnaire.
    demographic_items = {
        'Age': 'q1',
        'Gender': 'q2',
        'Culture': 'q3',
        'Glasses': 'q4',
        'Handedness': 'q5',
        'VR_Proficiency': 'q7',
        'Restedness': 'q8',
    }
    condition_pattern = re.compile(r'condition_(.*?)_answers')

    all_data = []
    countries = ['Germany', 'Jordan', 'Japan']
    for country in countries:
        country_path = os.path.join(base_path, f'Questionnaire Data {country}')
        # os.listdir returns entries in arbitrary order; sort so the output
        # rows are reproducible across machines.
        for participant_id in sorted(os.listdir(country_path)):
            participant_path = os.path.join(country_path, participant_id)
            if not os.path.isdir(participant_path):
                # Stray files (e.g. '.DS_Store') are not participants.
                continue

            file_names = sorted(os.listdir(participant_path))
            demo_files = [name for name in file_names if name.startswith('questionnaireID_Demo')]
            if not demo_files:
                raise FileNotFoundError(
                    f"No 'questionnaireID_Demo*' file found in {participant_path}"
                )
            print(participant_path)
            demo_df = parse_csv(os.path.join(participant_path, demo_files[0]))

            participant_data = {'participantID': participant_id}
            for column, question_id in demographic_items.items():
                participant_data[column] = _first_answer(demo_df, question_id)

            tipi_scores = calculate_tipi_scores(demo_df)
            participant_data.update({f'TIPI_{k}': v for k, v in tipi_scores.items()})

            # Tag every scene file's rows with the condition named in its file name.
            scene_frames = []
            for name in file_names:
                if not name.startswith('questionnaireID_AWEIPQ'):
                    continue
                match = condition_pattern.search(name)
                if match is None:
                    raise ValueError(
                        f"Cannot read the condition from scene file name {name!r} in {participant_path}"
                    )
                scene_df = parse_csv(os.path.join(participant_path, name))
                scene_frames.append(scene_df.assign(condition=match.group(1)))
            scene_data = pd.concat(scene_frames)

            awes_scores = calculate_awes_score(scene_data)
            participant_data.update({f'AWE-S_Total_{k}': v for k, v in awes_scores.items()})

            ipq_total_scores, ipq_involvement_scores = calculate_ipq_scores(scene_data)
            participant_data.update({f'IPQ_Total_{k}': v for k, v in ipq_total_scores.items()})
            participant_data.update({f'IPQ_Involvement_{k}': v for k, v in ipq_involvement_scores.items()})

            all_data.append(participant_data)

    # Create the target folder so a missing directory does not discard the
    # finished work.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df = pd.DataFrame(all_data)
    df.to_csv(output_path, index=False)

# Folder that holds the per-country 'Questionnaire Data <country>' directories,
# given relative to the current working directory.
base_path = '../data/'
# Run the preprocessing: prints every participant folder and writes the combined CSV.
process_data(base_path)


../data/Questionnaire Data Germany\2343
../data/Questionnaire Data Germany\3929
../data/Questionnaire Data Germany\4545
../data/Questionnaire Data Germany\5656
../data/Questionnaire Data Germany\71
../data/Questionnaire Data Germany\72
../data/Questionnaire Data Germany\73
../data/Questionnaire Data Germany\7345
../data/Questionnaire Data Germany\7373
../data/Questionnaire Data Germany\74
../data/Questionnaire Data Germany\75
../data/Questionnaire Data Germany\7575
../data/Questionnaire Data Germany\76
../data/Questionnaire Data Germany\777
../data/Questionnaire Data Germany\78
../data/Questionnaire Data Germany\7878
../data/Questionnaire Data Germany\79
../data/Questionnaire Data Germany\81
../data/Questionnaire Data Germany\8585
../data/Questionnaire Data Germany\9191
../data/Questionnaire Data Germany\9898
../data/Questionnaire Data Jordan\1111
../data/Questionnaire Data Jordan\1234
../data/Questionnaire Data Jordan\2222
../data/Questionnaire Data Jordan\2345
../data/Questionnaire D