In [1]:
from pandas import read_csv, DataFrame
from os import getcwd
from re import sub

# Participant count. preprocess_data() uses it to decide how many data rows to
# read from the raw frame: NUM_PARTICIPANTS * 4 rows, starting at row index 2
# (presumably 4 questionnaire rows per participant -- TODO confirm).
NUM_PARTICIPANTS = 24

In [14]:
##########################################################################################
def get_raw_data(csv_path=None):
    """Load the raw questionnaire export and return only its answer columns.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the raw CSV file. When omitted, the original hard-coded
        location of ``main_form.csv`` is used so existing callers keep working.
        NOTE(review): that default is an absolute, machine-specific path;
        callers on other machines should pass ``csv_path`` explicitly.

    Returns
    -------
    pandas.DataFrame
        Every column from position 17 onward, with missing values replaced
        by 1.
    """
    if csv_path is None:
        csv_path = 'c:\\Users\\micha\\OneDrive\\My_GitHub_Repos\\robot-x-ar\\data\\questionnaire\\main_form.csv'

    full_frame = read_csv(csv_path)

    # The first 17 columns are dropped; only the remaining ones are used
    # downstream (presumably the first 17 hold survey metadata -- TODO confirm).
    answers = full_frame.iloc[:, 17:]

    # NOTE(review): missing answers are silently imputed as 1. Confirm this is
    # the intended treatment before relying on averages computed from them.
    answers = answers.fillna(1)

    print("\n Finished reading raw csv file! \n")

    return answers


##########################################################################################
def preprocess_data(dest_path=None):
    """Build the cleaned questionnaire table and write it to a CSV file.

    Reads the raw answers via ``get_raw_data()``, keeps
    ``NUM_PARTICIPANTS * 4`` rows starting at row index 2, converts the
    answers to numbers, derives the average scores and writes the result
    without the index column.

    Column layout of the raw answers (by position):
        0      participant id (converted to int)
        1      condition (kept as-is)
        2      single-scale difficulty, stored as ``8 - value``
        3-8    six TLX items; item 4 (position 6) is stored as ``21 - value``
        9-14   six embodiment items (2 x ownership, 4 x agency)

    Parameters
    ----------
    dest_path : str or path-like, optional
        Where to write the cleaned CSV. When omitted, the original hard-coded
        location of ``main_form_cleaned.csv`` is used so existing callers keep
        working. NOTE(review): that default is an absolute, machine-specific
        path; callers on other machines should pass ``dest_path`` explicitly.

    Returns
    -------
    None
        The result is only written to disk.

    Raises
    ------
    ValueError
        If a value in a numeric column cannot be converted by ``int``/``float``.
    """
    if dest_path is None:
        dest_path = 'c:\\Users\\micha\\OneDrive\\My_GitHub_Repos\\robot-x-ar\\data\\questionnaire\\main_form_cleaned.csv'

    # The first two rows are skipped (presumably extra header rows of the
    # survey export -- TODO confirm) and four rows are read per participant
    # (presumably one per condition -- TODO confirm).
    first_row = 2
    rows_per_participant = 4
    last_row = first_row + NUM_PARTICIPANTS * rows_per_participant

    # Slice the relevant rows once instead of repeating the slice per column.
    answers = get_raw_data().iloc[first_row:last_row]

    def column(position):
        """Return the values of one answer column as a plain list."""
        return answers.iloc[:, position].tolist()

    def float_column(position):
        """Return the values of one answer column converted to float."""
        return [float(value) for value in column(position)]

    def row_means(*columns):
        """Return the per-row arithmetic mean across the given columns."""
        return [sum(values) / len(values) for values in zip(*columns)]

    pids = [int(value) for value in column(0)]
    conditions = column(1)

    # Reverse-scored: stored as 8 - answer (presumably a 1-7 scale -- confirm).
    difficulty = [8.0 - value for value in float_column(2)]

    tlx_mental = float_column(3)
    tlx_physical = float_column(4)
    tlx_hurried = float_column(5)
    # Reverse-scored: stored as 21 - answer (presumably a 1-20 scale -- confirm).
    tlx_successful = [21.0 - value for value in float_column(6)]
    tlx_hard = float_column(7)
    tlx_insecure = float_column(8)
    tlx_ave = row_means(tlx_mental, tlx_physical, tlx_hurried,
                        tlx_successful, tlx_hard, tlx_insecure)

    # Six embodiment items: the first two are ownership, the last four agency.
    emb_1, emb_2, emb_3, emb_4, emb_5, emb_6 = (
        float_column(position) for position in range(9, 15)
    )
    emb_ave = row_means(emb_1, emb_2, emb_3, emb_4, emb_5, emb_6)
    owner_ave = row_means(emb_1, emb_2)
    agency_ave = row_means(emb_3, emb_4, emb_5, emb_6)

    # Key order defines the column order of the written CSV.
    cleaned_df = DataFrame({
        'pid': pids,
        'condition': conditions,
        'difficulty': difficulty,
        'tlx_mental': tlx_mental,
        'tlx_physical': tlx_physical,
        'tlx_hurried': tlx_hurried,
        'tlx_successful': tlx_successful,
        'tlx_hard': tlx_hard,
        'tlx_insecure': tlx_insecure,
        'tlx_ave': tlx_ave,
        'emb_1': emb_1,
        'emb_2': emb_2,
        'emb_3': emb_3,
        'emb_4': emb_4,
        'emb_5': emb_5,
        'emb_6': emb_6,
        'emb_ave': emb_ave,
        'owner_ave': owner_ave,
        'agency_ave': agency_ave
    })

    cleaned_df.to_csv(dest_path, index=False)

    print(" Successfully written pre-processed data to csv file! \n")

In [15]:
# Run the pipeline: read the raw questionnaire CSV and write the cleaned CSV.
preprocess_data()


 Finished reading raw csv file! 

 Successfully written pre-processed data to csv file! 

