In [1]:
# Standard-library and third-party dependencies used by this notebook.
import pandas as pd
import numpy as np
import json
import re
import glob
from datetime import date
import math

# Root directory of the project; later cells build their data paths by
# appending to it.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs as-is on the machine that has it. The commented-out line is an
# alternative root kept for reference.
# main_dir = '/Volumes/synapse/projects/SocialSpace/Projects/Behavior'
main_dir = '/Users/matty_gee/Desktop/SNT/SNT-behavior'

# Project-local helpers, star-imported into the notebook namespace.
# NOTE(review): presumably `compute_task_variables(...)`, which is called in a
# later cell, comes from the module of the same name -- confirm. A star import
# can silently rebind any name imported above it, so keep the import order.
from data_prep_utils import * 
from compute_task_variables import *

from os.path import exists 
import sys
from scipy.stats import rankdata
# Disabled alternative way of loading the project code (kept for reference).
# sys.path.insert(0, main_dir + '/code')
# from code_base import * 

# Character role labels. Later cells append each label to 'dots_affil_' and
# 'dots_power_' to address the per-character columns, so the spelling here
# must match the column suffixes in the summary files.
character_roles = ['first', 'second', 'assistant', 'powerful', 'boss', 'neutral'] # order in matlab



# Aggregate

In [6]:
# compute the behavioral geometry
# Builds the list of organized per-subject task workbooks and hands them,
# together with the dataset root, to compute_task_variables().
base_dir = main_dir + '/Online/Prolific/Data/Original_2021'
# glob.glob() returns matches in arbitrary, filesystem-dependent order; sort
# them so the files are always processed in the same order across runs and
# machines.
task_files = sorted(glob.glob(base_dir + '/Task/Organized/*.xlsx'))
compute_task_variables(base_dir, task_files)

In [3]:
# Load the two post-task summaries (VT and Pavlovia versions), stack them into
# a single `post_df`, and rename the rating columns to '<dimension>_<role>'.
summary_dir = main_dir + '/Online/Prolific/Data/Original_2021/Summary'
# glob order is arbitrary; sorting makes the chosen file reproducible when
# more than one summary matches the pattern.
vt_df = pd.read_excel(sorted(glob.glob(summary_dir + '/Posttask-original-VT_summary_*'))[0])
pav_df = pd.read_excel(sorted(glob.glob(summary_dir + '/Posttask-original-pavlovia_summary_*'))[0])

# combine post task dfs, with diff columns
# Use the UNION of both column sets: columns that exist in only one source are
# kept and filled with NaN for the other source's rows. pd.concat performs
# this outer alignment itself, preserves dtypes, and (with ignore_index=True)
# yields a unique 0..n-1 index. Pavlovia rows come first, then VT rows.
all_cols = sorted(set(pav_df.columns) | set(vt_df.columns))
post_df = pd.concat([pav_df, vt_df], ignore_index=True, sort=False)
# 'sub_id' leads; every other column follows in alphabetical order.
post_df = post_df[['sub_id'] + [col for col in all_cols if col != 'sub_id']]

# rename some columns for consistency...
# regex=False: both patterns are literal text. In particular the '.' in
# "gender.1" must not act as a regex wildcard (the default of `regex` differs
# between pandas versions, so it is pinned explicitly).
post_df.columns = post_df.columns.str.replace("mem", "memory_", regex=False)
post_df.columns = post_df.columns.str.replace("gender.1", "feminine_masculine", regex=False)
dims = ['dots_affil', 'dots_power', 'feminine_masculine', 'approachable', 'arousal','competence','dominant', 'gender', 'likability',
        'memory', 'race', 'similarity', 'skincolor', 'stability', 'status', 'trustworthy', 'valence', 'youthful']

# Move the leading token of each matching column name to the end:
#   'a_dots_b' -> 'dots_b_a'   (three-token 'dots' columns)
#   'a_b...'   -> 'b_a'        (everything else; tokens after the second are dropped)
# Dimensions are processed one after another, so a column renamed for one
# dimension is seen under its new name by the following dimensions.
# NOTE(review): for the 'feminine_masculine' columns only the first two tokens
# survive (e.g. 'x_feminine_masculine' -> 'feminine_x') -- confirm this is the
# intended final name.
for dim in dims:
    renames = {}
    for col in post_df.columns:
        if dim not in col:
            continue
        tokens = col.split('_')
        if 'dots' in col:
            renames[col] = tokens[1] + '_' + tokens[2] + '_' + tokens[0]
        else:
            renames[col] = tokens[1] + '_' + tokens[0]
    post_df = post_df.rename(columns=renames)

In [4]:
# Merge the task, post-task and questionnaire summaries into one row per
# subject (only subjects present in all three), derive a few variables, and
# write the combined table to an Excel file in `summary_dir`.

# glob order is arbitrary; sorting makes the chosen file reproducible when
# more than one summary matches the pattern.
task_df = pd.read_excel(sorted(glob.glob(summary_dir + '/SNT*'))[0])
survey_df = pd.read_excel(sorted(glob.glob(summary_dir + '/Questionnaire_summary_*'))[0])
survey_df = survey_df.rename(columns={'sni_hc_score':'sni_network_diversity', 'sni_size_score':'sni_number_ppl'})

# find overlap
task_subs = task_df['sub_id'].values
post_subs = post_df['sub_id'].values
survey_subs = survey_df['sub_id'].values

shared_subs = set(survey_subs) & set(task_subs) & set(post_subs)
# Keep subjects in the order they first appear in the task summary instead of
# set-iteration order, so the row order of the output is reproducible. Missing
# ids are skipped: they cannot be matched to a row.
overlap = [sub for sub in pd.unique(task_subs) if sub in shared_subs and not pd.isna(sub)]
if not overlap:
    raise ValueError('No subject is present in all of the task, post-task and questionnaire summaries')


def _first_rows(df, ids, subs):
    """Return, for each id in `subs` (in that order), the first row of `df`
    whose entry in `ids` equals it, without the frame's leading column.

    `ids` must be row-aligned with `df`. The result has a fresh 0..n-1 index
    so that the pieces from different tables line up row by row.
    """
    first_pos = {}
    for pos, sub in enumerate(ids):
        first_pos.setdefault(sub, pos)  # keep only the first occurrence
    return df.iloc[[first_pos[sub] for sub in subs], 1:].reset_index(drop=True)


# Side-by-side concatenation: task columns, then post-task, then questionnaire.
# The leading column of each table is dropped (assumed to be 'sub_id' -- it is
# for post_df, which has it moved to the front; TODO confirm for the others).
# NOTE(review): as a consequence the exported table has no subject id column;
# rows follow the order of `overlap`.
data_df = pd.concat([_first_rows(task_df, task_subs, overlap),
                     _first_rows(post_df, post_subs, overlap),
                     _first_rows(survey_df, survey_subs, overlap)], axis=1)

# 1 unless the occupation code is 8 or 9 (missing codes therefore count as 1).
# NOTE(review): the meaning of codes 8 and 9 is not visible here -- presumably
# the "not employed" categories; confirm against the questionnaire codebook.
data_df['demo_employed'] = (~data_df['demo_occupation'].isin([8, 9])).astype(int)

# normalize the dots data
# Rescale about 500 and divide by 500; the power axis is additionally flipped
# (500 - x). NOTE(review): presumably raw values are positions on a 0-1000
# scale, mapped to [-1, 1] -- confirm.
affil_cols = ['dots_affil_' + char for char in character_roles]
power_cols = ['dots_power_' + char for char in character_roles]
data_df[affil_cols] = (data_df[affil_cols] - 500)/500
data_df[power_cols] = (500 - data_df[power_cols])/500
data_df.to_excel(summary_dir + '/All-data_summary_n' + str(len(data_df)) + '.xlsx', index=False)