# Data Post-processing

In [58]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [None]:
# Load the raw results table
results_path = "raw_results.csv"
df = pd.read_csv(results_path)

# Load the stimulus metadata, indexed by its 'stimulus_ID' column
metadata_path = "metadata.csv"
metadata = pd.read_csv(
  metadata_path,
  index_col='stimulus_ID',
)

In [None]:
# Summary statistics of the raw results frame (displayed as the cell output)
df.describe()

In [61]:
# Make column names include the stimulus ID.
# For every column whose name contains one of the markers below, the new name
# is "<digits><suffix>", where <digits> are all digit characters found in that
# column's value in row 0 of the raw frame. Other columns keep their name.
# Markers are tested in this order; the first match wins.
marker_to_suffix = {
  "_initial_rating": "_initial_rating",
  "_second_rating": "_second_rating",
  "_prior_knowledge": "_prior_knowledge",
  "_page_1_time_Page": "_page_1_time",
  "_page_2_time_Page": "_page_2_time",
}


def _with_stimulus_id(column):
  """Return the renamed column, or `column` unchanged if no marker matches."""
  for marker, suffix in marker_to_suffix.items():
    if marker in column:
      digits = ''.join(filter(str.isdigit, df.at[0, column]))
      return f"{digits}{suffix}"
  return column


new_columns = [_with_stimulus_id(column) for column in df.columns]

new_df = df.copy()
new_df.columns = new_columns

# Rows 0 and 1 are not participant data (presumably export header rows - confirm);
# drop them and renumber the remaining rows from 0.
new_df = new_df.drop([0, 1]).reset_index().drop(columns=['index'])

In [None]:
# Count stimuli (occurrences of '_explanation_type' in the raw column names)
stimuli_n = sum(df.columns.str.count('_explanation_type'))

# Drop rows with no value in 'S20_stimulus_ID'.
# .copy() makes the result an independent frame, so the assignments done in
# this and later cells do not raise pandas' SettingWithCopyWarning.
new_df = new_df[new_df['S20_stimulus_ID'].notna()].copy()

# (target field, source column template) pairs. For presentation slot n showing
# stimulus <sid>, column "S<n>_<field>" receives the value of the source column.
field_sources = [
  ("initial_rating", "{sid}_initial_rating"),
  ("second_rating", "{sid}_second_rating"),
  # NOTE(review): the knowledge column is looked up with a literal '7' before
  # the stimulus ID; presumably the digits extracted from the question text
  # during renaming include a leading 7 - confirm against the raw export.
  ("knowledge", "7{sid}_prior_knowledge"),
  ("page_1_time", "{sid}_page_1_time"),
  ("page_2_time", "{sid}_page_2_time"),
]

# Make column name include stimuli number.
# Values are collected first and attached in one step: growing the frame one
# cell at a time with .loc is slow and fragments it. Source columns start with
# a digit and target columns with 'S', so no target is ever read as a source.
slot_values = {}
for i in tqdm(new_df.index):
  for slot in range(1, stimuli_n+1):
    sid = int(new_df.at[i, f"S{slot}_stimulus_ID"])
    for field, template in field_sources:
      target = f"S{slot}_{field}"
      slot_values.setdefault(target, {})[i] = new_df.at[i, template.format(sid=sid)]

slot_df = pd.DataFrame(slot_values, index=new_df.index)
# Dropping any pre-existing target columns keeps the overwrite semantics and
# makes the cell safe to re-run.
new_df = pd.concat(
  [new_df.drop(columns=slot_df.columns, errors='ignore'), slot_df],
  axis=1,
)

In [63]:
# Replace string choices with numeric values


# Cognitive Reflection Test to [0, 1] <-- 0 is incorrect, 1 is correct
# Answers that cannot be parsed as numbers become NaN and are scored 0.
# NOTE: not idempotent - once the columns hold 0/1, running this cell again
# scores every answer as 0. Re-run the notebook from the top instead.
crt_ground_truth = {'CRT_Q1': 4, 'CRT_Q2': 29, 'CRT_Q3': 20}
for question, correct_answer in crt_ground_truth.items():
  answers = pd.to_numeric(new_df[question], errors='coerce')
  # Vectorised comparison instead of a row-wise apply over the whole frame
  new_df[question] = (answers == correct_answer).astype(int)

# Text-to-number (or text-to-'Pass') mappings, applied in this order to every
# column that still has object dtype when its turn comes.
# NOTE(review): Trust 00/03 give 6 to the 'Reliable'/'accurate' end while
# Trust 02/05 give 6 to the 'Misleading'/'Unfair' end, and the trust items are
# later averaged as-is - confirm whether reverse-coding is intended.
choice_maps = [
  # Need for Cognition to [1,2,3,4,5]
  {'Strongly disagree': 1, 'Disagree': 2, 'Neither agree nor disagree': 3, 'Agree': 4, 'Strongly agree': 5},
  # Trust 00 to [1,2,3,4,5,6]
  {'Extremely Unreliable': 1, 'Moderately Unreliable': 2, 'Slightly Unreliable': 3, 'Slightly Reliable': 4, 'Moderately Reliable': 5, 'Extremely Reliable': 6},
  # Trust 01 (only the two labelled endpoints are mapped)
  {'1-Not at all': 1, '6-Very much so': 6},
  # Trust 02
  {'Extremely NOT Misleading': 1, 'Moderately NOT Misleading': 2, 'Slightly NOT Misleading': 3, 'Slightly Misleading': 4, 'Moderately Misleading': 5, 'Extremely Misleading': 6},
  # Trust 03
  {'Extremely Inaccurate': 1, 'Moderately Inaccurate': 2, 'Slightly Inaccurate': 3, 'Slightly accurate': 4, 'Moderately accurate': 5, 'Extremely accurate': 6},
  # Trust 04 (only the two labelled endpoints are mapped)
  {'1-Not at all acting in my interest': 1, '6-Very much acting in my interest': 6},
  # Trust 05
  {'Extremely Fair': 1, 'Moderately Fair': 2, 'Slightly Fair': 3, 'Slightly Unfair': 4, 'Moderately Unfair': 5, 'Extremely Unfair': 6},
  # Attention checks to [Pass, Fail]: the accepted answers become 'Pass'
  {'Somewhat disagree': 'Pass', 'Red,Green': 'Pass', 'FoxNews.com,NBC.com': 'Pass', 'New York Times website,Huffington Post,CNN.com,FoxNews.com,Google News': 'Pass', 'Huffington Post,CNN.com,FoxNews.com': 'Pass', 'CNN.com,FoxNews.com,Google News': 'Pass'},
]

for replacer in choice_maps:
  # Recomputed each round: columns may stop being object dtype once replaced
  cols = new_df.columns[new_df.dtypes == 'object']
  new_df[cols] = new_df[cols].replace(replacer)

# Make non-pass attention checks into fail (missing answers included)
for check in ['ps_0', 'ps_1', 'att1', 'att2']:
  new_df.loc[~new_df[check].isin(['Pass']), check] = 'Fail'

In [64]:
# Compute aggregate scores: the plain (unweighted) mean of each participant's
# trust, Need for Cognition and Cognitive Reflection Test items.
trust_n = sum(new_df.columns.str.count('trust_'))
NFC_n = sum(new_df.columns.str.count('NFC_Q'))
CRT_n = sum(new_df.columns.str.count('CRT_Q'))

# Item columns: trust_00 .. trust_0<n-1>, NFC_Q1 .. NFC_Q<n>, CRT_Q1 .. CRT_Q<n>
trust_cols = [f'trust_0{j}' for j in range(trust_n)]
NFC_cols = [f'NFC_Q{j}' for j in range(1, NFC_n+1)]
CRT_cols = [f'CRT_Q{j}' for j in range(1, CRT_n+1)]

# Column-wise means replace the row-by-row loop. The float cast covers answers
# still stored as digit strings, and skipna=False keeps np.mean's behaviour of
# yielding NaN for a participant with any missing item.
new_df['trust'] = new_df[trust_cols].astype(float).mean(axis=1, skipna=False)
new_df['NFC'] = new_df[NFC_cols].astype(float).mean(axis=1, skipna=False)
new_df['CRT'] = new_df[CRT_cols].astype(float).mean(axis=1, skipna=False)

In [None]:
# Remove unfinished entries: rows whose 'Finished' value is the string 'False'
is_unfinished = new_df['Finished'] == 'False'
print(f"Unfinished entries: {int(is_unfinished.sum())}")
new_df = new_df[~is_unfinished]

In [None]:
# Keep only participants who passed at least two of the three attention
# checks ps_1, att1 and att2 (ps_0 is not part of this rule).
# The "before" count is taken from the cleaned frame right before filtering;
# the raw `df` still holds the two non-participant rows dropped earlier and
# every row removed by the previous filters, so it would overstate the number.
participants_before = new_df.PROLIFIC_PID.nunique()

checks_passed = (new_df[['ps_1', 'att1', 'att2']] == 'Pass').sum(axis=1)
new_df = new_df[checks_passed >= 2]

print(f"Participants before attention checks: {participants_before}")
print(f"Participants after attention checks: {new_df.PROLIFIC_PID.nunique()}")

In [None]:
# Melt ratings to individual rows: one row per (participant, presentation slot)

dfs = []
for rating in ['initial_rating', 'second_rating', 'knowledge', 'page_1_time', 'page_2_time','stimulus_ID', 'explanation_type']:

  columns = [f"S{i}_{rating}" for i in range(1, stimuli_n+1)]
  rating_df = pd.melt(new_df, id_vars=['PROLIFIC_PID'], value_vars=columns, var_name='order', value_name=rating)
  # Reduce "S<n>_<rating>" to "<n>" (kept as a string). An explicit regex
  # extract is used because str.replace(r'\D', '') only strips non-digits when
  # `regex` defaults to True; on pandas >= 2.0 the pattern is taken literally,
  # 'order' keeps the full column name and the merges below match nothing.
  rating_df['order'] = rating_df['order'].str.extract(r'^S(\d+)_', expand=False)
  dfs.append(rating_df)

# merge the per-field frames on participant and slot
merged = dfs[0]
for field_df in dfs[1:]:
  merged = merged.merge(field_df, how='left', on=['PROLIFIC_PID', 'order'])

# add other (per-participant) columns
keep = ['PROLIFIC_PID', 'RecordedDate','Duration (in seconds)','Comments', 'trust', 'NFC', 'CRT', 'condition', 'stimulation']
processed_df = merged.merge(new_df[keep], how='left', on='PROLIFIC_PID')

In [68]:
# Merge df and metadata: look up per-stimulus fields for every row
processed_df['stimulus_ID'] = pd.to_numeric(processed_df['stimulus_ID'])


def _from_metadata(row, column):
  """Return metadata[column] for the stimulus referenced by `row`."""
  return metadata.loc[int(row['stimulus_ID']), column]


processed_df['ground_truth'] = processed_df.apply(
  lambda row: _from_metadata(row, 'ground_truth'),
  axis=1,
)
# The validity column depends on the row's explanation type:
# "<explanation_type in lower case>_logical_validity"
processed_df['logical_validity'] = processed_df.apply(
  lambda row: _from_metadata(row, row['explanation_type'].lower()+'_logical_validity'),
  axis=1,
)
processed_df['topic'] = processed_df.apply(
  lambda row: _from_metadata(row, 'topic'),
  axis=1,
)


In [None]:
# Preview the first three rows of the processed frame (displayed as the cell output)
processed_df.head(3)

In [70]:
# Write the processed frame to CSV, without the row index
output_path = 'results_processed.csv'
processed_df.to_csv(
  output_path,
  index=False,
)