In [180]:
import pandas as pd

In [181]:
raw_data = pd.read_csv('pretest_data_incomplete.csv')

# Get Michael Data

For each item, Michael wants:
* Mean
* Standard Error of the Mean
* Spearman R
* Standard Error of the Estimate


In [182]:
#First, remove people who took a median of less than 5 seconds to finish every problem 
#That means they roughly finished everything in 2 minutes

time_data = raw_data.loc[:,[col for col in raw_data.columns if 'Page Submit' in col]].iloc[:,2:-1]
remove_index = [i for i,v in enumerate(time_data.median(axis=1)) if v <= 5]

In [183]:
#Get only agreement data
agree_data = raw_data.loc[:,['groupID'] + [x for x in raw_data.columns if 'agree' in x]]

#Remove people who answer too quickly
agree_data = agree_data.loc[[i for i in agree_data.index if i not in remove_index]]

#Translate answers into values
replace_dict = {'Strongly disagree':-3, 'Disagree':-2, 'Somewhat disagree':-1,
                 'Strongly agree':3, 'Agree':2, 'Somewhat agree':1,
                 'Not yet decided': 0}
agree_data = agree_data.replace(replace_dict)

agree_data.head()

Unnamed: 0,groupID,p1agree,p2agree,p3agree,p4agree,p5agree,p6agree,p7agree,p8agree,p9agree,...,c37agree,c38agree,c39agree,c40agree,c41agree,c42agree,c43agree,c44agree,c45agree,c46agree
1,0,-2.0,1.0,1.0,2.0,2.0,,,-2.0,,...,,3.0,0.0,,,-2.0,,2.0,2.0,2.0
2,0,3.0,3.0,3.0,3.0,3.0,3.0,,0.0,,...,,2.0,,-3.0,3.0,,2.0,0.0,0.0,2.0
3,0,,3.0,,,,2.0,-1.0,1.0,,...,1.0,,,-2.0,-2.0,1.0,1.0,1.0,,1.0
4,1,-2.0,,2.0,-3.0,2.0,0.0,,,1.0,...,2.0,3.0,,,,,-2.0,,-1.0,
5,0,-3.0,,0.0,,3.0,,,,0.0,...,,,-2.0,-2.0,-3.0,,,,0.0,


In [184]:
from scipy.stats import linregress

agree_data_results = agree_data.iloc[:,1:len(agree_data)].transpose()

agree_data_results['mean'] = agree_data_results.mean(axis=1)
agree_data_results['se_mean'] = agree_data_results.sem(axis=1)

sr = []
see = []

for col in agree_data_results.index:
    rel_data = agree_data.loc[:,['groupID',col]].dropna()

    rank_id = rankdata(rel_data['groupID'],'dense')
    rank_data = rankdata(rel_data[col],'dense')
    
    sr += [spearmanr(rank_id,rank_data)[0]]
    see += [linregress(rank_id,rank_data)[4]]
    
agree_data_results['spearmanr'] = sr
agree_data_results['se_estimate'] = see

agree_data_results = agree_data_results[['mean','se_mean','spearmanr','se_estimate']]
agree_data_results.to_csv('demolab_incomplete_results.csv')

In [185]:
agree_data_results.iloc[38:].head()

Unnamed: 0,mean,se_mean,spearmanr,se_estimate
p40agree,-0.710843,0.224394,-0.15885,0.469936
p41agree,2.313253,0.113526,-0.070825,0.239833
p42agree,1.1375,0.17894,-0.194386,0.387005
p43agree,-0.148148,0.158196,-0.013802,0.338697
p44agree,-0.575,0.222066,-0.217338,0.467071


## Get proportion of undecided vs. strong opinion

In [186]:
opinion_str_data = agree_data.iloc[:,1:len(agree_data)].transpose()

undecided = []
strongdec = []

for row in opinion_str_data.index:
    rowtotal = len(opinion_str_data.loc[row,:].dropna())
    undecided += [(opinion_str_data.loc[row,:]==0).sum() / rowtotal]
    strongdec += [(abs(opinion_str_data.loc[row,:])==3).sum() / rowtotal] 

opinion_str_data['%undecided'] = undecided
opinion_str_data['%strongdec'] = strongdec
opinion_str_data['delta_ud_sd'] = [undecided[i] - strongdec[i] for i in range(len(undecided))]

opinion_str_data[['%undecided','%strongdec','delta_ud_sd']]

Unnamed: 0,%undecided,%strongdec,delta_ud_sd
p1agree,0.195402,0.252874,-0.057471
p2agree,0.094118,0.258824,-0.164706
p3agree,0.108434,0.204819,-0.096386
p4agree,0.119048,0.297619,-0.178571
p5agree,0.197531,0.234568,-0.037037
p6agree,0.313253,0.180723,0.132530
p7agree,0.172840,0.333333,-0.160494
p8agree,0.243902,0.219512,0.024390
p9agree,0.219512,0.231707,-0.012195
p10agree,0.179487,0.243590,-0.064103


## Now add in Spearman R and calculate a weighted sum

In [190]:
data_dict = pd.read_csv('pretest_data_dict.csv')
data_dict['key'] = data_dict['key'].apply(lambda x: x[:x.find('\n')] if '\n' in x else x)
data_dict = data_dict.set_index('variable').to_dict()['key']

In [196]:
final_data = opinion_str_data[['%undecided']].join(agree_data_results['spearmanr'])
final_data['sum_uw'] = (1 - final_data['%undecided']) + abs(final_data['spearmanr'])

final_data['item'] = [data_dict[row] for row in final_data.index]
final_data.sort_values('sum_uw', ascending=True)

final_data.to_csv('demolab_incomplete_results_ranked.csv')

# More in-depth analysis

In [168]:
final_data.iloc[20:]

Unnamed: 0,%undecided,spearmanr,sum_uw
p21agree,0.125000,0.008579,1.116421
p22agree,0.048193,0.160272,0.887920
p23agree,0.075949,0.097198,0.978752
p24agree,0.058824,0.002973,1.055851
p25agree,0.036145,0.146719,0.889425
p26agree,0.192771,-0.148110,1.044661
p27agree,0.073171,-0.097262,0.975908
p28agree,0.098765,-0.167278,0.931487
p29agree,0.135802,-0.024426,1.111377
p31agree,0.049383,0.066127,0.983255
