In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [2]:
data = open('data.txt', 'r').read()
observations = data.split('\n')

In [3]:
tupledData = []
for obs in observations:
    if (len(obs) == 0):
        continue
    cleanedData = json.loads(obs)
    
    # Get the participant id
    for key, value in cleanedData.items():
        qtag = key.split('.')
        pid = qtag[0]
        section = qtag[1]
        trial = qtag[2]
        if (trial.isdigit()):
            trial = int(qtag[2])
            if (qtag[1]=='bldg' and qtag[3]!='time'):
                tstat = qtag[3]+qtag[4]
            else:
                tstat = qtag[3]
        else:
            tstat = ""
        if (pid != 'abcde' and pid != 'aaaaa'):
            tupledData.append((pid, section, trial, tstat, value))
        


In [4]:
# List of Participants
participants = [t[0] for t in tupledData if t[1]=='pre' and t[2]=='completed']
print(len(participants),participants)

61 ['cmrag', 'cocbp', 'pasxw', 'esjmj', 'calfs', 'boefn', 'hjefp', 'kskdo', 'mmmhb', 'masme', 'gmklg', 'cjmxw', 'ualkc', 'kadsg', 'momgm', 'gfcpg', 'amctc', 'holkh', 'inmnc', 'cjsgw', 'hdknh', 'emmmb', 'ffbxf', 'mmsem', 'omllb', 'sorfb', 'nmctc', 'amctc', 'cmgxt', 'rjssg', 'caipf', 'mjkub', 'eojnp', 'somxc', 'pmdxe', 'gmpbw', 'ejies', 'fobmm', 'amsxm', 'rfgzw', 'sadgc', 'lomle', 'sjals', 'bmtjn', 'cnkrc', 'bfscb', 'lacsc', 'lalml', 'cmall', 'lnowg', 'bmfag', 'cjsmm', 'ijnfm', 'dmadb', 'bakfl', 'sfcks', 'casmm', 'espbb', 'tamxr', 'bjpfr', 'cowxr']


In [5]:
scores = [int(t[4]) for t in tupledData if t[2]=="directions" and t[0] in participants]
print(len(scores),scores)

61 [4, 3, 2, 4, 2, 2, 3, 2, 3, 4, 1, 2, 3, 3, 1, 3, 4, 4, 2, 2, 5, 4, 5, 2, 3, 4, 4, 4, 3, 1, 5, 4, 3, 3, 2, 1, 3, 4, 2, 3, 4, 4, 4, 5, 4, 4, 4, 3, 4, 4, 2, 4, 2, 3, 3, 3, 3, 3, 4, 3, 4]


In [6]:
# Generate a histogram of the scores
yvals = [scores.count(i) for i in range(1,6)]
width = 0.7
labels = ['very poor','poor','average','good','very good']
plt.xlim(0.5,5.5)
plt.ylim(0, max(yvals)+1)
plt.bar(range(1,6), yvals, align='center', width=width)
plt.xticks(range(1,6), labels)
plt.show()

In [7]:
# Get the number correct and avg response time for each participant 
# in the Mental Rotation Task
correct = []
mspeeds = []
for pid in participants:
    num = 0
    speed = sum([int(t[4]) for t in tupledData if t[0]==pid and t[1]=='mrt' and t[3]=='time'])
    for i in range(1,30):
        pair = [t for t in tupledData if t[0]==pid and t[1]=='mrt' and t[2]==i and t[3]!='time']
        if (pair[0][4] == pair[1][4]):
            num += 1
    correct.append(num)
    mspeeds.append(speed/30)

In [8]:
# Generate a historgram of the accuracy and speed
yvals = [correct.count(i) for i in range(1,30)]
width = 0.7
plt.xlim(-0.5, 30.5)
plt.ylim(0, max(yvals)+1)
plt.bar(range(1,30), yvals, align='center', width=width)
plt.show()

In [18]:
plt.scatter(correct, mspeeds)
plt.show()

In [36]:
print(len(scores),len(mspeeds))

61 61


In [17]:
# Generate a scatter plot of speed and accuracy
plt.scatter(scores,mspeeds)
plt.show()

In [16]:
# T Tests
sm.stats.ttest_ind(scores, mspeeds)
sm.stats.ttest_ind(scores, correct)
X = [[1, s, s**2] for s in scores]
y = correct
print(sm.OLS(y,X).fit().summary())
y2 = mspeeds
print(sm.OLS(y2,X).fit().summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.070
Model:                            OLS   Adj. R-squared:                  0.038
Method:                 Least Squares   F-statistic:                     2.185
Date:                Mon, 16 May 2016   Prob (F-statistic):              0.122
Time:                        13:54:57   Log-Likelihood:                -182.28
No. Observations:                  61   AIC:                             370.6
Df Residuals:                      58   BIC:                             376.9
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         18.5980      4.560      4.079      0.0

In [31]:
X = np.array([scores, mspeeds]).T
X = sm.add_constant(X)
print(X)
y = correct
print(sm.OLS(y,X).fit().summary())

[[  1.00000000e+00   4.00000000e+00   4.76383333e+03]
 [  1.00000000e+00   3.00000000e+00   6.36183333e+03]
 [  1.00000000e+00   2.00000000e+00   7.89600000e+03]
 [  1.00000000e+00   4.00000000e+00   4.36436667e+03]
 [  1.00000000e+00   2.00000000e+00   5.41216667e+03]
 [  1.00000000e+00   2.00000000e+00   4.57650000e+03]
 [  1.00000000e+00   3.00000000e+00   3.99106667e+03]
 [  1.00000000e+00   2.00000000e+00   5.40690000e+03]
 [  1.00000000e+00   3.00000000e+00   3.87896667e+03]
 [  1.00000000e+00   4.00000000e+00   4.19116667e+03]
 [  1.00000000e+00   1.00000000e+00   4.76003333e+03]
 [  1.00000000e+00   2.00000000e+00   3.68836667e+03]
 [  1.00000000e+00   3.00000000e+00   5.48513333e+03]
 [  1.00000000e+00   3.00000000e+00   5.26716667e+03]
 [  1.00000000e+00   1.00000000e+00   6.67576667e+03]
 [  1.00000000e+00   3.00000000e+00   1.28093000e+04]
 [  1.00000000e+00   4.00000000e+00   1.54868667e+04]
 [  1.00000000e+00   4.00000000e+00   7.69100000e+03]
 [  1.00000000e+00   2.00000

In [39]:
# Get the average error and avg response time for each participant
# in the Placing Buildings on a Map Task
error = []
bspeeds = []
for pid in participants:
    dist = 0;
    speed = sum([int(t[4]) for t in tupledData if t[0]==pid and t[1]=='bldg' and t[3]=='time'])
    for i in range(1,15):
        pairx = [t for t in tupledData if t[0]==pid and t[1]=='bldg' and t[2]==i and (t[3]=='guessx' or t[3]=='actualx')]
        pairy = [t for t in tupledData if t[0]==pid and t[1]=='bldg' and t[2]==i and (t[3]=='guessy' or t[3]=='actualy')]
        dist += np.sqrt((float(pairx[0][4])-float(pairx[1][4]))**2+(float(pairy[0][4])-float(pairy[1][4]))**2)
    bspeeds.append(speed/15)
    error.append(dist/15)
# Check that the error is right with the scaling

In [43]:
# Generate a historgram of the error and speed
plt.scatter(scores,error)
plt.show()

In [44]:
# Generate a scatter plot of speed and accuracy
plt.scatter(scores,bspeeds)
plt.show()

In [45]:
# Run a T-test and OLS model on our scores
X = scores
X = sm.add_constant(X)
y = error
print(sm.OLS(y,X).fit().summary())
y2 = bspeeds
print(sm.OLS(y2,X).fit().summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.015
Method:                 Least Squares   F-statistic:                   0.08617
Date:                Mon, 16 May 2016   Prob (F-statistic):              0.770
Time:                        10:44:35   Log-Likelihood:                -336.15
No. Observations:                  61   AIC:                             676.3
Df Residuals:                      59   BIC:                             680.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const        104.1413     25.230      4.128      0.0