# Preprocessing of self reported answers and performance

This file creates a NumPy array that associates each self-reported answer with its corresponding task. The association is done per subject, so every subject has four rows that describe these tasks and answers in numerical form.

In addition to the self-reported answers, each row includes the performance obtained by that subject on a specific task and the number of times that task was selected. For consistency, the latter is included whether it is relevant or not (e.g. during training or testing, the number of times a task is selected is the same for all tasks).

In order to obtain those metrics, six columns are extracted from the trial-by-trial file:

- id
- condition (whether the subject has been informed about a random task or not)
- phase
- task family
- task category
- whether the answer was correct

In [145]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import pandas as pd
import warnings

from ipywidgets import widgets
from IPython.html.widgets import *
from __future__ import division

# Silence every warning for the rest of the session (this also hides the
# divide-by-zero warnings raised when a ratio is computed for an empty task).
warnings.filterwarnings('ignore')


# --- Printing of NumPy arrays ---
def float_formatter(x):
    """Render a float with exactly two decimals."""
    return "%.2f" % x


# NOTE(review): the NumPy documentation states that `formatter` is reset on
# every call to set_printoptions, so the second call below appears to discard
# the formatter installed by the first one and only precision=2 stays active.
# Both calls are kept, in the original order, to preserve current behaviour.
np.set_printoptions(formatter={'float_kind': float_formatter})
np.set_printoptions(precision=2)

# --- Plot appearance ---
# The font size is set before the style sheet is applied, as in the original.
rcParams.update({'font.size': 15})
# Other style sheets that were tried: 'ggplot', 'seaborn-dark-palette'
plt.style.use('fivethirtyeight')

In [151]:
# Number of task categories; every subject gets one output row per task.
TASKS = 4
# Number of subjects (not referenced by the cells visible in this notebook).
USERS = 201
# Phase of the experiment for which the performance metrics are computed.
# It must be one of the keys of `dphase` for the selected modality.
PHASE = 'exploration'


# Indicate for which modality you want to generate the files
TYPE = 's' #s=strategic, ft=free exploration and training, f = free exploration

# For each modality:
#   dphase  -- integer code of every phase name. The codes are used to filter
#              the phase column after it has been converted to integers, so
#              they have to agree with the position of each phase name in the
#              sorted unique phase values of the CSV file.
#   csvFile -- path of the trial-by-trial file.
if TYPE == 's':
    dphase = {'exploration':0, 'test':1, 'train':2}
    csvFile = '../monster task data/Strategic/monsters_data_strategic_052217.csv'
elif TYPE == 'ft':
    dphase = {'exploration':0, 'train':1}
    csvFile = '../monster task data/Free with Familiarization/monsters_data_free_familiarize_05232017.csv'
elif TYPE == 'f':
    dphase = {'exploration':0}
    csvFile = '../monster task data/Free Only/monsters_data_free_only_052417.csv'
else:
    # Fail here instead of with a NameError on dphase/csvFile much later.
    raise ValueError("Unknown TYPE %r: expected 's', 'ft' or 'f'" % (TYPE,))

if PHASE not in dphase:
    # Same idea: report the misconfiguration where it is made.
    raise ValueError("PHASE %r is not available for TYPE %r (choose one of %s)"
                     % (PHASE, TYPE, sorted(dphase)))

## Preprocess trial instances
 
* `metricsUser` contains one row per user and task with the following columns:
    * User
    * condition
    * Task category
    * Times task selected
    * % Selection
    * #Times correct on this task
    * % Correct

In [152]:
def stringToInt(arr, position, values):
    """Replace the labels of one column by their index in `values`.

    Every cell of column `position` that equals `values[i]` is overwritten
    with the code `i`. The array is modified in place and keeps its dtype, so
    in a string array the codes are stored as strings (the callers convert
    the whole array with `astype('int')` afterwards). Cells whose label is not
    listed in `values` are left untouched.

    Parameters
    ----------
    arr : 2-D numpy array, modified in place.
    position : index of the column to encode.
    values : sequence of labels; the position of a label is its code.

    Returns
    -------
    The same `arr` object, for convenience.
    """
    # Compare against a snapshot of the original column. Matching against the
    # column while it is being rewritten lets a code written earlier (e.g.
    # '0') be mistaken for a later label that is the same string.
    original = arr[:, position].copy()
    for i, v in enumerate(values):
        arr[original == v, position] = i
    return arr
np.set_printoptions(precision=4)
# Columns kept from the trial-by-trial file (CSV column index in parentheses):
# id(0), condition(1), phase(2), family(7), category(8), correct(11)
csv = np.genfromtxt(csvFile, dtype=np.string_, delimiter=',', usecols=(0,1,2,7,8,11), skip_header=1)

# Label lists: the position of a label inside its list becomes its integer code.
# Get monster type
monsters = list(np.unique(csv[:,3]))
# Get category (explicit order, so the codes do not depend on sorting)
categories = [b'category1D', b'categoryIgnore1D', b'category2D', b'categoryRandom']
# Get ids
ids = list(np.unique(csv[:,0]))
# Get condition
cond = list(np.unique(csv[:,1]))
# Get unique phases
phases = list(np.unique(csv[:,2]))
# For converting string to int boolean
bo = [b'False', b'True']

# Work on a copy so that `csv` keeps the raw labels for later cells.
csvInt = csv.copy()
# Convert fields to ints for easy processing (one label list per column)
for column, labels in enumerate([ids, cond, phases, monsters, categories, bo]):
    csvInt = stringToInt(csvInt, column, labels)
csvInt = csvInt.astype('int')


# Keep only the trials of the requested phase and group them by subject.
csvInt = csvInt[csvInt[:, 2] == dphase[PHASE]]
splitCsv = [csvInt[csvInt[:,0]==uid] for uid in np.unique(csvInt[:,0])]

# Sanity output: task codes seen by the first subject, and number of subjects.
# len() is used because splitCsv is a list of arrays of different lengths,
# which np.shape cannot handle on recent NumPy versions.
print(np.unique(splitCsv[0][:,4]))
print(len(splitCsv))

metricsUser = []
# cuser and i are not used by this cell; they are kept because they are
# notebook-level names.
cuser = 0
i=0
# Columns of csvInt: id(0), condition(1), phase(2), family(3), category(4), correct(5)
# NOTE(review): `user` is the position of the subject inside splitCsv, not the
# id code stored in column 0; both coincide only when every id has trials in
# the selected phase.
for user in range(len(splitCsv)):
    questions = splitCsv[user]
    # Task codes this subject worked on and how many trials each one got.
    task, ctask = np.unique(questions[:,4], return_counts=True)

    metricsTask = []
    # Count number of times the answer was correct per task and percent
    for t in range(TASKS):
        isTask = questions[:,4] == t
        total = np.sum(isTask)
        correct = np.sum(questions[isTask,5] == 1)
        # Explicit guard for tasks that were never selected, instead of
        # relying on nan_to_num(0/0) with the warning silenced. float() keeps
        # this a true division even without `from __future__ import division`.
        pctCorrect = float(correct)/total if total else 0.
        # Row layout: user, condition, task category, times task selected,
        # % selection, #times correct on this task, % correct
        if t in task:
            selected = ctask[np.where(task==t)][0]
            metricsTask.append([user, questions[0][1], t, selected, round(float(selected)/np.shape(questions)[0],2), correct, round(pctCorrect,2)])
        else:
            metricsTask.append([user,  questions[0][1], t, 0., 0., correct, pctCorrect])
    metricsUser.append(metricsTask)



[0 1 2 3]
201


## Preprocess self reported data

In [153]:
# Subjective data: file with the self-reported answers of every subject.
# Note that this reuses (overwrites) the name csvFile.

if TYPE == 's':
    csvFile = '../monster task data/Strategic/monsters_extra_data_strategic_052217.csv'
elif TYPE == 'ft':
    csvFile = '../monster task data/Free with Familiarization/monsters_extra_data_free_familiarize_05232017.csv'
else:
    csvFile = '../monster task data/Free Only/monsters_extra_data_free_only_05242017.csv'


# Only the first 30 columns are read: the code below skips 2 leading columns
# and then takes 7 answer columns for each of the 4 monsters (2 + 4*7 = 30).
scsv = np.genfromtxt(csvFile, dtype=np.string_, delimiter=',', skip_header=1, usecols=range(30))


# For those people that did not explore all tasks we still have to find a way to know what monster corresponds
# to what category so get that information from training/testing phase
# csv is the main (trial by trial) file

# Columns of csv: id(0), condition(1), phase(2), family(7), category(8), correct(11)
csvInt2 = csv.copy()
print(csv.shape)
# Convert fields to ints for easy processing
for column, labels in enumerate([ids, cond, phases, monsters, categories, bo]):
    csvInt2 = stringToInt(csvInt2, column, labels)

csvInt2 = csvInt2.astype('int')

# Phase used to recover the monster -> category mapping: the training phase
# when the modality has one, otherwise (free exploration only) the
# exploration phase. Indexing dphase['train'] directly raised a KeyError for
# TYPE == 'f'.
mappingPhase = dphase.get('train', dphase['exploration'])
csvInt2 = csvInt2[csvInt2[:, 2]==mappingPhase]
# Split by user
splitCsv = [csvInt2[csvInt2[:,0]==uid] for uid in np.unique(csvInt2[:,0])]
# each row consists of uid, monster family, task category, answers for that category
postCsv = []
# X and i are not used by this cell; kept because they are notebook-level names.
X=[]
# q collects the (monster, category) table of every user.
q=[]
i=0
# len() is used because splitCsv is a list of arrays of different lengths.
for user in range(len(splitCsv)):
    # Columns of splitCsv[user]: id(0), condition (1), phase (2), family(3), category(4), correct(5)
    # Keep monster family (column 0 of questions) and task category (column 1)
    questions = splitCsv[user][:,3:5]
    # NOTE(review): the user-th row of the self-report file is paired with the
    # user-th id of the trial file; this assumes both files list the subjects
    # in the same order -- TODO confirm.
    postQ = scsv[user]

    # Row of first appearance of every monster and of every category
    _, mIdx = np.unique(questions[:,0], return_index=True)
    _, cIdx = np.unique(questions[:,1], return_index=True)
    # Pair the k-th new monster with the k-th new category (this presumes a
    # one-to-one monster/category mapping per user), then order by monster code.
    moncat = np.vstack((questions[np.sort(mIdx),0], questions[np.sort(cIdx),1])).T
    moncat = moncat[moncat[:,0].argsort()]
    q.append(moncat)
    for n, mon in enumerate(moncat[:,0]):
        # The 7 answers of the n-th monster (in monster-code order) follow the
        # 2 leading columns. The original had an if/else on TYPE here whose
        # two branches were identical.
        answ = postQ[n*7+2:7*(n+1)+2]
        # stack user id, monster id, category id, answers for that category
        postCsv.append(np.hstack((user, mon, moncat[n,1], answ.tolist())).tolist())
    





(74370, 6)


In [154]:
postCsv = np.asarray(postCsv)
float_formatter = lambda x: "%.2f" % x
# NOTE(review): the NumPy documentation states that `formatter` is reset on
# every call to set_printoptions, so the second call appears to discard the
# formatter and only precision=2 stays active. Kept as is to preserve output.
np.set_printoptions(formatter={'float_kind':float_formatter})
np.set_printoptions(precision=2)

# metricsUser arrives as a nested list: users x tasks x 7 metrics.
metricsUser = np.asarray(metricsUser, dtype='float')
NUSERS = np.shape(metricsUser)[0]
postCsv = postCsv.astype('float')

# Order by first column (id) and then by third column (category task complexity)
postCsv = postCsv[np.lexsort((postCsv[:,2], postCsv[:,0]))]

# Flatten to one row per (user, task) with the columns:
# user, condition, category task (complexity), #times task selected, % selection, #times correct on this task, % correct
metricsUser = metricsUser.reshape(NUSERS*TASKS,7)

# Check that columns (user and task category complexity) are the same in order to fusion them
if np.array_equal(metricsUser[:,0], postCsv[:,0]) and np.array_equal(metricsUser[:,2], postCsv[:,2]):
    # Append the self-reported answers (postCsv columns 3 onwards) to the metrics.
    finalStats = np.hstack((metricsUser, postCsv[:,3:]))
    print(finalStats[0])
else:
    # Do not leave finalStats undefined, or holding the result of a previous
    # run: the original printed finalStats[0] even on this path.
    finalStats = None
    print('%s %s' % (np.shape(metricsUser[:,0]), np.shape(postCsv[:,0])))
    print('ERROR: columns do not match, someone did not select all tasks')

[  0.     1.     0.    65.     0.26  48.     0.74   8.     4.     7.    10.
  10.     9.     3.  ]


## Saving numpy arrays

In [155]:
# Column names written as the (commented) first line of every output file.
# metricsHeader describes the 7 performance columns of metricsUser;
# statsHeader adds the 7 self-report columns of finalStats.
metricsHeader = "user,condition,Task category, times task selected, % selection, #times correct on this task, % correct"
statsHeader = metricsHeader + ",interested,complex,time,progress,rule,future-learn-0,future-learn-1"

# One output file per modality (TYPE) and phase (PHASE). The exploration
# phase of 'ft' and 's' stores the performance metrics only; every other
# combination stores the metrics merged with the self-reported answers.
if TYPE == 'ft':
    if PHASE == 'train':
        np.savetxt("freeTrain-train_0.4.csv", finalStats, delimiter=",", header=statsHeader)
    elif PHASE == 'exploration':
        np.savetxt('freeTrain-free_0.4.csv', metricsUser, delimiter=",", header=metricsHeader)

elif TYPE == 's':
    if PHASE == 'train':
        np.savetxt('strategic-train_0.4.csv', finalStats, delimiter=",", header=statsHeader)
    elif PHASE == 'test':
        np.savetxt('strategic-test_0.4.csv', finalStats, delimiter=",", header=statsHeader)
    else:
        np.savetxt('strategic-free_0.4.csv', metricsUser, delimiter=",", header=metricsHeader)

elif TYPE == 'f':
    if PHASE == 'exploration':
        # NOTE(review): this file used to be written without delimiter and
        # header (space separated despite the .csv name); it now follows the
        # format of the other outputs. Check any reader of free-free_0.4.csv.
        np.savetxt('free-free_0.4.csv', finalStats, delimiter=",", header=statsHeader)


# Disabled earlier version of the saving logic, kept as a bare string literal.
# It is never executed: the string is only evaluated and, being the last
# expression of the notebook cell, echoed as the cell output. It wrote
# space-separated .txt files; note that its TYPE == 'f' branch reuses the
# 'strategic-*.txt' file names for the train and test phases.
'''
if TYPE == 's':
    if PHASE == 'train': 
        np.savetxt('strategic-train.txt', finalStats)
    elif PHASE == 'test':
        np.savetxt('strategic-test.txt', finalStats)
    else:
        np.savetxt('strategic-free.txt', metricsUser)
elif TYPE == 'ft':
    if PHASE == 'train':
        np.savetxt('freeTrain-train.txt', finalStats)
    elif PHASE == 'exploration':
        np.savetxt('freeTrain-free.txt', metricsUser)
elif TYPE == 'f':
    if PHASE == 'train':
        np.savetxt('strategic-train.txt', finalStats)
    elif PHASE == 'test':
        np.savetxt('strategic-test.txt', finalStats)
    else:
        #np.savetxt('free-free.txt', metricsUser)
        np.savetxt('free-free.txt', finalStats)
'''

"\nif TYPE == 's':\n    if PHASE == 'train': \n        np.savetxt('strategic-train.txt', finalStats)\n    elif PHASE == 'test':\n        np.savetxt('strategic-test.txt', finalStats)\n    else:\n        np.savetxt('strategic-free.txt', metricsUser)\nelif TYPE == 'ft':\n    if PHASE == 'train':\n        np.savetxt('freeTrain-train.txt', finalStats)\n    elif PHASE == 'exploration':\n        np.savetxt('freeTrain-free.txt', metricsUser)\nelif TYPE == 'f':\n    if PHASE == 'train':\n        np.savetxt('strategic-train.txt', finalStats)\n    elif PHASE == 'test':\n        np.savetxt('strategic-test.txt', finalStats)\n    else:\n        #np.savetxt('free-free.txt', metricsUser)\n        np.savetxt('free-free.txt', finalStats)\n"