onlineL pilot study <br>
Process answers to in-scanner questions form Psychopy logs <br>
Create DataFrames all_logs <br>
Create DataFrame scores (aggragate score per subject, corrected in case of button-flip) <br>
Plot results per subject <br>
Plot score distribution over subjects (histogram)

In [1]:
# from os import listdir, makedirs, walk
from os.path import isfile, join, exists
from os import listdir
import subprocess
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
%config InlineBackend.figure_format = 'retina' # for 4k screen
from IPython.core.interactiveshell import InteractiveShell # for var view
InteractiveShell.ast_node_interactivity = "all" # for var view
import shutil
import pandas as pd

In [5]:
#define paths
const_scan_data_path = '/mnt/bucket/labs/hasson/meshulam/onlineL/pilot/scans/'
subjects_file_name = '/mnt/bucket/labs/hasson/meshulam/onlineL/pilot/process/z_pilot_datasets_to_process/pilot_datasets_to_process.txt'
questions_key_path = '/mnt/bucket/labs/hasson/meshulam/onlineL/pilot/in_scanner_questions/'
result_save_path = '/mnt/bucket/labs/hasson/meshulam/onlineL/pilot/in_scanner_questions/dataframes/'
fig_save_path = '/mnt/bucket/labs/hasson/meshulam/onlineL/pilot/in_scanner_questions/figs/'

In [6]:
#read list of subjects (datasets) from file
subjects = pd.read_csv(subjects_file_name)['subjects'].tolist()
print(str(len(subjects))+' Subject IDs on file in total')

# for some analyses, exclude first subject because display wasn't good & he was only presented with 5 questions & question order was different 
subj_to_exclude='BS_082917'
print('Subject in exclusion list: '+ subj_to_exclude)


21 Subject IDs on file in total
Subject in exclusion list: BS_082917


In [4]:
# set params
videos = ['cos','hist','pieman'] # lectures
# constant: how many questions in our study
const_number_of_questions = np.double(10)
# columns to read out of 'loops' log file
col_to_read = ['order', # question order
               'select_answer.response_raw', # response to question
               'select_answer.rt_raw', # RT to answer question
               'key_wait_trigger.rt_raw']  # time to wait until question presented (perhaps useful for bold analysis later)


In [5]:
def logfile_to_df(logfile_path,logfile_name):
    """
    read Psychopy logfile
    """
    # find logfile
    find_file = [f for f in listdir(logfile_path) if logfile_name in f]
    # error if logfile not found
    if len(find_file) != 1: 
        raise NameError('logfile error')
        return
    # read logfile
    log_contents = pd.read_csv(logfile_path + find_file[0])
    return log_contents

In [6]:
# read all videos, subjects, into a single dataframe all_logs, with question key (correct answers)
all_logs = pd.DataFrame() #output

# get key (correct answers)
key_filename = questions_key_path+'in_scan_questions_key.xlsx'
key_df = pd.read_excel(key_filename) # holds correct responses

# go over all subjects
for this_subject in subjects: # all subjects, no exclusions
    
    #read logfile
    logfile_path = const_scan_data_path + this_subject + '/logs/' # path to subject's logfile
    logfile_name = 'questions.csv' #shortest with the CPH sequence data
    log_contents = logfile_to_df(logfile_path,logfile_name)

    for video in videos: # go over lectures
        lec_questions_subj = pd.DataFrame() # for output
        # go over all columns with relevant data in this logfile
        for this_col in col_to_read:   
            # read data
            lec_data = [l2 for l1,l2 in zip(log_contents['questionPath'],log_contents[this_col]) if (video+'_questions') in l1]
            # assign to dataframe as float
            lec_questions_subj = lec_questions_subj.assign(new_col=[float(i) for i in lec_data])
            lec_questions_subj = lec_questions_subj.rename(columns={"new_col": this_col})
            
        # fill NANs if no data (good for subjects with 5 q only)
        if len(lec_questions_subj)<const_number_of_questions:
            number_of_cols_to_add=const_number_of_questions-len(lec_questions_subj)
            df_nan = pd.DataFrame(np.nan, index=np.arange(number_of_cols_to_add), columns=col_to_read)
            lec_questions_subj = pd.merge_ordered(df_nan,lec_questions_subj)
            #lec_questions_subj['order_presented'] = np.arange(const_number_of_questions)

        # add subject and video id
        lec_questions_subj['subject']=this_subject
        lec_questions_subj['video']=video
        lec_questions_subj['question_number']=np.arange(10)+1
        
        # add key
        lec_questions_subj['correct_answer_from_key']=key_df[(key_df.video == video)].correct_answer.values      
        # concat over videos, subjects    
        all_logs = pd.concat([all_logs,lec_questions_subj])

        

In [7]:
all_logs

Unnamed: 0,order,select_answer.response_raw,select_answer.rt_raw,key_wait_trigger.rt_raw,subject,video,question_number,correct_answer_from_key
0,0.0,2.0,14.082000,33.585987,BS_082917,cos,1,2
1,1.0,4.0,1.481000,1.367147,BS_082917,cos,2,3
2,2.0,1.0,28.115000,0.483978,BS_082917,cos,3,1
3,3.0,1.0,25.533001,0.833866,BS_082917,cos,4,4
4,4.0,3.0,6.065000,0.417096,BS_082917,cos,5,1
5,,,,,BS_082917,cos,6,4
6,,,,,BS_082917,cos,7,2
7,,,,,BS_082917,cos,8,2
8,,,,,BS_082917,cos,9,3
9,,,,,BS_082917,cos,10,1


In [59]:
# save all_logs dataframe as txt and xls
all_logs.to_csv(result_save_path+'raw_in_scan_responses_'+str(len(subjects))+'_subjects.txt')
all_logs.to_excel(result_save_path+'raw_in_scan_responses_'+str(len(subjects))+'_subjects.xls')

In [107]:
def calc_scores(subjects,videos,logs_df):
    """
    Calculate percent correct for each subject in each lecture (except excluded subj)
    input: 
    subjects list
    videos list    
    logs dataframe
    output: df with columns subject,video,percent correct
    """
    scores = pd.DataFrame(columns=['subject','video','percent_correct']) # output
    for this_subject in [s for s in subjects if s!=subj_to_exclude]: # all subjects without excluded subject (only 10 questions)
        for this_video in videos:
            # filter df: subject,video
            subj_df = all_logs[(all_logs.subject == this_subject)&(all_logs.video == this_video)]
            # find which answers where correct
            where_correct = np.where(subj_df['select_answer.response_raw'] == subj_df['correct_answer_from_key'],1,0)
            # calc percent correct for subject in this video
            percent_correct = np.sum(where_correct)/len(where_correct)*100
            # append to output var
            scores = scores.append({'percent_correct':percent_correct, 'subject':this_subject, 'video':this_video}, ignore_index=True)    
    return scores         
scores = calc_scores(subjects,videos,all_logs)
scores


Unnamed: 0,subject,video,percent_correct
0,TC_091917,cos,90.0
1,TC_091917,hist,100.0
2,TC_091917,pieman,90.0
3,JY_091917,cos,70.0
4,JY_091917,hist,70.0
5,JY_091917,pieman,50.0
6,CK_092617,cos,70.0
7,CK_092617,hist,90.0
8,CK_092617,pieman,100.0
9,BT_092617,cos,80.0


In [108]:
# which subjects got super low scores?
# define threshold
const_score_too_low_threshold = 10 # 10=didn't get more than 10% correct in each video
# get list of subjects with super low scores
subjects_with_super_low_score = [this_subject for this_subject in scores['subject'].unique() 
    if(sum(scores[(scores.subject==this_subject)]['percent_correct'])<=const_score_too_low_threshold*len(videos))]
print ('Subjects with super low overall scores:')
print(subjects_with_super_low_score)


Subjects with super low overall scores:
['AA_100917']


In [110]:
# fix subjects with overall superlow scores, assuming they switched response buttons (up/down)
def transform_key_single(full_df,subj_id):
    """
    func switches correct answer key for single subj
    for subjects who switched key order (top-bottom)
    Input: 
    full_df - dataframe to switch keys in
    subj_id
    Output:
    full_df - dataframe with switched keys for subject subj_id
    written because subjects all performed well except for AA_100917, score 10 in hist and 0 in cos, pieman
    """
    # filter df: subject,video
    subj_df = full_df[(full_df.subject == subj_id)]['correct_answer_from_key']
    # define transformation to fix answers if switched keys
    switch_dict = {1:4,2:3,3:2,4:1}
    # transform
    switched_df = subj_df.apply(lambda x: switch_dict[int(x)])
    full_df.loc[(full_df.subject == subj_id),'correct_answer_from_key'] = switched_df.values
    return full_df

def transform_key_multiple(full_df, subjects_to_switch):
    # transfrom subjects who got low scores
    for this_subject in subjects_to_switch:
        all_logs_transformed_key = transform_key(full_df,this_subject)
    return all_logs_transformed_key

all_logs_fixed = transform_key_multiple(all_logs,subjects_with_super_low_score)


In [111]:
# recalulate scores now that buttons were switched for super-low score subjects
scores = calc_scores(subjects,videos,all_logs_transformed_key)
scores
# and voila, score for AA has markedly improved


Unnamed: 0,subject,video,percent_correct
0,TC_091917,cos,90.0
1,TC_091917,hist,100.0
2,TC_091917,pieman,90.0
3,JY_091917,cos,70.0
4,JY_091917,hist,70.0
5,JY_091917,pieman,50.0
6,CK_092617,cos,70.0
7,CK_092617,hist,90.0
8,CK_092617,pieman,100.0
9,BT_092617,cos,80.0


In [113]:
# save scores dataframe as txt and xls
scores.to_csv(result_save_path+'in_scan_scores_'+str(len(subjects))+'_subjects.txt')
scores.to_excel(result_save_path+'in_scan_scores_'+str(len(subjects))+'_subjects.xls')

In [295]:
# create df score2plot for bar graph describing scores for all subjects in each vid

# transform scores dataframe
score2plot = pd.DataFrame(columns=[['subject']+videos]) # output
for this_subject in scores['subject'].unique():
    subj_scores = np.zeros([len(videos)])
    for iVideo, this_video in enumerate(videos):
        this_score = scores[(scores['subject']==this_subject) & (scores['video']==this_video)]['percent_correct'].values
        subj_scores[iVideo]=this_score[0]
    score2plot=score2plot.append({'subject':this_subject, videos[0]:subj_scores[0], videos[1]:subj_scores[1], videos[2]:subj_scores[2]},ignore_index=True)
score2plot


Unnamed: 0,subject,cos,hist,pieman
0,TC_091917,90.0,100.0,90.0
1,JY_091917,70.0,70.0,50.0
2,CK_092617,70.0,90.0,100.0
3,BT_092617,80.0,100.0,90.0
4,RT_092817,70.0,80.0,70.0
5,CN_092817,90.0,100.0,80.0
6,LB_100317,70.0,100.0,70.0
7,AL_100517,40.0,100.0,100.0
8,EW_100917,90.0,80.0,100.0
9,AA_100917,70.0,90.0,90.0


In [296]:
# plot bar graph from score2plot: results per subject
#plt.gcf().clear()
fig1, ax1 = plt.subplots()
index = np.arange(len(score2plot))
bar_width = 0.15
opacity = 0.7
bar1 = plt.bar(index+bar_width*0,score2plot['cos'].values, bar_width, alpha=opacity, color = 'b', label='COS')
bar2 = plt.bar(index+bar_width*1,score2plot['hist'].values, bar_width, alpha=opacity, color = 'g', label='HIST') 
bar3 = plt.bar(index+bar_width*2,score2plot['pieman'].values, bar_width, alpha=opacity, color = 'r', label='PIEMAN') 

plt.xlabel('Subjects')
plt.ylabel('Scores')
plt.title('Subject scores by video')
#plt.xticks(index + bar_width, (score2plot['subject']))
plt.xticks(index + bar_width, (index+1))
plt.legend()
 
#plt.tight_layout()
#plt.show()

<matplotlib.text.Text at 0x7fac6a4c6128>

<matplotlib.text.Text at 0x7fac690a27b8>

<matplotlib.text.Text at 0x7fac6907c2b0>

([<matplotlib.axis.XTick at 0x7fac690a63c8>,
  <matplotlib.axis.XTick at 0x7fac690554a8>,
  <matplotlib.axis.XTick at 0x7fac690450f0>,
  <matplotlib.axis.XTick at 0x7fac696837f0>,
  <matplotlib.axis.XTick at 0x7fac69683cc0>,
  <matplotlib.axis.XTick at 0x7fac69a0d6d8>,
  <matplotlib.axis.XTick at 0x7fac6929a128>,
  <matplotlib.axis.XTick at 0x7fac6929a278>,
  <matplotlib.axis.XTick at 0x7fac693b2b38>,
  <matplotlib.axis.XTick at 0x7fac69554f28>,
  <matplotlib.axis.XTick at 0x7fac695544a8>,
  <matplotlib.axis.XTick at 0x7fac695e7ef0>,
  <matplotlib.axis.XTick at 0x7fac695ff710>,
  <matplotlib.axis.XTick at 0x7fac695ff278>,
  <matplotlib.axis.XTick at 0x7fac6975ffd0>,
  <matplotlib.axis.XTick at 0x7fac6979deb8>,
  <matplotlib.axis.XTick at 0x7fac6979d7b8>,
  <matplotlib.axis.XTick at 0x7fac6982c7f0>,
  <matplotlib.axis.XTick at 0x7fac6983a940>,
  <matplotlib.axis.XTick at 0x7fac6983a978>],
 <a list of 20 Text xticklabel objects>)

<matplotlib.legend.Legend at 0x7fac68b5a9e8>

In [297]:
# plot distribution over subjects per lecture

# create dict with histogram
bns = np.arange(12)*10-1 # bins for histogram
response_dist={}
for this_video in videos:
    vals=score2plot[this_video].values
    response_dist[this_video] = np.histogram(vals,bins=bns,density=False)

fig2, ax2 = plt.subplots()
index = np.arange(len(videos))
opacity = 0.7
index = bns[0:-1]+1 # index for plot
d1 = plt.plot(index,response_dist['cos'][0], alpha=opacity, color = 'b', label='COS')#
d2 = plt.plot(index,response_dist['hist'][0], alpha=opacity, color = 'g', label='HIST')#
d3 = plt.plot(index,response_dist['pieman'][0], alpha=opacity, color = 'r', label='PIEMAN')#
plt.xlabel('Percent correct')
plt.ylabel('Number of subjects')
plt.title('Distribution of in-scanner test scores, over subjects')
plt.legend()


<matplotlib.text.Text at 0x7fac68099e80>

<matplotlib.text.Text at 0x7fac68da3c50>

<matplotlib.text.Text at 0x7fac695622e8>

<matplotlib.legend.Legend at 0x7fac69844dd8>

In [298]:
# save fig data as table
scores.to_csv(result_save_path+'subject_scores_by_video_'+str(len(score2plot))+'_subjects.txt')
# save fig1: per subject
fig1.savefig(fig_save_path+'subject_scores_by_video_'+str(len(score2plot))+'_subjects.jpg')
# save fig2: score distribution over all subjects
fig2.savefig(fig_save_path+'score_dist_by_video_'+str(len(score2plot))+'_subjects.jpg')

In [23]:
# assess question quality
# plot answer histogram per question and save plots
# as sanity check to see that there's a clear peak in the response distribution

# prep plt
plt.gcf().clear()
# select parameters: video and question number (1-10)
for choose_video in videos:
    print(choose_video)
    for choose_question in (np.arange(10)+1):
        print(choose_question)
        # filter df: video, question, exclusions
        df_filtered = all_logs[(all_logs.video == choose_video)
                                    & (all_logs.question_number == choose_question)
                                    & (all_logs.subject != subj_to_exclude)]
        #print (df_filtered)
        # extract data from df
        response = df_filtered['select_answer.response_raw'].values
        # remove nans
        response = response[~np.isnan(response)] 
        # create histogram
        response_hist=np.histogram(response,bins=[0,1.1,2.1,3.1,4.1],density=False)
        #print(response_hist)
        # plot histogram
        plt.bar(np.arange(len(response_hist[0])),response_hist[0])
        plt.title('vid: '+ choose_video + ', question = ' + str(int(choose_question))+' (N=20)')
        plt.savefig(fig_save_path+'dist_ans_per_q\\'+choose_video+'_'+'q'+str(int(choose_question)))
        plt.gcf().clear()
        
    



cos
1


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59f02fe80>

2


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59ef9d2e8>

3


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59eee2da0>

4


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59ed66470>

5


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59efb8cf8>

6


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59f0e4320>

7


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59eff4320>

8


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb5a2544198>

9


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59ef33208>

10


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59ee93470>

hist
1


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb5a2a786a0>

2


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59ef24438>

3


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb5a2620cf8>

4


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb5a25bb208>

5


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59ee1e908>

6


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb5a26ee7b8>

7


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb5a27367b8>

8


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59f1205c0>

9


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb5a26f8080>

10


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59f04ceb8>

pieman
1


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb5a2629908>

2


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59f06ecc0>

3


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59ee585c0>

4


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59f1367f0>

5


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59ed38cf8>

6


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59ef16198>

7


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59ef319b0>

8


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59eed91d0>

9


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59ef5b358>

10


<Container object of 4 artists>

<matplotlib.text.Text at 0x7fb59f21e908>

In [None]:
plt.close("all")

In [17]:
all_logs

Unnamed: 0,key_wait_trigger.rt_raw,order,order_presented,question_number,select_answer.response_raw,select_answer.rt_raw,subject,video
0,33.585987,0.0,0.0,1,2.0,14.082000,BS_082917,cos
1,1.367147,1.0,1.0,2,4.0,1.481000,BS_082917,cos
2,0.483978,2.0,2.0,3,1.0,28.115000,BS_082917,cos
3,0.833866,3.0,3.0,4,1.0,25.533001,BS_082917,cos
4,0.417096,4.0,4.0,5,3.0,6.065000,BS_082917,cos
5,,,5.0,6,,,BS_082917,cos
6,,,6.0,7,,,BS_082917,cos
7,,,7.0,8,,,BS_082917,cos
8,,,8.0,9,,,BS_082917,cos
9,,,9.0,10,,,BS_082917,cos
