# Bias modification data analysis

In [None]:
# import packages
import os
import json
import time
import pandas as pd
import numpy as np

## 1. Define functions for data extraction / analyses

In [None]:
# function for creating a dataframe based on each information in  PANAS/DOT/VS JSON files 

def file_info_to_dataframe(file_info_list, task_title):
    """Turn rows of task-file information into a dataframe ordered by file name.

    Parameters
    ----------
    file_info_list : list
        Rows of [ppt_id, file name, time_info, file_type].
    task_title : str
        Prefix of the file-name column (e.g. 'dot' gives 'dot_file_name').

    Returns
    -------
    pandas.DataFrame
        The input rows sorted by file name, with the added columns
        'time_cleaned', 'rolling_count_ppt' and 'time_date'.
    """
    name_column = task_title + '_file_name'
    info_df = pd.DataFrame(file_info_list, columns=['ppt_id', name_column, 'time_info', 'file_type'])

    # time_info is divided by 1000 and then read as seconds since the epoch
    epoch_seconds = info_df['time_info'].astype(int) / 1000
    info_df['time_cleaned'] = pd.to_datetime(epoch_seconds, unit='s')

    # order by file name and renumber the rows
    info_df = info_df.sort_values(name_column).reset_index(drop=True)

    # 1-based running count of files per ppt, in the sorted order
    info_df['rolling_count_ppt'] = info_df.groupby('ppt_id').cumcount() + 1

    # calendar date of the task, without hour/minute/second information
    info_df['time_date'] = info_df['time_cleaned'].dt.strftime('%Y-%m-%d')

    return info_df


In [None]:
# function for finding type of DOT or VS files (test, trial, active-training, sham-training)

def get_game_info(game_files, game_file_info):
    """Classify DOT or VS JSON files and record one row per file.

    For every file name in game_files, a row
    [ppt id, file name, time as string, file type] is appended to
    game_file_info, which is modified in place. The file type is one of
    'trial', 'test', 'active-training', 'sham-training' or 'other' and is
    read from the 'g_d' entry of the first item in the file's data list.
    """
    for file_name in game_files:
        with open(file_name) as json_file:
            contents = json.load(json_file)

        ppt = contents['bbuid']
        time_stamp = str(contents['time'])
        details = contents['data']['data'][0]['g_d']
        session = details['t']

        # anything that matches none of the known combinations is 'other'
        file_type = 'other'
        if session == 'test':
            flow = details['f']
            if flow == 'ftue':
                file_type = 'trial'
            elif flow == 'normal':
                file_type = 'test'
        elif session == 'training':
            variant = details['v']
            if variant == 'Active':
                file_type = 'active-training'
            elif variant == 'Sham':
                file_type = 'sham-training'

        game_file_info.append([ppt, file_name, time_stamp, file_type])


In [None]:
# function for creating a dataframe with ppt ids and the training type conducted, for DOT or VS

def split_game_ids_by_training_type_in_dfs(game_file_df_cleaned, game_active_training_df, game_sham_training_df):
    """List every ppt with a task file together with the training they did.

    Returns a dataframe with one row per unique ppt_id of
    game_file_df_cleaned and a 'training_type' column: 'active-training'
    when the id occurs in game_active_training_df, otherwise 'sham-training'
    when it occurs in game_sham_training_df, otherwise 'no-training'.
    """
    # one row per unique ppt id, in order of first appearance
    ids_df = pd.DataFrame({'ppt_id': game_file_df_cleaned['ppt_id'].unique()})

    did_active = ids_df['ppt_id'].isin(game_active_training_df['ppt_id'])
    did_sham = ids_df['ppt_id'].isin(game_sham_training_df['ppt_id'])

    # the first matching condition wins, so active is checked before sham
    ids_df['training_type'] = np.select(
        [did_active, did_sham],
        ['active-training', 'sham-training'],
        default='no-training',
    )

    return ids_df


In [None]:
# function used when analysing accuracy of DOT and VS test files

def is_correct(s):
    """Return 1 when s equals the string 'true', otherwise 0."""
    if s == 'true':
        return 1
    return 0


In [None]:
# function for adding DOT and VS training types to any mood dataframe

def add_game_training_type_to_mood_dfs(mood_values_df, game_training_type_df, game_title):
    """Add a game's training type to a mood dataframe.

    Parameters
    ----------
    mood_values_df : pandas.DataFrame
        Mood data with a 'ppt_id' column.
    game_training_type_df : pandas.DataFrame
        Dataframe with 'ppt_id' and 'training_type' columns.
    game_title : str
        Game name used in the new column name and in the placeholder value
        (e.g. 'dot' gives 'dot_training_type' and 'no-dot-info').

    Returns
    -------
    pandas.DataFrame
        A new dataframe; mood_values_df itself is not modified.
    """
    # merge data from game_training_type_df into the mood df by ppt_id
    split_mood_values_df = pd.merge(mood_values_df, game_training_type_df, how='left', on='ppt_id')

    # ppts without relevant task files get 'no-<game>-info'. Only the columns
    # that came from game_training_type_df are filled, so missing values in
    # the mood data itself are left untouched.
    merged_in_columns = [
        column for column in game_training_type_df.columns
        if column != 'ppt_id' and column in split_mood_values_df.columns
    ]
    placeholder = 'no-' + game_title + '-info'
    for column in merged_in_columns:
        split_mood_values_df[column] = split_mood_values_df[column].fillna(placeholder)

    # set name of training type column to match game_title
    training_column_name = game_title + '_training_type'
    split_mood_values_df = split_mood_values_df.rename(columns={'training_type': training_column_name})

    return split_mood_values_df


In [None]:
# function for setting overall training category of participants, based on DOT and VS data

def mood_overall_training_category(mood_responses_by_training_df):
    """Add an 'overall_training_type' column based on the DOT and VS training types.

    The column is written to the dataframe that was passed in, which is also
    returned. Categories are checked in the order DOT-active, DOT-sham,
    VS-active, VS-sham; rows matching none of them get 'no-training'.
    """
    dot_type = mood_responses_by_training_df['dot_training_type']
    vs_type = mood_responses_by_training_df['vs_training_type']

    # (category label, rows belonging to it); earlier entries take priority
    labelled_conditions = [
        ('dot-active-training', dot_type == "active-training"),
        ('dot-sham-training', dot_type == "sham-training"),
        ('vs-active-training', vs_type == "active-training"),
        ('vs-sham-training', vs_type == "sham-training"),
    ]
    overall_train_choices = [label for label, _ in labelled_conditions]
    overall_train_conditions = [condition for _, condition in labelled_conditions]

    mood_responses_by_training_df['overall_training_type'] = np.select(
        overall_train_conditions, overall_train_choices, default='no-training'
    )

    return mood_responses_by_training_df


## 2. Sort into mood, dot and vs files

In [None]:
# alphabetical list of the entries in the current directory, skipping hidden ones
all_files = sorted(name for name in os.listdir('./') if not name.startswith('.'))


In [None]:
# one list of file names per category
vs_files, dot_files, mood_files, other_files = [], [], [], []

# rows used later to build a df with cols: file name, ppt id, time, type ('mood', 'dot', 'vs')
file_info = []


In [None]:
# time how long reading and categorising all files takes
start_file_sort = time.time()

for file_name in all_files:

    with open(file_name) as json_file:
        data = json.load(json_file)  # load JSON file

    ppt_id = data['bbuid']  # participant id
    time_file = str(data['time'])  # time info from the file, kept as a string
    payload = data['data']

    # mood files: save file name, ppt_id, time info, and file type (mood)
    if 'm_d' in payload:
        mood_files.append(file_name)
        file_info.append([file_name, ppt_id, time_file, 'mood'])
        continue

    # without a 'data' entry the file is neither DOT nor VS
    if 'data' not in payload:
        other_files.append(file_name)
        continue

    records = payload['data']

    # an empty 'data' entry is categorised as other, too
    if len(records) == 0:
        other_files.append(file_name)
        continue

    # the game name of the first record decides between VS ('SMI') and DOT
    game = records[0]['g_d']['g']
    if game == 'SMI':
        vs_files.append(file_name)
        file_info.append([file_name, ppt_id, time_file, 'vs'])
    elif game == 'DOT':
        dot_files.append(file_name)
        file_info.append([file_name, ppt_id, time_file, 'dot'])
    else:
        other_files.append(file_name)

# end timer for duration of processing files
end_file_sort = time.time()


In [None]:
# Describe how long sorting took and how many files fell into each category

elapsed_time_sort = end_file_sort - start_file_sort
sort_duration = time.strftime("%H:%M:%S", time.gmtime(elapsed_time_sort))
print('Time to read and sort files in %H:%M:%S format: ', sort_duration)

# number of files per category
n_mood, n_dot, n_vs, n_other = (len(group) for group in (mood_files, dot_files, vs_files, other_files))
print('Number of mood files: ', n_mood)
print('Number of dot files: ', n_dot)
print('Number of vs files: ', n_vs)
print('Number of other files: ', n_other)
print('Total number of files: ', n_mood + n_dot + n_vs + n_other)


In [None]:
# check a file categorised as 'other' - no data was saved
# (guarded so the cell does not fail when there are no 'other' files)
if other_files:
    with open(other_files[0]) as json_file:
        data = json.load(json_file)
    print(data)
else:
    print('No files were categorised as other')


In [None]:
# build file_df: one row per mood/DOT/VS file with 'file_name', 'ppt_id', 'time_info', 'type'
file_df = pd.DataFrame(file_info, columns=['file_name', 'ppt_id', 'time_info', 'type'])

# time_info is divided by 1000 and read as seconds since the epoch
file_epoch_seconds = file_df['time_info'].astype(int) / 1000
file_df['time_cleaned'] = pd.to_datetime(file_epoch_seconds, unit='s')

# order by file_name and renumber the rows
file_df = file_df.sort_values('file_name').reset_index(drop=True)

# 1-based running count of files per ppt, then the calendar day of each file
file_df['rolling_count_ppt'] = file_df.groupby('ppt_id').cumcount() + 1
file_df['time_day'] = file_df['time_cleaned'].dt.strftime('%Y-%m-%d')


In [None]:
# display the first rows of file_df to check its structure
file_df.head()


In [None]:
# unique ppt IDs from the 'ppt_id' column of file_df, in order of first appearance
unique_ppt_ids = file_df['ppt_id'].drop_duplicates()
all_ppts = unique_ppt_ids.to_list()
print('Number of unique ppts: ', len(all_ppts))


In [None]:
# save the mood, VS and DOT file names and the list of all ppt IDs,
# each as a text file with one entry per line
names_by_output_path = {
    "../mood_files.txt": mood_files,
    "../vs_files.txt": vs_files,
    "../dot_files.txt": dot_files,
    "../ppt_ids.txt": all_ppts,
}

for output_path, names in names_by_output_path.items():
    with open(output_path, "w") as outfile:
        outfile.write("\n".join(map(str, names)))

# save file_info dataframe as CSV
file_df.to_csv('../file_info.csv', index=False)



## 3. Analyse dot-probe (DOT) data

In [None]:
# read the names of all DOT files from the text file (one name per line);
# the context manager closes the file even if reading fails
with open("../dot_files.txt", "r") as dot_txt:
    dot_content = dot_txt.read()

# skip empty entries: an empty file or a trailing newline would otherwise
# produce '' as a file name, which cannot be opened later
dot_files = [name for name in dot_content.split("\n") if name]


### Create dataframe listing information about DOT files

In [None]:
# rows of [ppt id, file name, time, type] for every DOT file, filled by get_game_info
dot_file_info = []

# time how long classifying the DOT files takes
start_dot_file_sort = time.time()
get_game_info(dot_files, dot_file_info)
end_dot_file_sort = time.time()

elapsed_dot_file_sort_time = end_dot_file_sort - start_dot_file_sort
dot_file_sort_duration = time.strftime("%H:%M:%S", time.gmtime(elapsed_dot_file_sort_time))

print('Time to read and sort DOT files in %H:%M:%S format: ', dot_file_sort_duration)


In [None]:
# column order that groups similar columns together
dot_column_order = ['ppt_id', 'dot_file_name', 'file_type', 'time_info', 'time_cleaned', 'time_date', 'rolling_count_ppt']

# build the DOT file dataframe and apply the column order
dot_file_df = file_info_to_dataframe(dot_file_info, 'dot')
dot_file_df = dot_file_df[dot_column_order]

# store time_info as a number
dot_file_df['time_info'] = pd.to_numeric(dot_file_df['time_info'])


In [None]:
# total number of DOT files = number of rows in dot_file_df
print('Total number of DOT files: ', dot_file_df.shape[0])


### Split participants by training type (active, sham), and remove any cases where participants did both types of training

In [None]:
# training files, split by file_type
dot_is_active_file = dot_file_df['file_type'] == "active-training"
dot_is_sham_file = dot_file_df['file_type'] == "sham-training"
dot_active_training_df = dot_file_df[dot_is_active_file]
dot_sham_training_df = dot_file_df[dot_is_sham_file]

# ppts who did both types of training appear in both dataframes:
# collect their active and their sham files, ordered by file name
dot_active_of_mixed = dot_active_training_df[dot_active_training_df["ppt_id"].isin(dot_sham_training_df["ppt_id"])]
dot_sham_of_mixed = dot_sham_training_df[dot_sham_training_df["ppt_id"].isin(dot_active_training_df["ppt_id"])]
dot_active_sham_df = (
    pd.concat([dot_active_of_mixed, dot_sham_of_mixed])
    .sort_values('dot_file_name')
    .reset_index(drop=True)
)

print('Number of patients with both dot active and sham training: ', dot_active_sham_df['ppt_id'].nunique())
print('Patient IDs with both DOT active and sham training: ', dot_active_sham_df['ppt_id'].unique())

print('Number patients with 1+ DOT active-training, before removing participants with both training types: ', dot_active_training_df['ppt_id'].nunique())
print('Number patients with 1+ DOT sham-training, before removing participants with both training types: ', dot_sham_training_df['ppt_id'].nunique())


In [None]:
# both ppts did several dot sham trainings before active training, so all
# their files from (and including) the first active training are deleted

# time_info has to be numeric for the comparison below
dot_file_df['time_info'] = pd.to_numeric(dot_file_df['time_info'])

# ppt id -> time of the first file to delete
# (inspect a ppt's files with dot_file_df[dot_file_df['ppt_id'].str.match(<ppt id>)])
#   'AAlglYcdjt': file_id 'AAlglYcdjt-1496548054352-0'
#   'NfScLwnQSr': file_id 'NfScLwnQSr-1496686814167-0'
dot_first_deleted_time = {
    'AAlglYcdjt': 1496548054352,
    'NfScLwnQSr': 1496686814167,
}

dot_file_df_cleaned = dot_file_df
for dot_ppt, dot_cutoff in dot_first_deleted_time.items():
    dot_rows_to_delete = (dot_file_df_cleaned['ppt_id'] == dot_ppt) & (dot_file_df_cleaned['time_info'] >= dot_cutoff)
    dot_file_df_cleaned = dot_file_df_cleaned.loc[~dot_rows_to_delete, :]

print('Number of rows and columns, before removing participants with both active and sham training: ', dot_file_df.shape)
print('Number of rows and columns, after removing participants with both active and sham training: ', dot_file_df_cleaned.shape)


### Describe counts of files and unique participants with different DOT file types

In [None]:
# one dataframe per file type of the cleaned DOT files:
# active-training, sham-training, test, trial, and other (if any)
dot_cleaned_file_type = dot_file_df_cleaned['file_type']

dot_active_training_df = dot_file_df_cleaned.loc[dot_cleaned_file_type == "active-training"]
dot_sham_training_df = dot_file_df_cleaned.loc[dot_cleaned_file_type == "sham-training"]
dot_test_files_df = dot_file_df_cleaned.loc[dot_cleaned_file_type == "test"]
dot_trial_files_df = dot_file_df_cleaned.loc[dot_cleaned_file_type == "trial"]
dot_other_df = dot_file_df_cleaned.loc[dot_cleaned_file_type == "other"]


In [None]:
# check counts are as expected; the cleaned dataframe is used here because
# the test/trial/other dataframes below are derived from it
print('Number of unique ppts with any DOT responses: ', dot_file_df_cleaned.ppt_id.nunique())

print('Number of unique patients with DOT tests: ', dot_test_files_df.ppt_id.nunique())
print('Number of DOT test files: ', len(dot_test_files_df))

print('Number of participants with DOT trial files: ', dot_trial_files_df.ppt_id.nunique())

print('Number of non-training, test, or trial DOT files (if any): ', len(dot_other_df))


### Create a dataframe with all ppt IDs of participants with DOT files, and list their training type (active-training, sham-training, no-training)

In [None]:
# one row per ppt with DOT files, labelled active-training, sham-training or no-training
dot_ids_training_type_df = split_game_ids_by_training_type_in_dfs(
    game_file_df_cleaned=dot_file_df_cleaned,
    game_active_training_df=dot_active_training_df,
    game_sham_training_df=dot_sham_training_df,
)


In [None]:
# Count the DOT participants in each training type
dot_training_type_counts = dot_ids_training_type_df.groupby('training_type').agg(['count'])
print('Number of unique participants grouped by training type: \n', dot_training_type_counts)

dot_ids_training_type_df.head()


### Extract reaction time, accuracy, and trial numbers for each DOT test file

New fields to be added:
* mean_rt (average reaction time for an individual test session)
* mean_acc (average response accuracy for each file; max=1.0)
* num_trials
* rolling_test_count_by_ppt
* total_test_count_by_ppt
* training_type
* days_elapsed_from_first_test


In [None]:
# dot_test_df holds the DOT test files (without the rolling file count);
# response information is added to it afterwards
dot_test_df = dot_test_files_df.drop(columns='rolling_count_ppt').reset_index(drop=True)


In [None]:
def _dot_trial_scores(trials, skip_without_rt):
    """Return (reaction times, accuracy flags) for the entries of one DOT test.

    With skip_without_rt set, entries that have no 'r_t' key are left out;
    otherwise every entry is read and a missing key raises KeyError.
    """
    scored = [trial for trial in trials if 'r_t' in trial] if skip_without_rt else trials
    reaction_times = [int(trial['r_t']) for trial in scored]
    accuracies = [is_correct(trial['res']) for trial in scored]
    return reaction_times, accuracies


start_dot_test_read = time.time()

for index, row in dot_test_df.iterrows():
    with open(row['dot_file_name']) as json_file:
        dot_test_data = json.load(json_file)

    response_data = dot_test_data['data']['data'][0]['r_d']

    # files whose first entry has 'r_s' (information on images presented to
    # ppts) are the only ones where entries without 'r_t' are skipped
    has_image_info = 'r_s' in response_data[0]
    rt_trials, acc_trials = _dot_trial_scores(response_data, has_image_info)

    # mean reaction time, mean accuracy and number of scored trials for this file
    dot_test_df.loc[index, 'mean_rt'] = np.mean(rt_trials)
    dot_test_df.loc[index, 'mean_acc'] = np.mean(acc_trials)
    dot_test_df.loc[index, 'num_trials'] = len(rt_trials)


end_dot_test_read = time.time()
elapsed_dot_test_read_time = end_dot_test_read - start_dot_test_read

print('Time to read and calculate mean RTs and accuracies of DOT test files in %H:%M:%S format: ', time.strftime("%H:%M:%S", time.gmtime(elapsed_dot_test_read_time)))


In [None]:
# dot_test_df.head()

### Remove outliers from all test files

In [None]:
# check for unusually fast reaction times
dot_fast_rts = dot_test_df[dot_test_df['mean_rt'] < 50]
print('Number of DOT test files with reaction times < 50ms: ', dot_fast_rts.ppt_id.count())

# check for unusually slow reaction times
dot_slow_rts = dot_test_df[dot_test_df['mean_rt'] > 5000]
print('Number of DOT test files with reaction times > 5000ms: ', dot_slow_rts.ppt_id.count())

# remove fast AND slow RTs (previously only the slow ones were dropped,
# so files with mean RTs below 50ms stayed in the cleaned data)
dot_rt_outlier_index = dot_fast_rts.index.union(dot_slow_rts.index)
dot_test_df_cleaned = dot_test_df.drop(dot_rt_outlier_index)

# check for tests where participants were answering below chance (accuracy < 0.5)
dot_low_accuracy = dot_test_df_cleaned[dot_test_df_cleaned['mean_acc'] < 0.5]
print('Number of DOT test files with accuracies < 0.5: ', dot_low_accuracy.ppt_id.count())

# remove low accuracies
dot_test_df_cleaned = dot_test_df_cleaned.drop(dot_low_accuracy.index)

# reset index of dataframe
dot_test_df_cleaned.reset_index(drop=True, inplace=True)


In [None]:
# range of mean reaction times and accuracies that remain after cleaning
dot_slowest_rt = np.round(dot_test_df_cleaned['mean_rt'].max(), decimals=2)
dot_fastest_rt = np.round(dot_test_df_cleaned['mean_rt'].min(), decimals=2)
dot_lowest_acc = np.round(dot_test_df_cleaned['mean_acc'].min(), decimals=2)
dot_highest_acc = np.round(dot_test_df_cleaned['mean_acc'].max(), decimals=2)

print('Number of DOT test files before removing outliers: ', len(dot_test_df))
print('Number of DOT test files after removing outliers: ', len(dot_test_df_cleaned))
print('Slowested RT in cleaned DOT test files, after removing outliers: ', dot_slowest_rt,
      '\n Fastest RT in cleaned DOT test files, after removing outliers: ', dot_fastest_rt)
print('Lowest accuracy in cleaned DOT test files, after removing outliers: ', dot_lowest_acc,
      '\n Highest accuracy in cleaned DOT test files, after removing outliers: ', dot_highest_acc)


In [None]:
# 1-based running number of each DOT test within its ppt
dot_rolling_test_count = dot_test_df_cleaned.groupby('ppt_id').cumcount() + 1
dot_test_df_cleaned['rolling_test_count_by_ppt'] = dot_rolling_test_count

# total number of test sessions of the ppt, repeated on each of their rows
dot_total_test_count = dot_test_df_cleaned.groupby('ppt_id')['ppt_id'].transform('count')
dot_test_df_cleaned['total_test_count_by_ppt'] = dot_total_test_count


### Add DOT training type of each ppt to DOT test df

In [None]:
# Look up each test's training type (active, sham, none) by ppt_id in dot_ids_training_type_df
dot_training_type_by_ppt = dot_ids_training_type_df.set_index('ppt_id')['training_type']
dot_test_df_cleaned['training_type'] = dot_test_df_cleaned['ppt_id'].map(dot_training_type_by_ppt)


### For each participant, calculate the number of days between the first session and each subsequent session ('days_elapsed_from_first_test')


In [None]:
# Add days_elapsed_from_first_test column to the DOT test df - for each ppt,
# this is the time between a session's date and the date of the ppt's first row
dot_test_df_cleaned['time_date'] = pd.to_datetime(dot_test_df_cleaned['time_date'])

start_dot_test_days_elapsed_calc = time.time()

# transform('first') returns the first date of each ppt aligned to the
# original rows. groupby(...).apply was replaced because, from pandas 2.0 on,
# it prepends the group keys to the result index, which no longer lines up
# with the dataframe index when assigning the column.
dot_first_test_date = dot_test_df_cleaned.groupby('ppt_id')['time_date'].transform('first')
dot_test_df_cleaned = dot_test_df_cleaned.assign(
    days_elapsed_from_first_test=dot_test_df_cleaned['time_date'] - dot_first_test_date
)

end_dot_test_days_elapsed_calc = time.time()

elapsed_dot_test_days_elapsed_calc_time = end_dot_test_days_elapsed_calc - start_dot_test_days_elapsed_calc

print('Time to calculate days elapsed from first DOT test files for each ppt in %H:%M:%S format: ', 
      time.strftime("%H:%M:%S", time.gmtime(elapsed_dot_test_days_elapsed_calc_time)))



### Find numbers of patients with DOT test data after removing outliers, and split by training type

In [None]:
num_dot_test_ppts_analysis = dot_test_df_cleaned.ppt_id.nunique()
print('Number of unique participants included in DOT test analysis, after removing outliers: ', num_dot_test_ppts_analysis)

# check numbers are expected, and that no row doesn't have a training type.
# nunique() is called without arguments: its first parameter is `dropna`,
# so the column name that used to be passed was read as that flag.
dot_tests_by_training = dot_test_df_cleaned[['ppt_id', 'training_type']].groupby('training_type')
print('Number of unique patients with DOT test files, split by training: \n', dot_tests_by_training.nunique())
print('Number of DOT test files, split by training performed by ppt: \n', dot_tests_by_training.count())


In [None]:
# display the first rows of the cleaned DOT test dataframe to check its structure
dot_test_df_cleaned.head()

### Save csvs for future use

In [None]:
# dot_file_df.to_csv(path_or_buf='../dot_file_info_training_not_cleaned.csv', index=False)
# dot_file_df_cleaned.to_csv(path_or_buf='../dot_file_info_training_cleaned.csv', index=False)
# dot_ids_training_type_df.to_csv(path_or_buf='../dot_ids_training_type_df.csv', index=False)
# dot_test_df.to_csv(path_or_buf='../dot_test_responses_df_with_outliers.csv', index=False)
# dot_test_df_cleaned.to_csv(path_or_buf='../dot_test_responses_df_without_outliers.csv', index=False)


## 4. Analyse visual-search (VS) data

In [None]:
# read the names of all VS files from the text file (one name per line);
# the context manager closes the file even if reading fails
with open("../vs_files.txt", "r") as vs_txt:
    vs_content = vs_txt.read()

# skip empty entries: an empty file or a trailing newline would otherwise
# produce '' as a file name, which cannot be opened later
vs_files = [name for name in vs_content.split("\n") if name]

### Create dataframe listing information about VS files

In [None]:
# rows of [ppt id, file name, time, type] for every VS file, filled by get_game_info
vs_file_info = []

# time how long classifying the VS files takes
start_vs_file_sort = time.time()
get_game_info(vs_files, vs_file_info)
end_vs_file_sort = time.time()

elapsed_vs_file_sort_time = end_vs_file_sort - start_vs_file_sort
vs_file_sort_duration = time.strftime("%H:%M:%S", time.gmtime(elapsed_vs_file_sort_time))

print('Time to read and sort VS files in %H:%M:%S format: ', vs_file_sort_duration)


In [None]:
# column order that groups similar columns together
vs_column_order = ['ppt_id', 'vs_file_name', 'file_type', 'time_info', 'time_cleaned', 'time_date', 'rolling_count_ppt']

# build the VS file dataframe and apply the column order
vs_file_df = file_info_to_dataframe(vs_file_info, 'vs')
vs_file_df = vs_file_df[vs_column_order]

# store time_info as a number
vs_file_df['time_info'] = pd.to_numeric(vs_file_df['time_info'])


In [None]:
# total number of VS files = number of rows in vs_file_df
print('Total number of VS files: ', vs_file_df.shape[0])


### Split participants by training type (active, sham), and remove any cases where participants did both types of training


In [None]:
# training files, split by file_type
vs_is_active_file = vs_file_df['file_type'] == "active-training"
vs_is_sham_file = vs_file_df['file_type'] == "sham-training"
vs_active_training_df = vs_file_df[vs_is_active_file]
vs_sham_training_df = vs_file_df[vs_is_sham_file]

# ppts who did both types of training appear in both dataframes:
# collect their active and their sham files, ordered by file name
vs_active_of_mixed = vs_active_training_df[vs_active_training_df["ppt_id"].isin(vs_sham_training_df["ppt_id"])]
vs_sham_of_mixed = vs_sham_training_df[vs_sham_training_df["ppt_id"].isin(vs_active_training_df["ppt_id"])]
vs_active_sham_df = (
    pd.concat([vs_active_of_mixed, vs_sham_of_mixed])
    .sort_values('vs_file_name')
    .reset_index(drop=True)
)


In [None]:
# describe the ppts with both training types and the size of each training group
vs_mixed_ppt_ids = vs_active_sham_df['ppt_id']
print('Number of patients with both VS active and sham training: ', vs_mixed_ppt_ids.nunique())
print('Patient IDs with both VS active and sham training: ', vs_mixed_ppt_ids.unique())

vs_n_active_ppts = vs_active_training_df['ppt_id'].nunique()
vs_n_sham_ppts = vs_sham_training_df['ppt_id'].nunique()
print('Number patients with 1+ VS active-training, before removing any participants with both training types: ', vs_n_active_ppts)
print('Number patients with 1+ VS sham-training, before removing any participants with both training types: ', vs_n_sham_ppts)


In [None]:
# # inspect the files of the patient with active and sham training
# vs_file_df[vs_file_df['ppt_id'].str.match('gnUnXlzcJP')]
print('Participant did several VS active trainings before sham training. \n \
Remove all files from + including the first time with sham training.')

# rows of ppt 'gnUnXlzcJP' from time 1501311001449 onwards are removed
vs_rows_to_delete = (vs_file_df['ppt_id'] == 'gnUnXlzcJP') & (vs_file_df['time_info'] >= 1501311001449)
vs_file_df_cleaned = vs_file_df.loc[~vs_rows_to_delete, :]


In [None]:
# compare the dataframe shape before and after removing these rows
vs_shape_before = vs_file_df.shape
vs_shape_after = vs_file_df_cleaned.shape
print('Number of rows and columns, before removing participants with both active and sham training: ', vs_shape_before)
print('Number of rows and columns, after removing participants with both active and sham training: ', vs_shape_after)

### Describe counts of files and unique participants with different VS file types


In [None]:
# one dataframe per file type of the cleaned VS files:
# active-training, sham-training, test, trial, and other (if any)
vs_cleaned_file_type = vs_file_df_cleaned['file_type']

vs_active_training_df = vs_file_df_cleaned.loc[vs_cleaned_file_type == "active-training"]
vs_sham_training_df = vs_file_df_cleaned.loc[vs_cleaned_file_type == "sham-training"]
vs_test_files_df = vs_file_df_cleaned.loc[vs_cleaned_file_type == "test"]
vs_trial_files_df = vs_file_df_cleaned.loc[vs_cleaned_file_type == "trial"]
vs_other_df = vs_file_df_cleaned.loc[vs_cleaned_file_type == "other"]


In [None]:
# check counts are as expected
vs_n_ppts_any = vs_file_df_cleaned['ppt_id'].nunique()
print('Number of unique ppts with any VS responses: ', vs_n_ppts_any)

vs_n_ppts_tests = vs_test_files_df['ppt_id'].nunique()
print('Number of unique patients with VS tests: ', vs_n_ppts_tests)
print('Number of VS test files: ', len(vs_test_files_df))

vs_n_ppts_trials = vs_trial_files_df['ppt_id'].nunique()
print('Number of participants with VS trial files: ', vs_n_ppts_trials)

print('Number of non-training, test, or trial VS files (if any): ', len(vs_other_df))


### Create a dataframe with all ppt IDs of participants with VS files, and list their training type (active-training, sham-training, no-training)


In [None]:
# one row per ppt with VS files, labelled active-training, sham-training or no-training
vs_ids_training_type_df = split_game_ids_by_training_type_in_dfs(
    game_file_df_cleaned=vs_file_df_cleaned,
    game_active_training_df=vs_active_training_df,
    game_sham_training_df=vs_sham_training_df,
)


In [None]:
# Count the VS participants in each training type
vs_training_type_counts = vs_ids_training_type_df.groupby('training_type').agg(['count'])
print('Number of unique participants grouped by training type: \n', vs_training_type_counts)

vs_ids_training_type_df.head()

### Extract reaction time, accuracy, and trial numbers for each VS test file

New fields to be added:
* reaction_time (average RT for individual test session)
* accuracy (average response accuracy for each file; max=1.0)
* number_trials
* rolling_test_count
* total_test_count
* ppt_training_type
* days_elapsed_from_first_test


In [None]:
# vs_test_df holds the VS test files (without the rolling file count);
# response information is added to it afterwards
vs_test_df = vs_test_files_df.drop(columns='rolling_count_ppt').reset_index(drop=True)

In [None]:
start_vs_test_read = time.time()

for index, row in vs_test_df.iterrows():
    with open(row['vs_file_name']) as json_file:
        vs_test_data = json.load(json_file)

        # access first section of vs test data
        response_data_1 = vs_test_data['data']['data'][0]['r_d']
        # access second section of vs test data
        response_data_2 = vs_test_data['data']['data'][1]['r_d']
        
        # work with cases that have information on  images presented to ppts:
        if 'r_s' in response_data_1[0]: 

            # get reaction time and accuracy from first section of vs test data
            # get number of items
            len_trial_items_1 = len(response_data_1)
            # counter to find number of trials
            num_trials_1 = 0
            # lists to store reaction times and accuracies
            rt_trials_1 = []
            acc_trials_1 = []
            # extract the rt and acc scores
            for i in range(len_trial_items_1):
                if 'r_t' in response_data_1[i]:
                    rt_score = response_data_1[i]['r_t']
                    rt_trials_1.append(int(rt_score))
                    accuracy_score = response_data_1[i]['res']
                    acc_trials_1.append(is_correct(accuracy_score))
                    num_trials_1 += 1


            # get number of items in second section of vs test data
            len_trial_items_2 = len(response_data_2)
            # counter to find number of trials in first section of vs test data
            num_trials_2 = 0 
            # lists to store reaction times and accuracies
            rt_trials_2 = []
            acc_trials_2 = []
            # extract the rt and acc scores
            for i in range(len_trial_items_2):
                if 'r_t' in response_data_2[i]:
                    rt_score = response_data_2[i]['r_t']
                    rt_trials_2.append(int(rt_score))
                    accuracy_score = response_data_2[i]['res']
                    acc_trials_2.append(is_correct(accuracy_score))
                    num_trials_2 += 1

            # find combined reaction time, accuracy, and total number of trials across whole test session
            rt_trials_all = rt_trials_1 + rt_trials_2
            acc_trials_all = acc_trials_1 + acc_trials_2
            num_trials_all = num_trials_1 + num_trials_2

            # add mean reaction time, accuracy, and number of trials for the VS test file to the test dataframe
            vs_test_df.loc[index, 'mean_rt'] = np.mean(rt_trials_all)
            vs_test_df.loc[index, 'mean_acc'] = np.mean(acc_trials_all)
            vs_test_df.loc[index, 'num_trials'] = num_trials_all
        
        else: 

            # get reaction time and accuracy from first section of vs test data
            # get number of items
            len_trial_items_1 = len(response_data_1)
            # counter to find number of trials
            num_trials_1 = 0
            # lists to store reaction times and accuracies
            rt_trials_1 = []
            acc_trials_1 = []
            # extract the rt and acc scores
            for i in range(len_trial_items_1):
                rt_score = response_data_1[i]['r_t']
                rt_trials_1.append(int(rt_score))
                accuracy_score = response_data_1[i]['res']
                acc_trials_1.append(is_correct(accuracy_score))
                num_trials_1 += 1


            # get number of items in second section of vs test data
            len_trial_items_2 = len(response_data_2)
            # counter to find number of trials in first section of vs test data
            num_trials_2 = 0 
            # lists to store reaction times and accuracies
            rt_trials_2 = []
            acc_trials_2 = []
            # extract the rt and acc scores
            for i in range(len_trial_items_2):
                rt_score = response_data_2[i]['r_t']
                rt_trials_2.append(int(rt_score))
                accuracy_score = response_data_2[i]['res']
                acc_trials_2.append(is_correct(accuracy_score))
                num_trials_2 += 1

            # find combined reaction time, accuracy, and total number of trials across whole test session
            rt_trials_all = rt_trials_1 + rt_trials_2
            acc_trials_all = acc_trials_1 + acc_trials_2
            num_trials_all = num_trials_1 + num_trials_2

            # add mean reaction time, accuracy, and number of trials for the VS test file to the test dataframe
            vs_test_df.loc[index, 'mean_rt'] = np.mean(rt_trials_all)
            vs_test_df.loc[index, 'mean_acc'] = np.mean(acc_trials_all)
            vs_test_df.loc[index, 'num_trials'] = num_trials_all

# stop the timer (start_vs_test_read is set earlier in the notebook) and compute the elapsed seconds
end_vs_test_read = time.time()
elapsed_vs_test_read_time = end_vs_test_read - start_vs_test_read

# format the elapsed seconds as hours:minutes:seconds before reporting
vs_test_read_hms = time.strftime("%H:%M:%S", time.gmtime(elapsed_vs_test_read_time))
print('Time to read and calculate mean RTs and accuracies of VS test files in %H:%M:%S format: ', vs_test_read_hms)


In [None]:
# vs_test_df.head()

### Remove outliers from all test files


In [None]:
# Outlier removal is sequential: each filter below is applied to the rows the previous filter kept.

# 1) test files whose mean reaction time is below 50
vs_fast_rts = vs_test_df.loc[vs_test_df['mean_rt'] < 50]
print('Number of VS test files with reaction times < 50ms: ', vs_fast_rts['ppt_id'].count())
vs_test_df_cleaned = vs_test_df.drop(index=vs_fast_rts.index)

# 2) of the remaining files, those whose mean reaction time is above 7750
vs_slow_rts = vs_test_df_cleaned.loc[vs_test_df_cleaned['mean_rt'] > 7750]
print('Number of VS test files with reaction times > 7750, after removing fast RTs: ', vs_slow_rts['ppt_id'].count())
vs_test_df_cleaned = vs_test_df_cleaned.drop(index=vs_slow_rts.index)

# 3) of the remaining files, those answered below chance (mean accuracy < 0.5)
vs_low_accuracy = vs_test_df_cleaned.loc[vs_test_df_cleaned['mean_acc'] < 0.5]
print('Number of VS test files with accuracies < 0.5, after removing fast & slow RTs: ', vs_low_accuracy['ppt_id'].count())
vs_test_df_cleaned = vs_test_df_cleaned.drop(index=vs_low_accuracy.index)

# renumber the surviving rows from 0
vs_test_df_cleaned = vs_test_df_cleaned.reset_index(drop=True)


In [None]:
# summarise the effect of outlier removal: row counts, then RT and accuracy ranges of the cleaned data
cleaned_vs_mean_rt = vs_test_df_cleaned['mean_rt']
cleaned_vs_mean_acc = vs_test_df_cleaned['mean_acc']

print('Number of VS test files before removing outliers: ', len(vs_test_df))
print('Number of VS test files after removing outliers: ', len(vs_test_df_cleaned))
print(
    'Slowested RT in cleaned VS test files, after removing outliers: ',
    np.round(cleaned_vs_mean_rt.max(), decimals=2),
    '\n Fastest RT in cleaned VS test files, after removing outliers: ',
    np.round(cleaned_vs_mean_rt.min(), decimals=2),
)
print(
    'Lowest accuracy in cleaned VS test files, after removing outliers: ',
    np.round(cleaned_vs_mean_acc.min(), decimals=2),
    '\n Highest accuracy in cleaned VS test files, after removing outliers: ',
    np.round(cleaned_vs_mean_acc.max(), decimals=2),
)


### Add additional fields to VS test df


In [None]:
# group the cleaned VS tests by participant once and derive both counters from that grouping
vs_tests_by_ppt = vs_test_df_cleaned.groupby('ppt_id')

# 1-based position of each test within its participant's tests (in current row order)
vs_rolling_test_count = vs_tests_by_ppt.cumcount() + 1
# number of tests the participant has in the cleaned data, repeated on each of their rows
vs_total_test_count = vs_tests_by_ppt['ppt_id'].transform('count')

vs_test_df_cleaned['rolling_test_count_by_ppt'] = vs_rolling_test_count
vs_test_df_cleaned['total_test_count_by_ppt'] = vs_total_test_count


In [None]:
# build a ppt_id -> training_type lookup from vs_ids_training_type_df ...
vs_training_type_by_ppt = vs_ids_training_type_df.set_index('ppt_id')['training_type']
# ... and use it to label every VS test row with its participant's training type
vs_test_df_cleaned['training_type'] = vs_test_df_cleaned['ppt_id'].map(vs_training_type_by_ppt)


In [None]:
# Add days_elapsed_from_first_test column to the VS test df - for each ppt, this is the time
# between a session's date and the date of that ppt's first session (a timedelta column)
vs_test_df_cleaned['time_date'] = pd.to_datetime(vs_test_df_cleaned['time_date'])

start_vs_test_days_elapsed_calc = time.time()

# transform('first') broadcasts each ppt's first (non-null) test date back onto that ppt's rows,
# keeping the original row index. The previous groupby(...).apply(lambda x: x - x.iloc[0]) returns
# a result with an extra ppt_id index level on pandas >= 2.0, which assign() cannot align.
first_vs_test_date_by_row = vs_test_df_cleaned.groupby('ppt_id')['time_date'].transform('first')
vs_test_df_cleaned = vs_test_df_cleaned.assign(
    days_elapsed_from_first_test=vs_test_df_cleaned['time_date'] - first_vs_test_date_by_row
)

end_vs_test_days_elapsed_calc = time.time()

elapsed_vs_test_days_elapsed_calc_time = end_vs_test_days_elapsed_calc - start_vs_test_days_elapsed_calc

print('Time to calculate days elapsed from first VS test files for each ppt in %H:%M:%S format: ', 
      time.strftime("%H:%M:%S", time.gmtime(elapsed_vs_test_days_elapsed_calc_time)))


### Find numbers of patients with VS test data after removing outliers, and split by training type


In [None]:
# number of distinct participants left in the cleaned VS test data
num_vs_test_ppts_analysis = vs_test_df_cleaned.ppt_id.nunique()
print('Number of unique participants included in VS test analysis, after removing outliers: ', num_vs_test_ppts_analysis)

# check numbers are expected, and that no row doesn't have a training type.
# groupby drops rows whose training_type is missing, so the grouped tables alone cannot reveal
# them; count the missing values explicitly.
print('Number of VS test files without a training type: ', vs_test_df_cleaned['training_type'].isna().sum())

# select the ppt_id column before nunique(): the only argument nunique() accepts is the `dropna`
# flag, so passing 'ppt_id' to it did not select a column
print('Number of unique patients with VS test files, split by training: \n', vs_test_df_cleaned.groupby('training_type')[['ppt_id']].nunique())
print('Number of VS test files, split by training performed by ppt: \n', vs_test_df_cleaned[['ppt_id', 'training_type']].groupby('training_type').count())


In [None]:
# view structure of VS test dataframe: show the first rows of the cleaned frame, including the
# columns added above (rolling/total test counts, training_type, days_elapsed_from_first_test)
vs_test_df_cleaned.head()

### Save csvs for future use


In [None]:
# vs_file_df.to_csv(path_or_buf='../vs_file_info_training_not_cleaned.csv', index=False)
# vs_file_df_cleaned.to_csv(path_or_buf='../vs_file_info_training_cleaned.csv', index=False)
# vs_ids_training_type_df.to_csv(path_or_buf='../vs_ids_training_type_df.csv', index=False)
# vs_test_df.to_csv(path_or_buf='../vs_test_responses_df_with_outliers.csv', index=False)
# vs_test_df_cleaned.to_csv(path_or_buf='../vs_test_responses_df_without_outliers.csv', index=False)


# 5. Process mood (PANAS/HAS) responses

In [None]:
# read the list of mood JSON file paths, one path per line
# (the context manager closes the file even if reading fails)
with open("../mood_files.txt", "r") as mood_txt:
    mood_content = mood_txt.read()

# drop blank lines (e.g. the empty entry produced by a trailing newline), which would otherwise
# be passed to open() as an empty file name when the mood files are read
mood_files = [mood_file_name for mood_file_name in mood_content.splitlines() if mood_file_name.strip()]


### Create vars for PANAS questionnaire responses and for Happy/Anxious/Sad

- mood dataframe: lists file_name, ppt_id, time, and type for each file ('panas', 'has')
- PANAS var: struct with ppt_id, time and data for PANAS files only
- HAS var: struct with ppt_id, time and data for HAS files only

In [None]:
# accumulators filled while the mood files are sorted:
# file names per mood-file type ...
panas_files, has_files, other_mood_files = [], [], []
# ... participant ids ...
mood_ppts = []
# ... and one [ppt id, file name, time, type ('panas', 'has')] record per recognised file,
# later turned into a dataframe
mood_file_info = []


In [None]:
start_mood_file_sort = time.time()

# classify every mood file by the number of items in its 'm_d' list:
# 20 items -> PANAS, 3 items -> HAS, anything else -> other (not recorded in mood_file_info)
for mood_path in mood_files:

    with open(mood_path) as json_file:
        mood_data = json.load(json_file)

    mood_ppt_id = mood_data['bbuid']
    mood_time_file = str(mood_data['time'])
    num_mood_items = len(mood_data['data']['m_d'])

    if num_mood_items == 20:
        panas_files.append(mood_path)
        mood_file_info.append([mood_ppt_id, mood_path, mood_time_file, 'panas'])
    elif num_mood_items == 3:
        has_files.append(mood_path)
        mood_file_info.append([mood_ppt_id, mood_path, mood_time_file, 'has'])
    else:
        other_mood_files.append(mood_path)

end_mood_file_sort = time.time()


In [None]:
# time spent reading and classifying the mood files, formatted as hours:minutes:seconds
elapsed_mood_file_sort = end_mood_file_sort - start_mood_file_sort
mood_file_sort_hms = time.strftime("%H:%M:%S", time.gmtime(elapsed_mood_file_sort))
print('Time to read and sort mood files in %H:%M:%S format: ', mood_file_sort_hms)

# number of files listed vs. number classified into each type; the last line should match the first
print('Number of mood files: ', len(mood_files))
print('Number of panas files: ', len(panas_files))
print('Number of has files: ', len(has_files))
print('Number of other files: ', len(other_mood_files))
print('Total number of mood files: ', sum(len(file_group) for file_group in (has_files, panas_files, other_mood_files)))


In [None]:
# turn the per-file records into a dataframe (adds cleaned time, date and rolling-count columns)
mood_file_df = file_info_to_dataframe(mood_file_info, 'mood')

# keep identifying columns first, then the time-related columns, then the rolling count
mood_column_order = ['ppt_id', 'mood_file_name', 'file_type', 'time_info', 'time_cleaned', 'time_date', 'rolling_count_ppt']
mood_file_df = mood_file_df[mood_column_order]

# time_info was stored as a string; convert it to a number so it can be compared to cut-off times
mood_file_df['time_info'] = pd.to_numeric(mood_file_df['time_info'])


In [None]:
# list each participant id in mood_file_df once, in order of first appearance
unique_mood_ppt_ids = mood_file_df['ppt_id'].drop_duplicates()
mood_ppts = unique_mood_ppt_ids.to_list()
print('Number of unique ppts with mood responses: ', len(mood_ppts))


In [None]:
# Remove files from ppts who did more than 1 type of training

# make sure time_info is numeric before comparing it with the cut-off times
mood_file_df['time_info'] = pd.to_numeric(mood_file_df['time_info'])

# per-participant cut-off times (same values as in the DOT/VS cleaning steps above):
# rows at or after the cut-off are discarded for that participant
mood_cutoff_time_by_ppt = {
    'AAlglYcdjt': 1496548054352,
    'gnUnXlzcJP': 1501311001449,
    'NfScLwnQSr': 1496686814167,
}

mood_file_df_cleaned = mood_file_df
for cutoff_ppt_id, cutoff_time in mood_cutoff_time_by_ppt.items():
    is_after_cutoff = (mood_file_df_cleaned['ppt_id'] == cutoff_ppt_id) & (mood_file_df_cleaned['time_info'] >= cutoff_time)
    mood_file_df_cleaned = mood_file_df_cleaned.loc[~is_after_cutoff, :]

print('Number of rows, before removing participants with both active and sham training on any task: ', len(mood_file_df))
print('Number of rows, after removing participants with both active and sham training on any task: ', len(mood_file_df_cleaned))


In [None]:
# # visualise first few rows of dataframe
# mood_file_df.head()


In [None]:
# PANAS-specific dataframe: keep only the PANAS rows, order them by file name, renumber the
# index, and drop rolling_count_ppt (not useful for this dataframe)
is_panas_row = mood_file_df_cleaned['file_type'] == "panas"

panas_rows_df = (
    mood_file_df_cleaned[is_panas_row]
    .sort_values(['mood_file_name'], ascending=[True])
    .reset_index(drop=True)
    .drop(columns=['rolling_count_ppt'])
)


In [None]:
# count the distinct participants that have at least one PANAS row
panas_report_ppt_ids = panas_rows_df['ppt_id']
print('Number of unique patients with PANAS responses: ', panas_report_ppt_ids.nunique())


In [None]:
# HAS-specific dataframe: the rows of the cleaned mood dataframe whose file type is 'has'
is_has_row = mood_file_df_cleaned['file_type'] == "has"
has_rows_df = mood_file_df_cleaned[is_has_row]

has_report_ppt_ids = has_rows_df['ppt_id']
print('Number of unique patients with HAS responses: ', has_report_ppt_ids.nunique())


In [None]:
# distinct participant ids per questionnaire type
unique_panas_ppt_ids = set(panas_rows_df['ppt_id'].unique())
unique_has_ppt_ids = set(has_rows_df['ppt_id'].unique())

# participants who answered both PANAS and HAS
mood_commonalities = unique_panas_ppt_ids & unique_has_ppt_ids
print('Number of unique ppts with both panas and has responses: ', len(mood_commonalities))

# participants who answered HAS only
has_not_panas_ids = unique_has_ppt_ids.difference(unique_panas_ppt_ids)
print('Number of unique ppts with has and not panas responses: ', len(has_not_panas_ids))

# participants who answered PANAS only
panas_not_has_ids = unique_panas_ppt_ids.difference(unique_has_ppt_ids)
print('Number of unique ppts with panas and not has responses: ', len(panas_not_has_ids))

# the three groups above do not overlap, so their sizes add up to everyone with a mood response
print('Number of unique ppts with any mood responses: ',
      sum(len(ppt_group) for ppt_group in (mood_commonalities, has_not_panas_ids, panas_not_has_ids)))


### Add columns to PANAS-specific dataframe

In [None]:
# Add days_elapsed_from_first_report column: time between each PANAS report's date and the date
# of the ppt's first PANAS report (a timedelta column)

panas_rows_df['time_date'] = pd.to_datetime(panas_rows_df['time_date'])

start_panas_days_elapsed_calc = time.time()

# transform('first') broadcasts each ppt's first (non-null) report date back onto that ppt's rows,
# keeping the original row index. The previous groupby(...).apply(lambda x: x - x.iloc[0]) returns
# a result with an extra ppt_id index level on pandas >= 2.0, which assign() cannot align.
first_panas_report_date_by_row = panas_rows_df.groupby('ppt_id')['time_date'].transform('first')
panas_rows_df = panas_rows_df.assign(
    days_elapsed_from_first_report=panas_rows_df['time_date'] - first_panas_report_date_by_row
)

end_panas_days_elapsed_calc = time.time()

elapsed_panas_days_elapsed_calc_time = end_panas_days_elapsed_calc - start_panas_days_elapsed_calc

print('Time to calculate days elapsed from first PANAS report files for each ppt in %H:%M:%S format: ', 
      time.strftime("%H:%M:%S", time.gmtime(elapsed_panas_days_elapsed_calc_time)))


In [None]:
# group the PANAS rows by participant once and derive both counters from that grouping
panas_reports_by_ppt = panas_rows_df.groupby('ppt_id')

# number of PANAS reports the participant has, repeated on each of their rows
panas_total_count = panas_reports_by_ppt['ppt_id'].transform('count')
# 1-based position of each report within its participant's reports (in current row order)
panas_rolling_count = panas_reports_by_ppt.cumcount() + 1

panas_rows_df['total_panas_count_by_ppt'] = panas_total_count
panas_rows_df['panas_rolling_count_ppt'] = panas_rolling_count


### Extract responses from panas and has json files

In [None]:
# collect one [file name, question, answer] record for each of the 20 items in every PANAS file;
# these records are later pivoted and merged into panas_rows_df
panas_q_a = []

for panas_path in panas_files:
    with open(panas_path) as json_file:
        data = json.load(json_file)

    panas_items = data['data']['m_d']
    panas_q_a.extend(
        [panas_path, panas_items[item_no]['q'], panas_items[item_no]['a']]
        for item_no in range(20)
    )


In [None]:
# long-format dataframe: one row per (file, question) with the answer given
panas_q_a_columns = ['mood_file_name', 'question', 'answer']
panas_q_a_df = pd.DataFrame(panas_q_a, columns=panas_q_a_columns)


In [None]:
# long -> wide: one row per mood file, one column per question, cells hold the answers
pivot_panas_q_a_df = panas_q_a_df.pivot(
    index='mood_file_name',
    columns='question',
    values='answer',
)


In [None]:
# attach the per-question answer columns to the PANAS rows, matching on the mood file name
panas_rows_df = panas_rows_df.merge(pivot_panas_q_a_df, on='mood_file_name')


### Turn responses into numbers: 

* 1 = Not At All, 
* 2 = A Little, 
* 3 = Moderately, 
* 4 = Quite A Bit, 
* 5 = Extremely


In [None]:
# 1-5 scale for the text answers
answer_mapping = {
    'Not At All': 1,
    'A Little': 2,
    'Moderately': 3,
    'Quite A Bit': 4,
    'Extremely': 5,
}
# swap every cell that is one of the answer texts for its number; all other cells are left as they are
panas_rows_values_df = panas_rows_df.applymap(lambda cell: answer_mapping.get(cell, cell))


### Add compound PA and NA, and overall mood scores for each mood file

In [None]:
# the ten items summed into the PA score and the ten summed into the NA score
pa_items = ['Interested', 'Excited', 'Strong', 'Enthusiastic', 'Proud',
            'Alert', 'Inspired', 'Determined', 'Attentive', 'Active']
na_items = ['Distressed', 'Upset', 'Guilty', 'Scared', 'Hostile',
            'Irritable', 'Ashamed', 'Nervous', 'Jittery', 'Afraid']

# PA: add the item columns one after another, in the listed order
pa_total = panas_rows_values_df[pa_items[0]]
for pa_item in pa_items[1:]:
    pa_total = pa_total + panas_rows_values_df[pa_item]
panas_rows_values_df['PA'] = pa_total

# NA: same procedure for the NA items
na_total = panas_rows_values_df[na_items[0]]
for na_item in na_items[1:]:
    na_total = na_total + panas_rows_values_df[na_item]
panas_rows_values_df['NA'] = na_total

# overall mood score for each mood file: PA minus NA
panas_rows_values_df['mood'] = panas_rows_values_df['PA'] - panas_rows_values_df['NA']


In [None]:
# # visualise PANAS dataframe with quantities that can be used for analyses
# panas_rows_values_df.head()


### Add DOT and VS training type to the PANAS dataframe

In [None]:
# read in DOT and VS ppt IDs by training type CSVs
# (the VS table is used elsewhere in this notebook as a ppt_id -> training_type lookup;
#  presumably the DOT table has the same layout - confirm against the DOT section)
dot_ids_training_type_df = pd.read_csv('../dot_ids_training_type_df.csv')
vs_ids_training_type_df = pd.read_csv('../vs_ids_training_type_df.csv')


In [None]:
# Apply the helper add_game_training_type_to_mood_dfs (defined elsewhere in this notebook) once
# per game: first with the DOT table, then feeding that result back in with the VS table.
# The cells below read 'dot_training_type' and 'vs_training_type' from the result, so the helper
# presumably adds a '<game>_training_type' column per call - confirm against its definition.
panas_rows_values_by_training_df = add_game_training_type_to_mood_dfs(panas_rows_values_df, dot_ids_training_type_df, 'dot')
panas_rows_values_by_training_df = add_game_training_type_to_mood_dfs(panas_rows_values_by_training_df, vs_ids_training_type_df, 'vs')


In [None]:
# Check for patients with both VS and DOT training:
# a row counts as "trained" on a task when its training type contains 'active' or 'sham'
has_dot_training = panas_rows_values_by_training_df['dot_training_type'].str.contains('active|sham', regex=True)
has_vs_training = panas_rows_values_by_training_df['vs_training_type'].str.contains('active|sham', regex=True)

panas_training_overlap = panas_rows_values_by_training_df.loc[has_dot_training & has_vs_training, :]

print('Number of ppts with both VS and DOT training: ', panas_training_overlap['ppt_id'].nunique())
# print('ppt_ids of ppts with both VS and DOT training: ', panas_training_overlap.ppt_id.unique())



In [None]:
# Inspect ppt's DOT and VS data to find which training type was conducted first, and the cutoff for excluding data

# read in csvs (the cleaned DOT and VS file-info tables saved by the earlier sections)
dot_file_info_training_cleaned = pd.read_csv('../dot_file_info_training_cleaned.csv')
vs_file_info_training_cleaned = pd.read_csv('../vs_file_info_training_cleaned.csv')

# inspect all rows for this ppt
# (manual step: uncomment to list every DOT / VS file of participant 'PXpDwlCoZL';
#  the cut-off time used in the next cell was read off this output)
# print(dot_file_info_training_cleaned[dot_file_info_training_cleaned['ppt_id'].str.match('PXpDwlCoZL')]) # inspect images
# print(vs_file_info_training_cleaned[vs_file_info_training_cleaned['ppt_id'].str.match('PXpDwlCoZL')]) # inspect images


In [None]:
# for ppt 'PXpDwlCoZL', delete from this time onwards: '1516531705886', as the ppt then started
# training for the alternative task
is_pxp_after_cutoff = ((panas_rows_values_by_training_df['ppt_id'] == 'PXpDwlCoZL')
                       & (panas_rows_values_by_training_df['time_info'] >= 1516531705886))

# take an explicit copy of the filtered rows so that the edits below modify an independent
# dataframe rather than a slice of panas_rows_values_by_training_df (avoids pandas'
# SettingWithCopyWarning / chained-assignment ambiguity), then renumber the index
panas_rows_values_by_training_df_cleaned = panas_rows_values_by_training_df.loc[~is_pxp_after_cutoff, :].copy()
panas_rows_values_by_training_df_cleaned = panas_rows_values_by_training_df_cleaned.reset_index(drop=True)

# replace the vs_training_type column for this patient with 'no-training', as they did not do VS training in data considered
panas_rows_values_by_training_df_cleaned.loc[(panas_rows_values_by_training_df_cleaned['ppt_id'] == 'PXpDwlCoZL'), 'vs_training_type'] = 'no-training'

# NOTE(review): the messages below say "active and sham training", but this cell removes the rows of
# a participant who trained on both the DOT and VS tasks - consider rewording
print('Number of rows, before removing participants with both active and sham training on any task: ', len(panas_rows_values_by_training_df))
print('Number of rows, after removing participants with both active and sham training on any task: ', len(panas_rows_values_by_training_df_cleaned))


In [None]:
# Count distinct participants with PANAS reports in each training category.
panas_count_ppt_ids = panas_rows_values_by_training_df_cleaned['ppt_id']
panas_count_dot_type = panas_rows_values_by_training_df_cleaned['dot_training_type']
panas_count_vs_type = panas_rows_values_by_training_df_cleaned['vs_training_type']

num_panas_dot_active = panas_count_ppt_ids[panas_count_dot_type == "active-training"].nunique()
num_panas_dot_sham = panas_count_ppt_ids[panas_count_dot_type == "sham-training"].nunique()

num_panas_vs_active = panas_count_ppt_ids[panas_count_vs_type == "active-training"].nunique()
num_panas_vs_sham = panas_count_ppt_ids[panas_count_vs_type == "sham-training"].nunique()

num_any_panas = panas_count_ppt_ids.nunique()

# Count the "no training" group directly: participants with no active/sham label on either task.
# Subtracting the four category counts from the total would subtract a participant twice if they
# appear in more than one category (e.g. trained on both DOT and VS).
panas_trained_labels = ["active-training", "sham-training"]
panas_row_has_training = panas_count_dot_type.isin(panas_trained_labels) | panas_count_vs_type.isin(panas_trained_labels)
num_panas_no_training = num_any_panas - panas_count_ppt_ids[panas_row_has_training].nunique()

print('# unique ppts with PANAS reports who did 1+ DOT active training session: ', num_panas_dot_active)
print('# unique ppts with PANAS reports who did 1+ DOT sham training session: ', num_panas_dot_sham)
print('# unique ppts with PANAS reports who did 1+ VS active training session: ', num_panas_vs_active)
print('# unique ppts with PANAS reports who did 1+ VS sham training session: ', num_panas_vs_sham)
print('# unique ppts with PANAS reports without any training sessions: ', num_panas_no_training)
print('# unique ppts with PANAS reports, regardless of training: ', num_any_panas)


In [None]:
# Count PANAS reports (rows with a mood file name) in each training category.
panas_report_file_names = panas_rows_values_by_training_df_cleaned['mood_file_name']
panas_report_dot_type = panas_rows_values_by_training_df_cleaned['dot_training_type']
panas_report_vs_type = panas_rows_values_by_training_df_cleaned['vs_training_type']

num_panas_dot_active_reports = panas_report_file_names[panas_report_dot_type == "active-training"].count()
num_panas_dot_sham_reports = panas_report_file_names[panas_report_dot_type == "sham-training"].count()

num_panas_vs_active_reports = panas_report_file_names[panas_report_vs_type == "active-training"].count()
num_panas_vs_sham_reports = panas_report_file_names[panas_report_vs_type == "sham-training"].count()

num_any_panas_reports = panas_report_file_names.count()

# Count the "no training" reports directly: rows with no active/sham label on either task.
# Subtracting the four category counts from the total would subtract a report twice if its
# participant carries a training label for both DOT and VS.
panas_report_trained_labels = ["active-training", "sham-training"]
panas_report_has_training = (panas_report_dot_type.isin(panas_report_trained_labels)
                             | panas_report_vs_type.isin(panas_report_trained_labels))
num_panas_no_training_reports = num_any_panas_reports - panas_report_file_names[panas_report_has_training].count()

print('# PANAS reports for ppts who did 1+ DOT active training session: ', num_panas_dot_active_reports)
print('# PANAS reports for ppts who did 1+ DOT sham training session: ', num_panas_dot_sham_reports)
print('# PANAS reports for ppts who did 1+ VS active training session: ', num_panas_vs_active_reports)
print('# PANAS reports for ppts who did 1+ VS sham training session: ', num_panas_vs_sham_reports)
print('# PANAS reports for ppts without any training sessions: ', num_panas_no_training_reports)
print('# PANAS reports, regardless of training: ', num_any_panas_reports)


In [None]:
# Pass the cleaned PANAS dataframe through mood_overall_training_category (defined elsewhere in
# this notebook) and keep its result. A later section selects an 'overall_training_type' column
# from the saved version of this dataframe, so the helper presumably adds that column - confirm
# against its definition.
panas_rows_values_by_training_df_cleaned = mood_overall_training_category(panas_rows_values_by_training_df_cleaned)


In [None]:
# Visualise outputs of dataframe, prior to analysis: show the first rows of the cleaned PANAS
# dataframe (scores plus training-type columns)
panas_rows_values_by_training_df_cleaned.head()


### Save mood outputs as CSVs

In [None]:
# panas_rows_values_df.to_csv(path_or_buf='../panas_rows_values_df_not_cleaned.csv', index=False)
# panas_rows_values_by_training_df_cleaned.to_csv(path_or_buf='../panas_rows_values_by_training_df_cleaned.csv', index=False)

# mood_file_df.to_csv(path_or_buf='../mood_file_df_not_cleaned.csv', index=False)
# mood_file_df_cleaned.to_csv(path_or_buf='../mood_file_df_cleaned.csv', index=False)

# write the HAS and PANAS file-name lists to text files, one file name per line
for mood_list_path, mood_list in (("../has_files.txt", has_files), ("../panas_files.txt", panas_files)):
    with open(mood_list_path, "w") as outfile:
        outfile.write("\n".join(map(str, mood_list)))



## Prepare PANAS files for analysis and create plots for mood scores by training type and time

Create dataframes for analysis with 2 PANAS reports and 6 PANAS reports, separately


In [None]:
# read whole dataframe, with columns of interest
# NOTE(review): this reads from '~/bias_mod_analysis/', whereas the (commented-out) save of
# panas_rows_values_by_training_df_cleaned above writes to '../' - confirm both point at the same file
panas_df = pd.read_csv('~/bias_mod_analysis/panas_rows_values_by_training_df_cleaned.csv')


In [None]:
# keep, for every ppt with at least 2 PANAS reports, only their 1st and 2nd report,
# restricted to the columns needed for the analysis
has_two_or_more_reports = panas_df['total_panas_count_by_ppt'] >= 2
is_within_first_two_reports = panas_df['panas_rolling_count_ppt'] <= 2
two_panas_columns = ['ppt_id', 'mood', 'panas_rolling_count_ppt', 'overall_training_type']

two_panas_df = (
    panas_df
    .loc[has_two_or_more_reports & is_within_first_two_reports, two_panas_columns]
    .reset_index(drop=True)
)


In [None]:
# view first few rows of df (long format: one row per ppt per PANAS report)
two_panas_df.head()


In [None]:
# long -> wide for analysis: one row per (ppt, overall training type), one mood column per report
# number; then name the report columns and clear the leftover columns-axis name
two_panas_wide_df = (
    two_panas_df
    .pivot_table(index=['ppt_id', 'overall_training_type'],
                 columns='panas_rolling_count_ppt', values='mood')
    .reset_index()
    .rename(columns={1: "report_1", 2: "report_2"})
    .rename_axis(None, axis=1)
)


In [None]:
# view first few rows of df (wide format: one row per ppt, columns report_1 and report_2)
two_panas_wide_df.head()


In [None]:
# keep, for every ppt with at least 6 PANAS reports, their 1st through 6th reports,
# restricted to the columns needed for the analysis
has_six_or_more_reports = panas_df['total_panas_count_by_ppt'] >= 6
is_within_first_six_reports = panas_df['panas_rolling_count_ppt'] <= 6
six_panas_columns = ['ppt_id', 'mood', 'panas_rolling_count_ppt', 'overall_training_type']

six_panas_df = (
    panas_df
    .loc[has_six_or_more_reports & is_within_first_six_reports, six_panas_columns]
    .reset_index(drop=True)
)


In [None]:
# long -> wide for analysis: one row per (ppt, overall training type), one mood column per report
# number; then name the report columns and clear the leftover columns-axis name
six_report_column_names = {1: "report_1", 2: "report_2", 3: "report_3", 4: "report_4", 5: "report_5", 6: "report_6"}

six_panas_wide_df = (
    six_panas_df
    .pivot_table(index=['ppt_id', 'overall_training_type'],
                 columns='panas_rolling_count_ppt', values='mood')
    .reset_index()
    .rename(columns=six_report_column_names)
    .rename_axis(None, axis=1)
)


In [None]:
# write both wide-format dataframes to CSV files in the current working directory, without the index
for wide_csv_name, wide_df in (('two_panas_wide_df.csv', two_panas_wide_df),
                               ('six_panas_wide_df.csv', six_panas_wide_df)):
    wide_df.to_csv(path_or_buf=wide_csv_name, index=False)
