# PathFinding Data PreProcessing
* find subject data
* find experiment and eyetracking files
* combine experiment files for single subject to consolidate
* combine eyetracking files for single subject and pre-process
* performance analysis of PathFinding experiments

## Configuration

In [1]:
# General configuration
import os

# data_directory: str
#     Path to a directory to store data.
data_directory = '.'

# install_missing_packages: bool
#     A flag indicating if missing packages should be automatically installed
install_missing_packages = True

# use_conda: bool
#     A flag indicating if conda should be used for software installation.
#     If False, pip will be used. The default is to use conda if jupyter
#     is run in a conda environment.
use_conda = 'CONDA_EXE' in os.environ

## Checking for missing packages

In [2]:
import importlib

def check_package(package, pip_pkg: str = None, conda_pkg: str = None):
    """Check if a given package is installed. If missing install
    it (if global flag `install_missing_packages` is True) either with
    pip or with conda (depending on `use_conda`).
    """
    if importlib.util.find_spec(package) is not None:
        return  # ok, package is already installed

    if not install_missing_packages:
        raise RuntimeError(f"{package} is not installed!")

    if use_conda:
        import conda.cli
        conda.cli.main('conda', 'install',  '-y', conda_pkg or package)
    else:
        import subprocess
        import sys            
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pip_pkg or package])
        
# This is to exit cells without error tracebacks (cosmetic purpose)
class StopExecution(Exception):
    def _render_traceback_(self):
        pass

## Creating the required environment (skip if already done)

Running the following cell will create a file graphs.yml that can be used to setup a conda environment containing the required packages. If you already downloaded the file from my GitHub, skip the next cell and create the env directly from it.

In [3]:
%%writefile graphs.yml
name: graphs
channels:
  - conda-forge
  - defaults
dependencies:
  - python=3.6
  - jupyter
  - imageio
  - imageio-ffmpeg
  - matplotlib
  - scikit-image
  - opencv
  - networkx
  - pandas
  - statsmodels

Writing graphs.yml


### Environment Creation
To create the environment, open the terminal, go to the directory where you stored the graphs.yml file (the directory of the notebook) and type
conda env create -f graphs.yml
After running this command you have to activate the environment (Linux/MacOS: conda activate graphs, Windows: activate graphs) and then reopen the notebook in that environment.

## Main Part

### Imports and directory information

In [5]:
import os
import cv2
import json
import numpy as np
import re
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
import glob
import scipy.cluster.vq as clusters
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#from sklearn.preprocessing import normalize
from pandas.plotting import autocorrelation_plot as AC_plot 
from statsmodels.graphics import tsaplots
from statsmodels.tsa.stattools import acf
from skimage.filters import gaussian
from mpl_toolkits.mplot3d import Axes3D 
from matplotlib.colors import LinearSegmentedColormap
import time

In [6]:
OG_DATA_PATH = './'

condition = 'Single' # Single, Dyadic, SingleC
DATA_PATH = './Data {}/'.format(condition)
LAB_DATA_PATH = "C:/Users/experiment/Desktop/SpaReBuildsTesting/999_DO NOT USE_OLD_Builds_Data/4_Pathfinding_StudyProject/data/"
LAB_DATA_PATH += condition.lower() + '/';
DATA_PATH = LAB_DATA_PATH # Comment out when not in lab

# print(LAB_DATA_PATH);
RESSOURCES_PATH = './Ressources/'
#houselist 
house_file = RESSOURCES_PATH + 'building_collider_list.csv'
try:
    houselist = pd.read_csv(house_file)
except:
    print('HouseList could not be loaded!')

PROCESSED_DATA_PATH = './Results/' + condition + '/'
PROCESSED_DATA_FOLDER = sorted([f for f in os.listdir(PROCESSED_DATA_PATH) if not f.startswith('.')], key=str.lower)

# Extracting all subject IDs from the data folder
# Getting the Folder without hidden files in ascending order 
DATA_FOLDER = sorted([f for f in os.listdir(DATA_PATH) if not f.startswith('.')], key=str.lower)

subIDs = []
for sub in DATA_FOLDER:
    if sub[0].isdigit():
        subIDs.append(int(sub[0:4]))
    else:
        pass
subIDs = np.unique(subIDs)


#subIDs = [1023] # remove to do for all subIDs
print(subIDs)

[1004 1005 1008 1010 1011 1013 1017 1018 1019 1021 1022 1023 1054 1055
 1056 1057 1058 1068 1069 1072 1073 1074 1075 1077 1079 1080]


In [7]:
import random


# Store Time limits for each path condition
path_time_limits_A = [300, 305, 234, 332, 268, 394, 152, 383, 332];
target_house_names_A = ['windmill', 'book', 'shark', 'bear', 'no graffiti', \
                        'banque strechnitz', 'foxes', 'johny patisserie', \
                       'gorilla', 'remote house']

target_house_names_B = target_house_names_A.copy();
target_house_names_B.reverse();
path_time_limits_B = path_time_limits_A.copy();
path_time_limits_B.reverse();


# Leader : follower map
dyadic_pair_map = {
    '1021' : '1022',
    '1023' : '1013',
    '1005' : '1055',
    '1074' : '1069',
    '1008' : '1058',
    '1054' : '1004',
    '1011' : '1017',
    '1018' : '1057'
}

# Generate random session IDs to group leader and follower from same session easily in analysis
dyadic_session_ids = {}
for leader in dyadic_pair_map.keys():
    _id = random.randint(1000,9999);
    dyadic_session_ids[leader] = _id;
    dyadic_session_ids[dyadic_pair_map[leader]] = _id;
    
print(dyadic_session_ids);

{'1021': 7830, '1022': 7830, '1023': 4039, '1013': 4039, '1005': 7335, '1055': 7335, '1074': 6308, '1069': 6308, '1008': 7398, '1058': 7398, '1054': 4675, '1004': 4675, '1011': 9329, '1017': 9329, '1018': 5112, '1057': 5112}


<img src='Pics/paths.png'>

# Performance Analysis

In [16]:
# Columns for consolidated data from all experiment files
pA_col_names = ['SubjectID', 'Condition', 'Dyadic?', 'Leader?', 'SessionID'];
for i in range(1,10):
    path_prefix = "P"+str(i);
    
    pA_col_names.append(path_prefix + ":Success")
    pA_col_names.append(path_prefix + ":TimeSec")
    pA_col_names.append(path_prefix + ":Time After Guide")
    pA_col_names.append(path_prefix + ":Distance Covered")

pA_col_names.append("#Success");


In [17]:
# Function to take subject ID and get distance walked for each path from subject's eyetracking files
def get_covered_distances(subID, condition):
    # TODO: have to think about no-hits for this?
#     return [0] * 10;
    
    # Get the consolidated hitpoints file which has HMD position data for each hitpoint
    # If hitpoint file not found, return 0 distance for all paths
    PROCESSED_DATA_PATH = './Results/' + condition + '/'
    try:
        filename = PROCESSED_DATA_PATH + str(subID) + '_' + condition +"_CompleteHitpoints.csv";
        data = pd.read_csv(filename);
    except:
        print('Could not read ' + filename);
        return[0] * 10;

    # Column names for intermediate DF and final DF respectively
    condense_col_names = ['PathNumber', 'xsqDiff', 'zsqDiff', 'eucDist']
    covered_dist_col_names = ['eucDist'];
    
    # Remove all second ordinal hits (HMD position is NA in consolidated hitpoint file)
    data = data.dropna(subset=['hmdPosition.x','hmdPosition.y','hmdPosition.z']);
    data = data.reset_index(drop=True)
    data['hmdPosition.z'] = data['hmdPosition.z'].astype(float);
    
    # Create intermediate and Final DF respectively
    condense_df = pd.DataFrame(index=range(len(data)), columns=condense_col_names)
    cov_dist_df = pd.DataFrame(index=range(10), columns=covered_dist_col_names);
    
    # Transfer over path number as is
    condense_df.PathNumber = data.PathNumber
    
    # Get the difference in the X- and Z-coordinates between each consecutive point 
    # Y-coordinate corresponds to HMD height and is ignored for map-distance covered
    condense_df.xsqDiff = data['hmdPosition.x'] - data['hmdPosition.x'].shift();
    condense_df.zsqDiff = data['hmdPosition.z'] - data['hmdPosition.z'].shift();
    
    # Square the x- and z- differences between each consecutive point
    condense_df.xsqDiff = condense_df.xsqDiff ** 2;
    condense_df.zsqDiff = condense_df.zsqDiff ** 2;
    
    # add the squared distances and take square root to get euclidean distance
    #    between each consecutive point
    condense_df.eucDist = (condense_df.xsqDiff + condense_df.zsqDiff) ** 0.5;
    
    # sum up the total euclidean distance grouped by path number to get distance 
    #    covered for each path
    cov_dist_df['eucDist'] = condense_df.groupby(['PathNumber'])['eucDist'].sum()
    
    # Return distances as list
    return list(cov_dist_df.eucDist)

get_covered_distances('1023', 'Single')


[26.215299149971234,
 21393.52049688685,
 294.73202812801316,
 801.497957145294,
 20619.802340090715,
 1049.87276946605,
 1964.5247858188238,
 207.17994739139058,
 2941.0415830954616,
 714.6703888026038]

In [18]:
# Create analysis data frame
perf_anal_df = pd.DataFrame(columns=pA_col_names);

# Loop through all subjects
for subject in subIDs:
    
    # Get distance covered per path for given subject
    covered_distances = get_covered_distances(subject, condition);
    
    # Read the experiment data for the subject
    sub_expt_data = {}
    sub_expt_file_name = PROCESSED_DATA_PATH + str(subject) + "_"+condition+"Pathfinding_Final.json";
    try:
        with open(sub_expt_file_name, 'r') as fp:
            sub_expt_data = json.load(fp);
    except:
        print("Could not read experiment file for subject " + str(subject));
        continue;
    
    print("Running for subject " + str(subject))
    # Create the initial dictionary to store the experiment performance information for current subject 
    sub_per_data = {'SubjectID' : sub_expt_data['ParticipantID'],
                    'Condition' : 'B' if sub_expt_data['PathsReversed'] else 'A',
                    'Dyadic?' : sub_expt_data['IsDyadic'],
                    'Leader?' : sub_expt_data['IsLeader']
                   };
    
    # Store randomly generated (see cell somewher above) session ID if dyadic
    if sub_per_data['Dyadic?']: 
        sub_per_data['SessionID'] = dyadic_session_ids[sub_per_data['SubjectID']];
    else:
        sub_per_data['SessionID'] = 'NaN';
    
    # Get the path time limits for the current condition
    path_time_limits = path_time_limits_B if sub_expt_data['PathsReversed'] else path_time_limits_A;
    num_success = 0;
    
    # for each path (ignoring the tutorial path 0)
    for i in range(1,10):
        # Get the trial data from the experiment file for the current path
        current_trial_data = sub_expt_data['TrialData'][i];
        
        # Get the time for the path by subtracting start time from end time
        sub_per_data["P"+str(i)+":TimeSec"] = current_trial_data['EndMachineTimeStamp'] \
                                                - current_trial_data['StartMachineTimeStamp'];
        
        # Path was a success for subject if they did not use the path guide
        sub_per_data["P"+str(i)+":Success"] = not current_trial_data['GuideUsed'];
        
        # Count number of successes
        if sub_per_data["P"+str(i)+":Success"]: 
            num_success += 1;
        
        # Get the time after the guide appeared
        sub_per_data["P"+str(i)+":Time After Guide"] = sub_per_data["P"+str(i)+":TimeSec"] - path_time_limits[i-1];
        
        # Store the distance covered for the path
        sub_per_data["P"+str(i)+":Distance Covered"] = covered_distances[i];
    
    # Store the number of successes
    sub_per_data['#Success'] = num_success;
    
    # Normalize collected data and append to dataframe
    sub_df = pd.json_normalize(sub_per_data);
    perf_anal_df = perf_anal_df.append(sub_df);

print('Analysis Complete');

Running for subject 1004
Running for subject 1005
Running for subject 1008
Running for subject 1010
Running for subject 1011
Running for subject 1013
Running for subject 1017
Running for subject 1018
Running for subject 1019
Running for subject 1021
Running for subject 1022
Running for subject 1023
Running for subject 1054
Running for subject 1055
Running for subject 1056
Running for subject 1057
Running for subject 1058
Running for subject 1068
Running for subject 1069
Running for subject 1072
Running for subject 1073
Running for subject 1074
Running for subject 1075
Running for subject 1077
Running for subject 1079
Running for subject 1080
Analysis Complete


In [122]:
# perf_anal_df

In [19]:
# Save to Single
perf_anal_df.to_csv(PROCESSED_DATA_PATH + 'Single_Performance_Analysis.csv', index=False);


In [55]:
# Save to Dyadic
perf_anal_df.to_csv(PROCESSED_DATA_PATH + 'Dyadic_Performance_Analysis.csv', index=False);


In [60]:
# Save to SingleC
perf_anal_df.to_csv(PROCESSED_DATA_PATH + 'SingleC_Performance_Analysis.csv', index=False);


In [20]:
# Display Collected Single Experiment Data
anal = pd.read_csv('./Results/Single/Single_Performance_Analysis.csv');
anal

Unnamed: 0,SubjectID,Condition,Dyadic?,Leader?,SessionID,P1:Success,P1:TimeSec,P1:Time After Guide,P1:Distance Covered,P2:Success,...,P7:Distance Covered,P8:Success,P8:TimeSec,P8:Time After Guide,P8:Distance Covered,P9:Success,P9:TimeSec,P9:Time After Guide,P9:Distance Covered,#Success
0,1004,B,False,False,,False,449.037666,117.037666,56648.457167,False,...,298.541384,False,390.436826,85.436826,35230.016228,True,113.507971,-186.492029,54361.765631,3
1,1005,A,False,False,,True,158.458649,-141.541351,28587.683388,True,...,12079.604125,True,182.991541,-200.008459,1259.570722,False,485.03952,153.03952,34112.306893,6
2,1008,B,False,False,,True,89.310862,-242.689138,811.731335,True,...,260.182787,True,90.587071,-214.412929,8067.763892,True,106.655506,-193.344494,12864.381295,8
3,1010,B,False,False,,False,602.11131,270.11131,50485.005781,True,...,1241.421129,True,162.011336,-142.988664,19731.738378,True,196.642525,-103.357475,204221.447502,7
4,1011,B,False,False,,True,102.925068,-229.074932,1938.900332,True,...,168.579552,False,368.540978,63.540978,38009.407203,True,217.615655,-82.384345,89012.480175,7
5,1013,A,False,False,,True,125.361833,-174.638167,7833.514987,True,...,341.004818,True,136.801561,-246.198439,10660.00999,False,438.497586,106.497586,46103.094697,8
6,1017,B,False,False,,True,261.689323,-70.310677,8249.317933,False,...,183.80275,False,432.588424,127.588424,13942.895589,True,289.393871,-10.606129,145367.606985,4
7,1018,B,False,False,,True,165.25444,-166.74556,1809.999763,True,...,168.51043,False,407.066795,102.066795,9939.922518,True,273.094684,-26.905316,133054.930744,7
8,1019,A,False,False,,True,100.74939,-199.25061,1711.418757,True,...,2014.464753,True,114.624974,-268.375026,3469.824522,False,470.427692,138.427692,110883.031111,7
9,1021,A,False,False,,False,398.568231,98.568231,21585.48253,True,...,895.43847,True,118.187025,-264.812975,1649.844078,False,447.441651,115.441651,59983.528064,6


In [119]:
# Display Collected Dyadic Experiment Data
anal = pd.read_csv('./Results/Dyadic/Dyadic_Performance_Analysis.csv');
anal

Unnamed: 0,SubjectID,Condition,Dyadic?,Leader?,SessionID,P1:Success,P1:TimeSec,P1:Time After Guide,P1:Distance Covered,P2:Success,...,P7:Distance Covered,P8:Success,P8:TimeSec,P8:Time After Guide,P8:Distance Covered,P9:Success,P9:TimeSec,P9:Time After Guide,P9:Distance Covered,#Success
0,1004,A,True,False,5575,True,187.532198,-112.467802,0,True,...,0,True,111.899616,-271.100384,0,True,119.37781,-212.62219,0,9
1,1005,B,True,True,5361,True,159.901736,-172.098264,0,True,...,0,True,105.293208,-199.706792,0,True,173.905779,-126.094221,0,8
2,1008,A,True,True,4957,True,117.755503,-182.244497,0,True,...,0,True,117.429136,-265.570864,0,True,258.906364,-73.093636,0,8
3,1011,A,True,True,1300,True,134.858309,-165.141691,0,True,...,0,True,119.176291,-263.823709,0,True,290.754437,-41.245563,0,7
4,1013,B,True,False,5062,True,99.181707,-232.818293,0,True,...,0,True,104.240194,-200.759806,0,True,97.303046,-202.696954,0,9
5,1017,A,True,False,1300,True,136.811322,-163.188678,0,True,...,0,True,122.440699,-260.559301,0,True,292.018808,-39.981192,0,9
6,1018,A,True,True,5128,True,263.787934,-36.212066,0,True,...,0,True,332.989794,-50.010206,0,True,111.840953,-220.159047,0,9
7,1021,B,True,True,7274,True,101.885863,-230.114137,0,True,...,0,True,62.674078,-242.325922,0,True,100.308087,-199.691913,0,9
8,1022,B,True,False,7274,True,103.525226,-228.474774,0,True,...,0,True,63.827329,-241.172671,0,True,101.237176,-198.762824,0,9
9,1023,B,True,True,5062,True,96.119958,-235.880042,0,True,...,0,True,103.161679,-201.838321,0,True,95.697365,-204.302635,0,9


In [120]:
# Display Collected Single Control Experiment Data
anal = pd.read_csv('./Results/SingleC/SingleC_Performance_Analysis.csv');
anal

Unnamed: 0,SubjectID,Condition,Dyadic?,Leader?,SessionID,P1:Success,P1:TimeSec,P1:Time After Guide,P1:Distance Covered,P2:Success,...,P7:Distance Covered,P8:Success,P8:TimeSec,P8:Time After Guide,P8:Distance Covered,P9:Success,P9:TimeSec,P9:Time After Guide,P9:Distance Covered,#Success
0,1010,A,False,False,,True,103.283452,-196.716548,0,True,...,0,True,149.935175,-233.064825,0,False,349.954884,17.954884,0,7
1,1019,B,False,False,,True,121.585427,-210.414573,0,True,...,0,True,62.756872,-242.243128,0,True,126.389681,-173.610319,0,9
2,1056,B,False,False,,True,99.185621,-232.814379,0,True,...,0,False,376.036466,71.036466,0,True,101.787637,-198.212363,0,7
3,1068,A,False,False,,True,129.268349,-170.731651,0,True,...,0,True,117.311292,-265.688708,0,False,474.19772,142.19772,0,7
4,1072,A,False,False,,True,106.815908,-193.184092,0,True,...,0,True,113.280268,-269.719732,0,True,150.568494,-181.431506,0,8
5,1073,A,False,False,,False,352.014228,52.014228,0,True,...,0,True,175.417123,-207.582877,0,False,454.078002,122.078002,0,5
6,1075,B,False,False,,True,96.997154,-235.002846,0,True,...,0,True,58.880095,-246.119905,0,True,97.31509,-202.68491,0,9
7,1077,A,False,False,,True,102.892725,-197.107275,0,True,...,0,True,211.720585,-171.279415,0,True,100.524325,-231.475675,0,7
8,1079,B,False,False,,True,294.374525,-37.625475,0,True,...,0,False,345.073664,40.073664,0,True,274.738379,-25.261621,0,7
9,1080,A,False,False,,True,137.255427,-162.744573,0,True,...,0,True,125.162962,-257.837038,0,False,432.332893,100.332893,0,6


### Single Pathfinding Statistics for Participants

### Single Pathfinding Descriptive Statistics for Paths

In [29]:

# Per Condition, Per Path: Average Time, SD Time, Average Success, Average Time after Fail
# Per Condition Success Average

# Columns for the descriptive statistics that we want to consider
path_anal_cols = ['PathNum', 'Target', '+ Rate', 'Limit', 'Avg. T (+)', 'SD T (+)', 'Avg. T (after -)', \
                  'SD T (after -)', 'Avg D (+)', 'SD D (+)', 'Avg S (+)', 'SD S (+)']

# Read single pathfinding performance data
anal = pd.read_csv('./Results/Single/Single_Performance_Analysis.csv');

# Separate into Conditions A and B
A_anal = anal[anal.Condition == 'A']
B_anal = anal[anal.Condition == 'B']

# Get average success rate and std across participants for both conditions
A_succ_avg = A_anal['#Success'].mean() 
A_anal_succ_std  = A_anal['#Success'].std()
B_succ_avg = B_anal['#Success'].mean()
B_anal_succ_std  = B_anal['#Success'].std()


# Starting extraction of statistics for condition A

# Filter information from the dataframe that pertains to path performance (1-9)
A_paths = A_anal.filter(regex="P.*")
A_path_stats = pd.DataFrame(columns = path_anal_cols);

curr_row = {};
successed = [];
fails = []
times = []

# Iterate through all the columns containing path information
# Each path has 4 columns
for idx, column in enumerate(A_paths):
    coltype = idx % 4;
    # Store the path number and the corresponding time limit
    curr_row['PathNum'] = int(idx / 4) + 1;
    curr_row['Target'] = target_house_names_A[curr_row['PathNum']];
    curr_row['Limit'] = path_time_limits_A[curr_row['PathNum'] - 1]
    
    
    # First column describes path success or not. Calculate success rate.
    if coltype == 0:
        successes = A_paths[column];
        fails = ~successes;
        curr_row['+ Rate'] = successes.sum() / successes.count();
        
    # Second column describes time taken. Compute average and SD time for all succesful paths
    elif coltype == 1:
        times = A_paths[column]
        curr_row['Avg. T (+)'] = A_paths[column][successes].mean();
        curr_row['SD T (+)'] = A_paths[column][successes].std();
        
    # Third column describes time taken after path time limit. Compute avg and SD for all failed paths
    elif coltype == 2:
        curr_row['Avg. T (after -)'] = A_paths[column][fails].mean();
        curr_row['SD T (after -)'] = A_paths[column][fails].std();
    
    # Fourth column describes distance covered. Compute avg and SD for all successes
    elif coltype == 3:
        curr_row['Avg D (+)'] = A_paths[column][successes].mean();
        curr_row['SD D (+)'] = A_paths[column][successes].std();
        
        curr_row['Avg S (+)'] = (A_paths[column][successes] / times[successes]).mean();
        curr_row['SD S (+)'] = (A_paths[column][successes] / times[successes]).std();
        
        A_path_stats = A_path_stats.append(curr_row, ignore_index=True)
        curr_row = {}

A_path_stats["Limit"] = A_path_stats["Limit"].astype(float)
# Save to CSV
A_path_stats.to_csv(PROCESSED_DATA_PATH + 'A_Path_Stats.csv');


# Starting extraction of descriptive statistics for paths in condition B
# Procedure is exactly the same as above, refer to above comments. 

B_paths = B_anal.filter(regex="P.*")
B_path_stats = pd.DataFrame(columns = path_anal_cols);

curr_row = {};
successed = [];
fails = []

for idx, column in enumerate(B_paths):
    coltype = idx % 4;
    curr_row['PathNum'] = int(idx / 4) + 1;
    curr_row['Target'] = target_house_names_B[curr_row['PathNum']];
    curr_row['Limit'] = path_time_limits_B[curr_row['PathNum'] - 1]
    if coltype == 0:
        successes = B_paths[column];
        fails = ~successes;
        curr_row['+ Rate'] = successes.sum() / successes.count();
    elif coltype == 1:
        times = B_paths[column]
        curr_row['Avg. T (+)'] = B_paths[column][successes].mean();
        curr_row['SD T (+)'] = B_paths[column][successes].std();
    elif coltype == 2:
        curr_row['Avg. T (after -)'] = B_paths[column][fails].mean();
        curr_row['SD T (after -)'] = B_paths[column][fails].std();
    elif coltype == 3:
        curr_row['Avg D (+)'] = B_paths[column][successes].mean();
        curr_row['SD D (+)'] = B_paths[column][successes].std();
        
        curr_row['Avg S (+)'] = (B_paths[column][successes] / times[successes]).mean();
        curr_row['SD S (+)'] = (B_paths[column][successes] / times[successes]).std();
        
        B_path_stats = B_path_stats.append(curr_row, ignore_index=True)
        curr_row = {}
B_path_stats["Limit"] = B_path_stats["Limit"].astype(float)
B_path_stats.to_csv(PROCESSED_DATA_PATH + 'B_Path_Stats.csv');

In [30]:
print('Condition A:')
print("\tNumber of subjects: " + str(len(A_anal)))
print('\tAvg Successes: ' + str(A_succ_avg));
print('\tSD Successes: ' + str(A_anal_succ_std));

corr_ = A_path_stats['+ Rate'].corr(A_path_stats['Limit']);
print('\nCorrelation between Success and Path Time: ' + str(corr_))
# Descriptive statistics per path for condition A
A_path_stats


Condition A:
	Number of subjects: 12
	Avg Successes: 5.666666666666667
	SD Successes: 2.0150945537631877

Correlation between Success and Path Time: -0.10093502226664555


Unnamed: 0,PathNum,Target,+ Rate,Limit,Avg. T (+),SD T (+),Avg. T (after -),SD T (after -),Avg D (+),SD D (+),Avg S (+),SD S (+)
0,1,book,0.583333,300.0,161.914439,54.236144,100.005741,38.805747,14489.038161,13790.543307,89.148476,89.711515
1,2,shark,0.916667,305.0,103.495294,41.733701,60.849056,,12092.726558,16605.521816,101.76639,119.453848
2,3,bear,0.833333,234.0,52.072322,35.897732,93.972566,103.42906,2110.807717,3485.934199,33.18307,43.0761
3,4,no graffiti,0.166667,332.0,196.542249,58.004837,120.673807,41.780004,23423.4375,21143.464914,108.006857,75.701509
4,5,banque strechnitz,0.5,268.0,79.653422,9.573732,96.395639,65.235597,3254.769135,4175.380607,38.256744,45.44677
5,6,foxes,0.833333,394.0,155.129403,71.633193,62.425426,9.019406,12175.639697,13567.577997,64.749264,39.459367
6,7,johny patisserie,0.75,152.0,57.555982,15.655423,150.006501,98.169669,968.12247,814.592723,17.780105,14.565747
7,8,gorilla,0.833333,383.0,159.703784,63.684079,48.994737,56.295677,3218.907246,3059.43571,21.435977,21.903112
8,9,remote house,0.25,332.0,98.339506,17.564382,132.52293,77.269232,5586.28578,8060.741891,48.968444,66.446017


In [34]:
print('Condition B:')
print("\tNumber of subjects: " + str(len(B_anal)))
print('\tAvg Successes: ' + str(B_succ_avg));
print('\tSD Successes: ' + str(B_anal_succ_std));

corr = B_path_stats['+ Rate'].corr(B_path_stats['Limit']);
print('\nCorrelation between Success and Path Time Limit: ' + str(corr))
print('\nCorrelation between Avg D and Path Time Limit: ' + str(B_path_stats['Avg D (+)'].corr(B_path_stats['Limit'])))
print('\nCorrelation between Avg D and Success: ' + str(B_path_stats['Avg D (+)'].corr(B_path_stats['+ Rate'])))
print('\nCorrelation between Avg S and Success: ' + str(B_path_stats['Avg S (+)'].corr(B_path_stats['+ Rate'])))

B_path_stats

Condition B:
	Number of subjects: 14
	Avg Successes: 6.214285714285714
	SD Successes: 1.5281246137553164

Correlation between Success and Path Time Limit: 0.2128210725066101

Correlation between Avg D and Path Time Limit: 0.3239231137340035

Correlation between Avg D and Success: 0.6165455323737759

Correlation between Avg S and Success: 0.5819947085525395

Correlation between Avg S and Success: 0.30187460813721645


Unnamed: 0,PathNum,Target,+ Rate,Limit,Avg. T (+),SD T (+),Avg. T (after -),SD T (after -),Avg D (+),SD D (+),Avg S (+),SD S (+)
0,1,gorilla,0.714286,332.0,146.107746,62.410763,153.37046,89.679451,5993.44962,4933.530987,39.650591,30.533748
1,2,johny patisserie,0.642857,383.0,161.914168,85.031437,123.339809,46.85477,15198.584109,8559.010364,98.798146,49.799319
2,3,foxes,0.571429,152.0,49.831135,3.748735,129.493369,57.54961,890.037648,1110.650313,18.535721,23.937663
3,4,banque strechnitz,0.928571,394.0,206.320719,97.617372,79.476636,,36119.440456,45778.985238,139.418945,133.794212
4,5,no graffiti,0.0,268.0,,,102.886259,74.88821,,,,
5,6,bear,0.714286,332.0,167.674285,58.569429,95.827939,12.060102,18654.999914,19802.551988,92.407841,73.138492
6,7,shark,1.0,234.0,62.683658,51.683104,,,6253.615225,14133.219143,51.758274,76.305428
7,8,book,0.642857,305.0,177.997099,78.36798,98.827377,25.200365,14164.015096,9556.468814,81.499518,42.423366
8,9,windmill,1.0,300.0,150.240288,67.216606,,,74154.452977,64789.373737,443.376278,298.229444


<img src="./Pics/paths.png">


### Path Difficulties per Condition by Success Rate

In [21]:
# Path difficulties on Average

def pretty_print_diffs(df):
    
    for idx, row in df.iterrows():
        print("{0:0.0f} \t {1:0.3f} \t {2}".format(row['PathNum'], row['+ Rate'], row['Target']))
    print('\n')

top_diff_A = A_path_stats.sort_values(by="+ Rate", ascending=True)[["PathNum", "+ Rate", "Target"]][0:3]
top_diff_B = B_path_stats.sort_values(by="+ Rate", ascending=True)[["PathNum", "+ Rate", "Target"]][0:3]
bot_diff_A = A_path_stats.sort_values(by="+ Rate", ascending=False)[["PathNum", "+ Rate", "Target"]][0:3]
bot_diff_B = B_path_stats.sort_values(by="+ Rate", ascending=False)[["PathNum", "+ Rate", "Target"]][0:3]
print("Most difficult paths in A:")
pretty_print_diffs(top_diff_A)
print("Most difficult paths in B:")
pretty_print_diffs(top_diff_B)
print("Most easy paths in A:")
pretty_print_diffs(bot_diff_A)
print("Most easy paths in B:")
pretty_print_diffs(bot_diff_B)

Most difficult paths in A:
4 	 0.167 	 no graffiti
9 	 0.250 	 remote house
5 	 0.500 	 banque strechnitz


Most difficult paths in B:
5 	 0.000 	 no graffiti
3 	 0.571 	 foxes
2 	 0.643 	 johny patisserie


Most easy paths in A:
2 	 0.917 	 shark
3 	 0.833 	 bear
6 	 0.833 	 foxes


Most easy paths in B:
7 	 1.000 	 shark
9 	 1.000 	 windmill
4 	 0.929 	 banque strechnitz




In [27]:
# Correlation between overlapping path successes

A_targ = A_path_stats[:8]
B_targ = B_path_stats[:8]
B_targ = B_targ.reindex(index=B_targ.index[::-1])

B_rev = B_path_stats.reindex(index=B_path_stats.index[::-1]);

print("Correlations:")
print("Success rates for same target house :{0:0.3f}".format(A_targ['+ Rate'].corr(B_targ['+ Rate'])));
print("Success time for same path (but reversed) :{0:0.3f}".format(A['Avg. T (+)'].corr(B_rev['Avg. T (+)'])));
print("Success dist for same path (but reversed) :{0:0.3f}".format(A['Avg D (+)'].corr(B_rev['Avg D (+)'])));

Correlations:
Success rates for same target house :-0.024
Success time for same path (but reversed) :0.921
Success dist for same path (but reversed) :0.785


In [121]:
# Averages for each category and standard deviation
# Look for outliers
# Average speed? - Distance over time - Compare to final performance
#    for individual paths and for participants overall

# TODO - think about the relevance of individual analysis and path analysis
# TODO - time before guide appeared for success - individual subject and for each path?
# TODO - same performance for dyadic, stare at statistics and generate some observations

Other possible path analysis to apply:
- Look at the heatmap of movement for each path in either conditions. See if there can be identified a most used route, compare perhaps to the shortest route, and see if the most used route differs in either condition.

<img src='Pics/paths.png'>