# Subject Data PreProcessing
* find subject data
* concatenate the inter-session data
* concatenate the intra-session data

## Configuration

In [1]:
# General configuration
import os

# data_directory: str
#     Path to a directory to store data.
data_directory = '.'

# install_missing_packages: bool
#     A flag indicating if missing packages should be automatically installed
install_missing_packages = True

# use_conda: bool
#     A flag indicating if conda should be used for software installation.
#     If False, pip will be used. The default is to use conda if jupyter
#     is run in a conda environment.
use_conda = 'CONDA_EXE' in os.environ

## Checking for missing packages

In [2]:
import importlib

def check_package(package, pip_pkg: str = None, conda_pkg: str = None):
    """Check if a given package is installed. If missing install
    it (if global flag `install_missing_packages` is True) either with
    pip or with conda (depending on `use_conda`).
    """
    if importlib.util.find_spec(package) is not None:
        return  # ok, package is already installed

    if not install_missing_packages:
        raise RuntimeError(f"{package} is not installed!")

    if use_conda:
        import conda.cli
        conda.cli.main('conda', 'install',  '-y', conda_pkg or package)
    else:
        import subprocess
        import sys            
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pip_pkg or package])
        
# This is to exit cells without error tracebacks (cosmetic purpose)
class StopExecution(Exception):
    def _render_traceback_(self):
        pass

## Creating the required environment (skip if already done)

Running the following cell will create a file graphs.yml that can be used to setup a conda environment containing the required packages. If you already downloaded the file from my GitHub, skip the next cell and create the env directly from it.

In [None]:
%%writefile graphs.yml
name: graphs
channels:
  - conda-forge
  - defaults
dependencies:
  - python=3.6
  - jupyter
  - imageio
  - imageio-ffmpeg
  - matplotlib
  - scikit-image
  - opencv
  - networkx
  - pandas
  - statsmodels

### Environment Creation
To create the environment, open the terminal, go to the directory where you stored the graphs.yml file (the directory of the notebook) and type
conda env create -f graphs.yml
After running this command you have to activate the environment (Linux/MacOS: conda activate graphs, Windows: activate graphs) and then reopen the notebook in that environment.

## Main Part

### Imports and directory information

In [3]:
import os
import json
import numpy as np
import cv2 
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nX
import glob
import scipy.cluster.vq as clusters
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.preprocessing import normalize
from pandas.plotting import autocorrelation_plot as AC_plot 
from statsmodels.graphics import tsaplots
from statsmodels.tsa.stattools import acf
from skimage.filters import gaussian
from mpl_toolkits.mplot3d import Axes3D 


OG_DATA_PATH = './'
DATA_PATH = './Data Exploration/'
PROCESSED_DATA_PATH = './Results/'
# Getting the Folder without hidden files in ascending order 
DATA_FOLDER = sorted([f for f in os.listdir(DATA_PATH) if not f.startswith('.')], key=str.lower)

### Extracting all subject IDs from the data folder

In [4]:
subIDs = []
for sub in DATA_FOLDER:
    subIDs.append(int(sub[0:4]))
subIDs = np.unique(subIDs)
subIDs

array([1023])

### Combine the data
* Loop through all subjects
* extract the session data
* combine the data
* save it

In [173]:
#for subject in subIDs:

subject = subIDs[0] # change with the loop

# extract the exploration data
subject_data = sorted([f for f in DATA_FOLDER if f.startswith(str(subject)+'_Expl_')], key=str.lower)

# open the JSON file as dictionary
with open(DATA_PATH + subject_data[0]) as f:
    subject_session = json.loads(f.read())
    
# extract only the dataPoints
#subject_session = subject_session['trials']
#subject_pd = pd.DataFrame(list(subject_session['dataPoints']))
    


In [178]:
hitpoint_list = []
timestamp_start_list = []
timestamp_end_list = []

for each in subject_session['trials'][0]['dataPoints']:
    hitpoint_list.append(each['rayCastHitsCombinedEyes'])
    timestamp_start_list.append(each['timeStampDataPointStart'])
    timestamp_end_list.append(each['timeStampDataPointEnd'])
    
for index in range(len(hitpoint_list)):
    hitpoint_list[index].append({'timeStampDataPointStart':timestamp_start_list[index]})
    hitpoint_list[index].append({'timeStampDataPointEnd':timestamp_end_list[index]})
    
                                        
print('hitpoint_list: ' + str(len(hitpoint_list))) 
print('timestamp_start_list: ' + str(len(timestamp_start_list))) 
print('timestamp_end_list: ' + str(len(timestamp_end_list)))

hitpoint_list: 66823
timestamp_start_list: 66823
timestamp_end_list: 66823


In [183]:
hitpoints = pd.DataFrame()
# column name list 
col_names =  ['timeStampDataPointStart','timeStampDataPointEnd','hitObjectColliderName', 'ordinalOfHit', 'hitPointOnObject.x',
       'hitPointOnObject.y', 'hitPointOnObject.z',
       'hitObjectColliderBoundsCenter.x', 'hitObjectColliderBoundsCenter.y',
       'hitObjectColliderBoundsCenter.z']
# create an empty dataframe
# with columns
hitpoints_df  = pd.DataFrame(columns = col_names)


for each in hitpoint_list:
    hitpoints = hitpoints.append(pd.json_normalize(each))

In [141]:
hitpoints

ValueError: Length of values (66823) does not match length of index (123295)

In [72]:
test

[[{'hitPointOnObject': {'x': -64.1910629272461,
    'y': 1.429070234298706,
    'z': 71.21446990966797},
   'hitObjectColliderName': 'pavement_O.002',
   'hitObjectColliderBoundsCenter': {'x': -165.16314697265625,
    'y': 1.4516879320144653,
    'z': -15.555870056152344},
   'ordinalOfHit': 1},
  {'hitPointOnObject': {'x': -65.56773376464844,
    'y': 1.1634469032287598,
    'z': 81.047607421875},
   'hitObjectColliderName': 'terrain_S.001',
   'hitObjectColliderBoundsCenter': {'x': -60.39029312133789,
    'y': 1.153674840927124,
    'z': 100.80474090576172},
   'ordinalOfHit': 2}],
 [{'hitPointOnObject': {'x': -64.1910629272461,
    'y': 1.429070234298706,
    'z': 71.21446990966797},
   'hitObjectColliderName': 'pavement_O.002',
   'hitObjectColliderBoundsCenter': {'x': -165.16314697265625,
    'y': 1.4516879320144653,
    'z': -15.555870056152344},
   'ordinalOfHit': 1},
  {'hitPointOnObject': {'x': -65.56773376464844,
    'y': 1.1634469032287598,
    'z': 81.047607421875},
   'hit

In [80]:
subject_session['trials'][0]

{'trialId': -42069,
 'someRandomInformation': 'lel',
 'timeTrialMeasurementStarted': 1619690540.0533192,
 'timeTrialMeasurementStopped': 1619691140.4614377,
 'dataPoints': [{'timeStampDataPointStart': 1619690540.055303,
   'timeStampDataPointEnd': 1619690540.0617511,
   'timeStampGetVerboseData': 1619690540.0557992,
   'eyeOpennessLeft': 1.0,
   'eyeOpennessRight': 1.0,
   'pupilDiameterMillimetersLeft': 4.205169677734375,
   'pupilDiameterMillimetersRight': 4.043212890625,
   'eyePositionCombinedWorld': {'x': -59.086692810058594,
    'y': 2.4139397144317627,
    'z': 34.75539016723633},
   'eyeDirectionCombinedWorld': {'x': -0.13859999179840088,
    'y': -0.02674243040382862,
    'z': 0.98997962474823},
   'eyeDirectionCombinedLocal': {'x': -0.011444091796875,
    'y': -0.008209228515625,
    'z': 0.9998931884765625},
   'eyePositionLeftWorld': {'x': -59.0859260559082,
    'y': 2.4139771461486816,
    'z': 34.755313873291016},
   'eyeDirectionLeftWorld': {'x': -0.13816621899604797,
  

In [135]:
works_data = pd.json_normalize(data=subject_pd, record_path='rayCastHitsCombinedEyes')

KeyError: 'rayCastHitsCombinedEyes'

In [142]:
subject_pd['rayCastHitsCombinedEyes'].head()

0    [{'hitPointOnObject': {'x': -64.1910629272461, 'y': 1.429070234298706, 'z': 71.21446990966797}, 'hitObjectColliderName': 'pavement_O.002', 'hitObjectColliderBoundsCenter': {'x': -165.16314697265625, 'y': 1.4516879320144653, 'z': -15.555870056152344}, 'ordinalOfHit': 1}, {'hitPointOnObject': {'x': -65.56773376464844, 'y': 1.1634469032287598, 'z': 81.047607421875}, 'hitObjectColliderName': 'terrain_S.001', 'hitObjectColliderBoundsCenter': {'x': -60.39029312133789, 'y': 1.153674840927124, 'z': 100.80474090576172}, 'ordinalOfHit': 2}]          
1    [{'hitPointOnObject': {'x': -64.1910629272461, 'y': 1.429070234298706, 'z': 71.21446990966797}, 'hitObjectColliderName': 'pavement_O.002', 'hitObjectColliderBoundsCenter': {'x': -165.16314697265625, 'y': 1.4516879320144653, 'z': -15.555870056152344}, 'ordinalOfHit': 1}, {'hitPointOnObject': {'x': -65.56773376464844, 'y': 1.1634469032287598, 'z': 81.047607421875}, 'hitObjectColliderName': 'terrain_S.001', 'hitObjectColliderBoundsCenter': {'