In [1]:
import pandas as pd
import os
import json

In [2]:
# Load the Firebase export (a JSON file) into a nested dictionary.
firebase_data_path = '../data/neurocrypt-rtdb-export.json'
# Be explicit about the encoding so the read does not depend on the
# platform's locale default (JSON exports are UTF-8).
with open(firebase_data_path, encoding='utf-8') as f:
    firebase_data = json.load(f)

### Data structure
```firebase_data``` contains the collected data stored in firebase. The dictionary is structured like this.

The first-level keys are the user IDs; each one maps to that user's data.

- ```firebase_data[user_id]```
  - session_id_1 (another dict)
    - block (a list of dictionaries for each sub-block)
      - gameNumber (subBlock number)
      - noiseMetrics (dict with performance on noise)
        - hitRate
        - hits
        - misses
      - passMetrics (dict with performance on pass-sequence)
      - subBlockMetrics (dict with performance on entire subBlock)
    - hitRate (hitrate during entire session)
    - hits
    - misses
    - session (one of "TRAIN", "AUTH1", "AUTH2")
  - session_id_2
  - session_id_3
  - GAMETYPE (one of "VIS", "AUD", "CONTROL")
  - noteGenerateLag
  - noteSpeed
- ... other users

In [3]:
# Display the raw export to inspect how it is nested
firebase_data

{'0AT3q4mTMncZcXXdMv8pkQIXdqk1': {'-NvudMOZ3lNhqry-7uON': {'block': [{'gameNumber': 1,
     'noiseMetrics': {'hitRate': 0.6111111111111112, 'hits': 55, 'misses': 35},
     'passMetrics': {'hitRate': 0.6488888888888888,
      'hits': 292,
      'misses': 158},
     'subBlockMetrics': {'hitRate': 0.6425925925925926,
      'hits': 347,
      'misses': 193}},
    {'gameNumber': 2,
     'noiseMetrics': {'hitRate': 0.7888888888888889, 'hits': 71, 'misses': 19},
     'passMetrics': {'hitRate': 0.7666666666666667,
      'hits': 345,
      'misses': 105},
     'subBlockMetrics': {'hitRate': 0.7703703703703704,
      'hits': 416,
      'misses': 124}},
    {'gameNumber': 3,
     'noiseMetrics': {'hitRate': 0.6111111111111112, 'hits': 55, 'misses': 35},
     'passMetrics': {'hitRate': 0.7266666666666667,
      'hits': 327,
      'misses': 123},
     'subBlockMetrics': {'hitRate': 0.7074074074074074,
      'hits': 382,
      'misses': 158}},
    {'gameNumber': 4,
     'noiseMetrics': {'hitRate': 0

In [4]:
# The first-level keys of the export are the user IDs
user_ids = [*firebase_data]
len(user_ids)  # check if consistent with firebase auth db (NR - yes letsgo)

88

In [5]:
def get_sessions(user_id):
    """Return the session records stored for ``user_id``.

    The user is looked up in the module-level ``firebase_data``. Only the
    first-level values that are dictionaries are kept (in stored order);
    the remaining first-level entries are treated as stats, not sessions.
    """
    return [
        entry
        for entry in firebase_data[user_id].values()
        if isinstance(entry, dict)
    ]

# Sanity check: print the second user ID and its number of sessions, one per line
print(user_ids[1], len(get_sessions(user_ids[1])), sep="\n")

267spXKQrOQxNGhW9Mj19RVBZnF3
3


In [6]:
# Keep only the user_ids that have exactly 3 sessions in the data dict
valid_user_ids = list(filter(lambda uid: len(get_sessions(uid)) == 3, user_ids))
print("After cleaning:", len(valid_user_ids))


After cleaning: 70


In [7]:
# Show the user_ids that were dropped, in their original order.
# Membership is checked against a set so each lookup is constant time
# instead of a scan through the list of valid IDs.
valid_user_id_set = set(valid_user_ids)
invalid_user_ids = [user_id for user_id in user_ids if user_id not in valid_user_id_set]
invalid_user_ids

['0AT3q4mTMncZcXXdMv8pkQIXdqk1',
 '61Alts16Drd976ojNN40qLI0OYf2',
 '9QlFWUaX8YUvnJbOzqLHj9Hm7m13',
 'F8h1ogBE7YWzfrMxHZXqVWRugDj1',
 'ILvZTNvwCJU2yelJbt2ec1frgTd2',
 'JX1MSnsjz5YX3YYpFzq8k7PNrkF2',
 'QBQ7IYCMNZgIrRo6xyIWaAE8zfh2',
 'ZKxy4N3cRheVWvkBidVPlhqpUys1',
 '_gamedata',
 'aCdtxO7Yt8NS4bUEcVIwO8hXxrp2',
 'al1J9obEr8RGWeegYm6pdMUukTy1',
 'cGCLDYJZjufGLDzs2FGvzB2K7G13',
 'iqDkwDQyBkbrSJuyNP8MtppVfw33',
 'jjvUl1JzWCd1tOZiMk7b4c3LxUg2',
 'knhgxWpBpgVuwufdvzxJp9tmhYD3',
 'nmEA2h7hZde6WDqxdHUBxndmiuz1',
 'oV4HGMAODpaxKUcCQHdIzfn3pPB3',
 'ydFbNfL3WSeGXeHYprAbWXhsihh2']

## Data Restructuring
Three DataFrames will be created with the following structure:

<p align="center">
  <img src="../data/data_schema.png" />
</p>

In [56]:
def sessionDataPopulator(session):
    """Flatten one session into a list of per-sub-block records.

    Each entry of ``session['block']`` yields a dict with the keys
    ``block`` (taken from the entry's ``gameNumber``), ``noiseHits``,
    ``noiseMisses``, ``passHits`` and ``passMisses``, in that order.
    """
    return [
        {
            'block': sub['gameNumber'],
            'noiseHits': sub['noiseMetrics']['hits'],
            'noiseMisses': sub['noiseMetrics']['misses'],
            'passHits': sub['passMetrics']['hits'],
            'passMisses': sub['passMetrics']['misses'],
        }
        for sub in session['block']
    ]

In [57]:
# Build mainData: one row per valid user, holding the user's modality and the
# per-sub-block records of each of the three session types.

# Session label stored in the export -> mainData column that receives it.
session_columns = {
    'TRAIN': 'trainingData',
    'AUTH1': 'auth1Data',
    'AUTH2': 'auth2Data',
}
main_columns = ['userID', 'modality', *session_columns.values()]

records = []
for user_id in valid_user_ids:
    record = {
        'userID': user_id,
        'modality': firebase_data[user_id]['GAMETYPE'],
    }
    for session in get_sessions(user_id):
        column = session_columns.get(session['session'])
        # Sessions with an unrecognised label are skipped.
        if column is not None:
            record[column] = sessionDataPopulator(session)
    records.append(record)

# Create the frame in a single call rather than filling an empty frame cell
# by cell. Passing `columns` fixes the column order and leaves NaN wherever
# a user has no session of a given type.
mainData = pd.DataFrame(records, columns=main_columns)


In [58]:
# Write mainData to CSV, leaving out the DataFrame index
mainData.to_csv('../data/main_NC_Data.csv', index=False)

#### How to read the CSV

In [59]:
# Load the exported CSV back into a DataFrame and display it.
# NOTE: the session columns come back as strings, not lists (see the output
# of the next cell); they presumably need parsing, e.g. with
# ast.literal_eval, before use - TODO confirm.
mainData = pd.read_csv('../data/main_NC_Data.csv')
mainData

Unnamed: 0,userID,modality,trainingData,auth1Data,auth2Data
0,267spXKQrOQxNGhW9Mj19RVBZnF3,AUD,"[{'block': 1, 'noiseHits': 67, 'noiseMisses': ...","[{'block': 1, 'noiseHits': 233, 'noiseMisses':...","[{'block': 1, 'noiseHits': 248, 'noiseMisses':..."
1,2IuHuexDUpgOdXCBYfDqO7ubtEC2,AUD,"[{'block': 1, 'noiseHits': 47, 'noiseMisses': ...","[{'block': 1, 'noiseHits': 211, 'noiseMisses':...","[{'block': 1, 'noiseHits': 240, 'noiseMisses':..."
2,3YKJchZqXXPUm7ouoYwzWlTp4D53,CONTROL,"[{'block': 1, 'noiseHits': 58, 'noiseMisses': ...","[{'block': 1, 'noiseHits': 266, 'noiseMisses':...","[{'block': 1, 'noiseHits': 244, 'noiseMisses':..."
3,4SQKvEIUBucwuWNoKkOVBX30H1P2,CONTROL,"[{'block': 1, 'noiseHits': 65, 'noiseMisses': ...","[{'block': 1, 'noiseHits': 258, 'noiseMisses':...","[{'block': 1, 'noiseHits': 252, 'noiseMisses':..."
4,5NToitXbnhXNNLn80hbxLBUSbfw2,VIS,"[{'block': 1, 'noiseHits': 66, 'noiseMisses': ...","[{'block': 1, 'noiseHits': 250, 'noiseMisses':...","[{'block': 1, 'noiseHits': 258, 'noiseMisses':..."
...,...,...,...,...,...
65,uEJUpBKufKO3DmepKtVcTfFT6WU2,VIS,"[{'block': 1, 'noiseHits': 74, 'noiseMisses': ...","[{'block': 1, 'noiseHits': 268, 'noiseMisses':...","[{'block': 1, 'noiseHits': 175, 'noiseMisses':..."
66,umG8DQRaeVd6RPOJfoLKNdclG8e2,CONTROL,"[{'block': 1, 'noiseHits': 66, 'noiseMisses': ...","[{'block': 1, 'noiseHits': 245, 'noiseMisses':...","[{'block': 1, 'noiseHits': 242, 'noiseMisses':..."
67,wFRIlVm6H0N0WjSBNg3EMMaTf0H3,CONTROL,"[{'block': 1, 'noiseHits': 61, 'noiseMisses': ...","[{'block': 1, 'noiseHits': 251, 'noiseMisses':...","[{'block': 1, 'noiseHits': 275, 'noiseMisses':..."
68,xTyF5REL0bUIT99Hkx2YCWC75572,AUD,"[{'block': 1, 'noiseHits': 59, 'noiseMisses': ...","[{'block': 1, 'noiseHits': 267, 'noiseMisses':...","[{'block': 1, 'noiseHits': 259, 'noiseMisses':..."


In [55]:
# Inspect the AUTH1 cell of the first row
mainData.loc[0, 'auth1Data']

"[{'subBlock': 1, 'noiseHits': 233, 'noiseMisses': 127, 'passHits': 141, 'passMisses': 39}]"