In [11]:
import pickle
import h5py
import numpy as np
import os
import json
import pandas as pd
# Directory holding the input logs (data.p, Player.log) and the generated .h5 files.
# It is a relative path, so it is resolved against the notebook's working directory.
DATA_DIR = "../../data/logs"

In [3]:
def to_h5(fname='preprocessed.h5'):
    """Convert the pickled records in ``DATA_DIR/data.p`` into an HDF5 file.

    ``data.p`` is read as a stream of back-to-back pickled objects until EOF.
    Every record is indexed by the keys 'nextImage', 'currentImage', 'action',
    'reward', 'died' and 'didBoost'; 'id' is optional and falls back to an
    all-zero UUID byte string.

    If ``DATA_DIR/fname`` already exists, its datasets are loaded and the new
    records are stacked after them; the file is then rewritten in full. The
    rewrite goes through a temporary sibling file that replaces the target only
    once it has been written completely, so a failed write cannot destroy
    previously accumulated data.

    Datasets written:
        currentImage, nextImage -- the stacked image arrays
        metadata                -- one row per record:
                                   [action, reward, int(died), didBoost]
        ids                     -- one id per record

    Args:
        fname: File name of the HDF5 file inside DATA_DIR.

    Returns:
        None.

    Raises:
        KeyError: a record lacks one of the required keys.
        AssertionError: the image arrays / metadata disagree in shape.
        OSError: the existing HDF5 file is present but cannot be read. This is
            deliberately not swallowed, because carrying on would overwrite it.
    """
    source_path = os.path.join(DATA_DIR, "data.p")
    target_path = os.path.join(DATA_DIR, fname)

    # data.p holds several pickles written one after another, so keep loading
    # until the stream is exhausted.
    # NOTE(security): pickle.load can execute arbitrary code; only use this on
    # data.p files from a trusted source.
    data = []
    with open(source_path, "rb") as f:
        while True:
            try:
                data.append(pickle.load(f))
            except EOFError:
                break

    if not data:
        # Zero-length arrays cannot be stacked onto existing image datasets, and
        # writing them out would leave a file that later appends fail on.
        print("No records found in data.p, nothing to write")
        return

    # stack like types together
    nextImages = np.array([record['nextImage'] for record in data])
    currentImages = np.array([record['currentImage'] for record in data])
    metadata = np.array([
        [record['action'], record['reward'], int(record['died']), record["didBoost"]]
        for record in data
    ])
    ids = np.array([
        record.get('id', b"00000000-0000-0000-0000-000000000000")
        for record in data
    ])

    assert nextImages.shape == currentImages.shape
    assert metadata.shape[0] == nextImages.shape[0]

    if os.path.exists(target_path):
        # read in existing and append; any error here propagates so that an
        # unreadable file is never mistaken for a missing one and overwritten
        with h5py.File(target_path, 'r') as hf:
            nextImages = np.vstack((hf['nextImage'][...], nextImages))
            currentImages = np.vstack((hf['currentImage'][...], currentImages))
            metadata = np.vstack((hf['metadata'][...], metadata))
            ids = np.hstack((hf['ids'][...], ids))
    else:
        print("No file found, creating initial h5")

    # write new: build the complete file next to the target, then swap it in
    tmp_path = target_path + ".tmp"
    try:
        with h5py.File(tmp_path, 'w') as hf:
            hf.create_dataset('currentImage', data=currentImages)
            hf.create_dataset('nextImage', data=nextImages)
            hf.create_dataset('metadata', data=metadata)
            hf.create_dataset("ids", data=ids)
    except Exception:
        # Do not leave a half-written temp file behind; the target is untouched.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    os.replace(tmp_path, target_path)

    
# Build (or append to) DATA_DIR/preprocessed_try2.h5 from DATA_DIR/data.p.
# NOTE: to_h5 stacks every record of data.p onto whatever the target file already
# holds, so re-running this cell appends the same records a second time.
to_h5('preprocessed_try2.h5')

No file found, creating initial h5


In [17]:
# Absolute location of the player log inside DATA_DIR.
playerlog = os.path.abspath(os.path.join(DATA_DIR, "Player.log"))

# Parse every line as JSON, pull out its 'message' entry, and keep only the
# entries that are dicts.
with open(playerlog, "r") as f:
    records = [
        message
        for message in (json.loads(line).get('message') for line in f)
        if isinstance(message, dict)
    ]
# records

In [18]:
# One row per dict-valued log message; the columns come from the message keys.
df = pd.DataFrame(records)

In [19]:
# Display the frame as the cell output (pandas abbreviates the middle rows of a long frame).
df

Unnamed: 0,Timestep,Length,Dead,Id
0,1605664150,10,False,cd346ac4-db15-4878-8db1-09f4e29d2e55
1,1605664151,14,False,eb69ed7d-7538-41bf-88c9-8226abc21ad3
2,1605664151,14,False,0b2a884d-0f54-4499-8a60-6ea12418ed50
3,1605664152,20,False,6580a874-d224-45fb-b855-77877fa819a2
4,1605664153,19,False,0a6cd4a4-81b8-4e38-98c9-e1c1a9b243ce
...,...,...,...,...
50596,1605704106,76,False,0692472c-27b7-43a8-8fce-ca9a32c17b65
50597,1605704107,78,False,3b6339d5-f702-42bb-a9e6-18a6f5ce662d
50598,1605704108,74,False,fd39fbd3-1152-446a-9657-24c6fccd8a45
50599,1605704109,74,False,dc06e042-5380-42ea-aa64-035c6444d4ee
