In [43]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

In [44]:
# Hand-written toy example of the expected layout: one row per event.
# NOTE: the following cell rebinds `df`, `vocabulary` and `targets` with generated data.
df = pd.DataFrame(
    [
        # (person_id, abspos, age, event)
        #   abspos: hours since Jan 1, 2020 (see utils.py:calculate_abspos.py)
        #   age:    years
        #   event:  tokenized event; lists have uneven lengths, each element is a token
        (1, 100.5, 10, [5, 6]),
        (1, 200.123, 20, [7]),
        (1, 300.3, 30, [5, 7, 8, 10]),
        (2, 400.0, 40, [1, 2, 3]),
        (2, 500.0, 50, [1, 2, 3, 4]),
        (3, 600.0, 60, [1]),
    ],
    # Segment is not stored here; it would have to be created in collate_fn
    # (or could be created here if everything is kept in memory).
    columns=['person_id', 'abspos', 'age', 'event'],
)

# Token strings 'A'..'J' mapped to ids 1..10.
vocabulary = dict(zip('ABCDEFGHIJ', range(1, 11)))

# One binary label per person.
targets = pd.DataFrame(
    [(1, 1), (2, 0), (3, 1)],
    columns=['person_id', 'target'],
)

In [45]:
# Generate a larger synthetic dataset with the same column layout as the toy example.
# NOTE: this cell rebinds `df`, `vocabulary` and `targets`.
n_persons = 1000
seed = 42  # fixed so that Restart & Run All regenerates exactly the same fake data
rng = np.random.default_rng(seed)

data = []
for i in range(n_persons):
    # 1-9 events per person (upper bound is exclusive).
    n_events = int(rng.integers(1, 10))
    for j in range(n_events):
        # 1-9 tokens per event, each token id drawn from 1..10.
        n_tokens = int(rng.integers(1, 10))
        data.append({
            'person_id': i,
            'abspos': 100.5 + j,
            'age': 10 + j,
            # .tolist() yields plain Python ints instead of NumPy scalars.
            'event': rng.integers(1, 11, size=n_tokens).tolist(),
        })
df = pd.DataFrame(data)
vocabulary = {'[PAD]': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10}

# Per person: sum of all token ids across all of that person's events.
person_sum = df['event'].map(sum).groupby(df['person_id']).sum().rename('token_sum')

# Binary label: 1 when the person's token sum is at or below the median, else 0.
target = (person_sum <= person_sum.quantile(0.5)).astype(int).rename('target')

person_id = person_sum.index

# reset_index() turns the person_id index into a column, so ids and labels
# stay paired by construction rather than by positional coincidence.
targets = target.reset_index()

In [46]:
# Persist the event table (parquet) and the per-person labels (CSV).
output_dir = Path('fake_data')
# Create the target directory first: on a fresh checkout it may not exist,
# and the write calls below would then fail.
output_dir.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_dir / 'sequence_data.parquet', index=False)
targets.to_csv(output_dir / 'targets.csv', index=False)

In [47]:
# Store the vocabulary mapping as JSON alongside the other fake-data files.
with open('fake_data/vocab.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(vocabulary))