### Step 2.2: Data generation

__Step goals:__ Generate individual traces from aggregated data.

__Step overview:__
1. Load the data;
2. Sample individuals from the original number of entrances;
3. Construct the data frame and save the result.

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from tqdm import tqdm

np.random.seed(12345) # fix random seed for reproducibility

1. Load the data: entry or exit

In [2]:
data_type = 'entry' 
gdf = gpd.read_file(f'../data/interim/{data_type}_station.json')

# Save station names
station_names = gdf['station'] 

# Select only the data
data = gdf.loc[:,'0200-0215':'0145-0200'] 

# Sum the traffic over the day for each station
N = np.sum(data.T).values

# Scale data
data = data.div(N, axis=0).values

2. Generate individual traces

In [3]:
col = []
mat = []
for i in tqdm(range(data.shape[0])):
    rate = N[i]
    jj = np.random.poisson(rate, 1)
    j = jj[0]
    if (j < 1):
        j = 1
    for j in range(j):
        vec = np.random.multinomial(1, data[i,:], size=1)
        col.append(i)
        mat.append(vec[0])

100%|████████████████████████████████████████████████████████████████████████████████| 264/264 [01:13<00:00,  3.61it/s]


3. Construct resulting data frame

In [4]:
# Resulting list is of 4571040 individuals
X = np.array(mat)

# Number of time stamps
t = np.arange(0, 96)

# Resulting matrix
Z = np.matmul(X, t)

# Resulting data frame
result = pd.DataFrame()
result["station_id"] = col
result["time_stamp"] = Z

# Merge the data to get station names
result = pd.merge(result, station_names, left_on='station_id', right_on=station_names.index)

In [5]:
# Save the data
result.to_csv('../data/interim/entry_individual.csv', index=False)