In [None]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import dask

from dask.distributed import Client
from datetime import datetime

# Create a dask.distributed Client asking for six workers; because it is the
# most recently created client, later .compute() calls are presumably routed
# through it (dask default behaviour — confirm for the installed version).
client = Client(n_workers=6)

In [None]:
# Read every CSV matching the weekly-patterns glob into a single dask DataFrame.
# poi_cbg is forced to float64 instead of letting dask infer its dtype.
patterns = dd.read_csv("../test-dir/safegraph/weekly-patterns/*.csv", dtype={'poi_cbg': 'float64'})

In [None]:
# Rebalance the data into 10 partitions. The first positional parameter of
# DataFrame.repartition is `divisions`, so the partition count has to be
# passed by keyword.
df = patterns.repartition(npartitions=10)
df

In [None]:
import json

# Keep only the place id, the bucketed dwell-time map, the visit count and the
# median dwell; every later cell works on these four columns.
df = df[['safegraph_place_id', 'bucketed_dwell_times', 'raw_visit_counts', 'median_dwell']]

# Split the bucket maps into arrays so we can sum over the buckets
def map_to_array(m, buckets=("<5", "5-20", "21-60", "61-240", ">240")):
    """Convert one ``bucketed_dwell_times`` JSON map into a fixed-length count array.

    The result always has one entry per label in ``buckets``, in that order,
    so arrays from different rows can be summed element-wise and line up with
    the bucket labels used for plotting.

    Parameters
    ----------
    m : str
        JSON object mapping bucket label to visit count, e.g.
        ``'{"<5":40,"5-20":22,"21-60":7,"61-240":10,">240":3}'``.
        Anything that is not a non-blank string (for example a NaN cell) is
        treated as an empty map.
    buckets : sequence of str, optional
        Bucket labels to extract, in output order.

    Returns
    -------
    numpy.ndarray
        ``int32`` array of length ``len(buckets)``. Labels missing from ``m``
        contribute 0; keys of ``m`` that are not in ``buckets`` are ignored.

    Raises
    ------
    json.JSONDecodeError
        If ``m`` is a non-blank string that is not valid JSON.
    """
    # Missing CSV cells are not strings; map them to "no visits in any bucket"
    # instead of failing on a string method.
    if not isinstance(m, str) or not m.strip():
        return np.zeros(len(buckets), dtype=np.int32)

    counts = json.loads(m)
    # Look each label up by name rather than trusting the key order of the
    # document, and fill absent buckets with 0 so the length never varies.
    return np.array([int(counts.get(label, 0)) for label in buckets], dtype=np.int32)
    
# Parse every bucketed_dwell_times string with map_to_array. `meta` declares
# the result to dask as an object-dtype Series so it does not have to infer it.
df['dwell_array'] = df['bucketed_dwell_times'].apply(map_to_array, meta=('bucketed_dwell_times', 'object'))
# Preview the first rows to check the new column.
df.head()

In [None]:
# Add up the per-row dwell arrays and materialise the total as one overall
# histogram.
histogram = df['dwell_array'].values.sum().compute()

In [None]:
# NOTE(review): a bare name is only displayed when it is the last expression
# of a cell, so this line shows nothing as written.
histogram
# Pull every per-row dwell array into local memory and count them.
vals = df['dwell_array'].values.compute()
len(vals)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the overall histogram, one bar per dwell-time bucket label.
fig, ax = plt.subplots(tight_layout=True)
ax.bar(["<5", "5-20", "21-60", "61-240", ">240"], histogram)

In [None]:
import scipy.stats as sts
# Ok, let's simulate
# Number of dwell times drawn per histogram inside simulate()
N_SIMULATIONS = 10000
# Bucket edges, which are based on the bucketed_dwell_times column.
# The last edge (500) closes the open-ended ">240" bucket.
bins = np.array([0, 5,  20,  60, 240, 500])

def simulate(dwell_hist, bin_edges=None, n_simulations=None):
    """Estimate a mean dwell time from one bucketed dwell-time histogram.

    The bucket counts are normalised into probabilities over the bucket
    midpoints, a Gaussian KDE is fitted to samples drawn from those midpoints,
    the KDE is evaluated on a grid spanning 0-300 and renormalised into a
    discrete distribution, and the mean of draws from that distribution is
    returned.

    Parameters
    ----------
    dwell_hist : array-like
        One count per bucket, i.e. ``len(bin_edges) - 1`` values.
    bin_edges : array-like, optional
        Bucket edges. Defaults to the module-level ``bins``.
    n_simulations : int, optional
        Number of dwell times drawn from the final distribution. Defaults to
        the module-level ``N_SIMULATIONS``. Five times as many midpoint
        samples are used to fit the KDE.

    Returns
    -------
    float or int
        Mean simulated dwell time. ``0`` is returned when the histogram holds
        no visits, or when the KDE step fails (the error is printed).
    """
    # Fall back to the notebook-level settings only when no override is given.
    edges = np.asarray(bins if bin_edges is None else bin_edges)
    n_draws = N_SIMULATIONS if n_simulations is None else n_simulations

    counts = np.array(dwell_hist).astype(np.int32)
    total = counts.sum()
    # An empty histogram would turn into NaN probabilities and make
    # np.random.choice raise; report it with the same sentinel as a failed fit.
    if total <= 0:
        return 0

    # Sample bucket midpoints in proportion to the bucket counts to get
    # point observations the KDE can be fitted to.
    midpoints = (edges[:-1] + edges[1:]) / 2
    sampled = np.random.choice(a=midpoints, size=n_draws * 5, p=counts / total)
    try:
        # Create a linear space of 0-300 minutes
        space = np.linspace(0, 300)
        rkde = sts.gaussian_kde(sampled)
        # Create a custom distribution from the KDE PDF for 0-300 minutes
        pdf = rkde.pdf(space)
        custm = sts.rv_discrete(name='custm', a=space.min(), b=space.max(), values=(space, pdf / pdf.sum()))
        # Draw all dwell times in one vectorised call instead of one rvs()
        # call per sample, then return the average dwell time.
        return custm.rvs(size=n_draws).mean()
    except Exception as e:
        # Best effort: e.g. the KDE cannot be fitted when every sample is the
        # same midpoint. Keep the notebook running and flag the row with 0.
        print(e)
        return 0
    
def prnt(x):
    """Return the sum of ``x`` after casting its elements to 32-bit integers."""
    as_int32 = np.array(x).astype(np.int32)
    total = as_int32.sum()
    return total
    
def simulate_partition(df):
    """Add simulated dwell and contact columns to one pandas partition.

    Parameters
    ----------
    df : pandas.DataFrame
        Partition with ``dwell_array``, ``raw_visit_counts`` and
        ``median_dwell`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with three extra columns appended in this order:
        ``sampled`` (result of ``simulate`` on each ``dwell_array``),
        ``contact_sim`` (``raw_visit_counts * sampled``) and
        ``contact_median`` (``raw_visit_counts * median_dwell``).
        The input frame is left untouched.
    """
    # Map over the single column that is needed: this avoids building a row
    # object per record and, unlike DataFrame.apply(axis=1), still yields a
    # Series when the partition is empty. Cast so the dtype is float even
    # when every simulation returned the integer sentinel 0.
    sampled = df['dwell_array'].map(simulate).astype(float)
    # assign() returns a copy, so the partition handed in by dask is not
    # mutated in place.
    return df.assign(
        sampled=sampled,
        contact_sim=df['raw_visit_counts'] * sampled,
        contact_median=df['raw_visit_counts'] * df['median_dwell'],
    )

# Drop the raw bucket strings, then run simulate_partition on every partition.
# `meta` lists the output columns and dtypes so dask does not have to infer them.
test = (
    df.drop('bucketed_dwell_times', axis=1)
    .map_partitions(
        simulate_partition,
        meta={
            'safegraph_place_id': 'str',
            'raw_visit_counts': 'int32',
            'median_dwell': 'float',
            'dwell_array': 'object',
            'sampled': 'float',
            'contact_sim': 'float',
            'contact_median': 'float',
        },
    )
)

In [None]:
# Execute the simulation graph and display the complete result.
test.compute()

In [None]:
# Scratch check: normalise five example bucket counts so they sum to 1,
# the same form simulate() passes as `p` to np.random.choice.
arry = np.array([2, 15, 7, 13, 22])
arry / arry.sum()

In [None]:
plt.figure(figsize=(8,6))
# NOTE(review): np.histogram bins the five numbers themselves as observations
# (2 lands in 0-5, 7/13/15 in 5-20, 22 in 20-60). If they are meant to be
# per-bucket counts like the example above, this is not their density — confirm.
hist, e = np.histogram([2, 15, 7, 13, 22], bins=[0, 5, 20, 60, 240, 500], density=True)
plt.bar(["<5", "5-20", "21-60", "61-240", ">240"], hist)
# Evaluation grid for the density curve.
x = np.linspace(0, 500, 500)
# NOTE(review): `sim` is not defined in any cell visible here; on a fresh
# kernel this line raises NameError unless it is created elsewhere.
plt.plot(x, sim.pdf(x), '--', c='C3', lw=4, label='resampled KDE')
# plt.title('n = %d' % n)
plt.legend()
plt.show()
sim.pdf(x)

In [None]:
plt.figure(figsize=(8,6))
# Bin the five example values using edges 5..500; there is no 0-5 bin here,
# so the value 2 lies outside the range. `e` receives the bin edges.
hist, e= np.histogram([2, 15, 7, 13, 22], bins=[5, 20, 60, 240, 500])
# Plots a histogram of the four bin counts themselves.
plt.hist(hist)

In [None]:
plt.figure(figsize=(8,6))
# Draw `hist` as left-aligned bars starting at each bin's lower edge; `hist`
# and `e` are whatever the most recently run np.histogram cell produced.
# NOTE(review): assuming the cell directly above ran last, the edges are
# 5, 20, 60, 240, 500 (widths 15, 40, 180, 260), which the hard-coded widths
# [5, 10, 15, 20] do not match — confirm the intended widths.
plt.bar(e[:-1], hist, width=[5, 10, 15, 20], ec='k', align='edge', label='histogram')

In [None]:
# Scratch check of the bucket midpoints: the average of each pair of
# neighbouring edges, the same expression simulate() samples from.
bins = np.array([0, 5,  20,  60, 240, 500])
(bins[:-1] + bins[1:])/2