In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
import matplotlib.pyplot as plt
from functools import partial

import nomad.io.base as loader
import nomad.filters as filters
import nomad.stop_detection.grid_based as sd
import seaborn as sns

ModuleNotFoundError: No module named 'nomad'

In [2]:
def numerical_date_range(start_date: int, end_date: int):
    """List the calendar days between two YYYYMMDD integers, both endpoints included.

    Parameters
    ----------
    start_date, end_date : int
        Days encoded as YYYYMMDD integers (e.g. 20241126).

    Returns
    -------
    list of int
        One YYYYMMDD integer per calendar day, in ascending order.

    Raises
    ------
    ValueError
        If ``start_date`` falls after ``end_date`` (or a value is not a valid date).
    """
    def _as_date(yyyymmdd):
        # Positional slicing of the decimal text: 4-digit year, 2-digit month, rest is the day.
        text = str(yyyymmdd)
        return date(int(text[:4]), int(text[4:6]), int(text[6:]))

    first, last = _as_date(start_date), _as_date(end_date)
    if first > last:
        raise ValueError("start_date must not be after end_date")

    # Walk the proleptic ordinals so month/year rollovers are handled by `date` itself.
    return [
        int(date.fromordinal(ordinal).strftime("%Y%m%d"))
        for ordinal in range(first.toordinal(), last.toordinal() + 1)
    ]

In [3]:
# List the column names available in the raw baseline pings table.
baseline_columns = loader.table_columns(
    'BASELINE_PINGS_BY_NIGHTS',
    format='parquet',
)
baseline_columns

Index(['cuebiq_id', 'event_timestamp', 'event_zoned_datetime', 'lng', 'lat',
       'accuracy_meters', 'classification_type', 'zipcode_id', 'provider',
       'processing_date'],
      dtype='object')

## 1. Downsample and attach H3 cell and date (zipcode is already in the source table)

### baseline

In [None]:
# The same window in two encodings: YYYYMMDD ints (compared against `processing_date`)
# and ISO date strings (compared against `event_zoned_datetime`).
num_dates = numerical_date_range(20241126, 20250107)
dates = pd.date_range('2024-11-26', '2025-01-07').astype(str).tolist()

# Chunk boundaries: every 5th day (positions 0, 5, ..., 40) followed by the last day of the window.
skipped_dates = dates[:41:5] + dates[-1:]
skipped_dates_num = num_dates[:41:5] + num_dates[-1:]

# One list of (column, operator, value) filters per chunk; each chunk covers the
# half-open interval [boundary j, boundary j+1) on `event_zoned_datetime`.
filter_lists = [
    [
        ("processing_date", ">=", lower_num),
        ("event_zoned_datetime", ">=", lower),
        ("event_zoned_datetime", "<", upper),
    ]
    for lower_num, lower, upper in zip(skipped_dates_num, skipped_dates, skipped_dates[1:])
]

In [None]:
# Load the baseline pings one date chunk at a time (one filter list per chunk),
# downsample each chunk, tag it with an H3 cell and a calendar date, then combine the chunks.
results = []
for fltrs in filter_lists:
    chunk = loader.from_file('BASELINE_PINGS_BY_NIGHTS',
                             format='parquet',
                             latitude='lat',
                             longitude='lng',
                             timestamp='event_timestamp',
                             datetime='event_zoned_datetime',
                             filters=fltrs)
    if len(chunk) == 0:
        continue  # no rows matched this chunk's filters
    chunk = filters.downsample(chunk, periods=4, freq='min', timestamp='event_timestamp', user_id='cuebiq_id')
    chunk['h3_cell'] = filters.to_tessellation(chunk, index="h3", res=8, longitude="lng", latitude="lat")
    # First 10 characters of the stringified zoned datetime; the write cell partitions on this column.
    chunk['date'] = chunk.event_zoned_datetime.astype("str").str[:10]
    results.append(chunk)

# pd.concat([]) fails with a bare "No objects to concatenate"; report what actually happened instead.
if not results:
    raise ValueError("No pings loaded from 'BASELINE_PINGS_BY_NIGHTS': every date chunk was empty")
df = pd.concat(results, ignore_index=True)

In [None]:
# Write the processed baseline pings as parquet, partitioned on the `date` column.
loader.to_file(
    df,
    "./base_pings_h3/",
    format="parquet",
    partition_by=["date"],
    latitude="lat",
    longitude="lng",
    timestamp="event_timestamp",
)

### feb_apr

In [None]:
# The same window in two encodings: YYYYMMDD ints (compared against `processing_date`)
# and ISO date strings (compared against `event_zoned_datetime`).
num_dates = numerical_date_range(20250219, 20250402)
dates = pd.date_range('2025-02-19', '2025-04-02').astype(str).tolist()

# Chunk boundaries: every 5th day (positions 0, 5, ..., 40) followed by the last day of the window.
skipped_dates = dates[:41:5] + dates[-1:]
skipped_dates_num = num_dates[:41:5] + num_dates[-1:]

# One list of (column, operator, value) filters per chunk; each chunk covers the
# half-open interval [boundary j, boundary j+1) on `event_zoned_datetime`.
filter_lists = [
    [
        ("processing_date", ">=", lower_num),
        ("event_zoned_datetime", ">=", lower),
        ("event_zoned_datetime", "<", upper),
    ]
    for lower_num, lower, upper in zip(skipped_dates_num, skipped_dates, skipped_dates[1:])
]

In [None]:
# Load the Feb-Apr pings one date chunk at a time (one filter list per chunk),
# downsample each chunk, tag it with an H3 cell and a calendar date, then combine the chunks.
results = []
for fltrs in filter_lists:
    chunk = loader.from_file('FEB_APR_PINGS_BY_NIGHTS',
                             format='parquet',
                             latitude='lat',
                             longitude='lng',
                             timestamp='event_timestamp',
                             datetime='event_zoned_datetime',
                             filters=fltrs)
    if len(chunk) == 0:
        continue  # no rows matched this chunk's filters
    chunk = filters.downsample(chunk, periods=4, freq='min', timestamp='event_timestamp', user_id='cuebiq_id')
    chunk['h3_cell'] = filters.to_tessellation(chunk, index="h3", res=8, longitude="lng", latitude="lat")
    # First 10 characters of the stringified zoned datetime; the write cell partitions on this column.
    chunk['date'] = chunk.event_zoned_datetime.astype("str").str[:10]
    results.append(chunk)

# pd.concat([]) fails with a bare "No objects to concatenate"; report what actually happened instead.
if not results:
    raise ValueError("No pings loaded from 'FEB_APR_PINGS_BY_NIGHTS': every date chunk was empty")
df = pd.concat(results, ignore_index=True)

In [None]:
# Write the processed Feb-Apr pings as parquet, partitioned on the `date` column.
loader.to_file(
    df,
    "./feb_apr_pings_h3/",
    format="parquet",
    partition_by=["date"],
    latitude="lat",
    longitude="lng",
    timestamp="event_timestamp",
)

### jan_feb

In [None]:
# The same window in two encodings: YYYYMMDD ints (compared against `processing_date`)
# and ISO date strings (compared against `event_zoned_datetime`).
num_dates = numerical_date_range(20250112, 20250223)
dates = pd.date_range('2025-01-12', '2025-02-23').astype(str).tolist()

# Chunk boundaries: every 5th day (positions 0, 5, ..., 40) followed by the last day of the window.
skipped_dates = dates[:41:5] + dates[-1:]
skipped_dates_num = num_dates[:41:5] + num_dates[-1:]

# One list of (column, operator, value) filters per chunk; each chunk covers the
# half-open interval [boundary j, boundary j+1) on `event_zoned_datetime`.
filter_lists = [
    [
        ("processing_date", ">=", lower_num),
        ("event_zoned_datetime", ">=", lower),
        ("event_zoned_datetime", "<", upper),
    ]
    for lower_num, lower, upper in zip(skipped_dates_num, skipped_dates, skipped_dates[1:])
]

In [None]:
# Load the Jan-Feb pings one date chunk at a time (one filter list per chunk),
# downsample each chunk, tag it with an H3 cell and a calendar date, then combine the chunks.
results = []
for fltrs in filter_lists:
    chunk = loader.from_file('JAN_FEB_PINGS_BY_NIGHTS',
                             format='parquet',
                             latitude='lat',
                             longitude='lng',
                             timestamp='event_timestamp',
                             datetime='event_zoned_datetime',
                             filters=fltrs)
    if len(chunk) == 0:
        continue  # no rows matched this chunk's filters
    chunk = filters.downsample(chunk, periods=4, freq='min', timestamp='event_timestamp', user_id='cuebiq_id')
    chunk['h3_cell'] = filters.to_tessellation(chunk, index="h3", res=8, longitude="lng", latitude="lat")
    # First 10 characters of the stringified zoned datetime; the write cell partitions on this column.
    chunk['date'] = chunk.event_zoned_datetime.astype("str").str[:10]
    results.append(chunk)

# pd.concat([]) fails with a bare "No objects to concatenate"; report what actually happened instead.
if not results:
    raise ValueError("No pings loaded from 'JAN_FEB_PINGS_BY_NIGHTS': every date chunk was empty")
df = pd.concat(results, ignore_index=True)

In [None]:
# Write the processed Jan-Feb pings as parquet, partitioned on the `date` column.
loader.to_file(
    df,
    "./jan_feb_pings_h3/",
    format="parquet",
    partition_by=["date"],
    latitude="lat",
    longitude="lng",
    timestamp="event_timestamp",
)

## 2. Produce plots of completeness

In [None]:
# Column-name mapping passed as `traj_cols` to the nomad loader/filter calls below:
# generic trajectory field -> column name in this dataset.
tc = {
    'latitude': 'lat',
    'longitude': 'lng',
    'timestamp': 'event_timestamp',
    'datetime': 'event_zoned_datetime',
    'date': 'date',
    'user_id': 'cuebiq_id',
}

### baseline period

In [None]:
# Read back the baseline pings dataset written in section 1.
df = loader.from_file('base_pings_h3', format='parquet', traj_cols=tc)

In [None]:
# Hourly coverage matrix for the baseline pings (1-hour bins).
Q = filters.coverage_matrix(
    df,
    periods=1,
    freq='h',
    traj_cols=tc,
    str_from_time=True,
)

In [None]:
# Inspect the row labels of the coverage matrix.
# `order` is first assigned in the following cell, so referencing it here raised
# NameError on a fresh Restart & Run All; only Q is inspected at this point.
print(Q.index)

In [None]:
# Baseline hourly coverage heatmap, rows sorted by each row's mean coverage.
Q = filters.coverage_matrix(df, periods=1, freq='h', traj_cols=tc, str_from_time=True)
completeness_hourly = Q.mean(axis=1)

# `order` holds integer row positions, indexed by the unsorted row labels.
# The jan-feb and feb-apr heatmaps reuse it so their rows line up with this one.
order = completeness_hourly.argsort(-1)
Q = Q.iloc[order]
Q.columns = pd.to_datetime(Q.columns)  # string column labels -> datetimes for the x axis

fig, ax = plt.subplots(figsize=(12, 3))
ax.pcolormesh(
    Q.columns, range(len(Q)), Q.values,
    cmap='Blues', shading='auto',
)
ax.set(yticks=[], ylabel='User', title='Activity each hour (coverage matrix)')
plt.show()

In [None]:
#fig.savefig('completeness_matrix_baseline.svg', format='svg')

In [None]:
# Completeness of the baseline pings at three temporal resolutions.
q_15m = filters.completeness(df, periods=15, freq='min', traj_cols=tc)
q_hourly = filters.completeness(df, periods=1, freq='h', traj_cols=tc)
q_daily = filters.completeness(df, periods=1, freq='d', traj_cols=tc)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(12, 3))

# One KDE panel per resolution, each with a dashed vertical line at the median.
panels = (
    (ax1, q_15m, 'Completeness (15 min)'),
    (ax2, q_hourly, 'Completeness (Hourly)'),
    (ax3, q_daily, 'Completeness (Daily)'),
)
for panel, values, panel_title in panels:
    sns.kdeplot(values, ax=panel, bw_adjust=0.25, fill=True)
    panel.axvline(values.median(), color='k', linestyle='--', linewidth=1)
    panel.set_title(panel_title)
    panel.set_xlim((0,1))
    panel.grid()

# The y axis is shared, so only the left-most panel is labelled.
ax1.set_ylabel('Density')
plt.show()

In [None]:
#fig.savefig('completeness_density_baseline.svg', format='svg')

### jan-feb

In [None]:
# Read back the Jan-Feb pings dataset written in section 1.
df = loader.from_file('jan_feb_pings_h3', format='parquet', traj_cols=tc)

In [None]:
# Jan-Feb hourly coverage heatmap, laid out on the baseline's rows.
Q = filters.coverage_matrix(df, periods=1, freq='h', traj_cols=tc, str_from_time=True)

# Align rows to the baseline row labels (`order.index`), zero-fill rows missing from this
# period, then apply the baseline sort order so both heatmaps share the same row layout.
Q = Q.reindex(order.index)
Q = Q.fillna(0)
Q = Q.iloc[order]
Q.columns = pd.to_datetime(Q.columns)  # string column labels -> datetimes for the x axis

fig, ax = plt.subplots(figsize=(15, 4))
ax.pcolormesh(
    Q.columns, range(len(Q)), Q.values,
    cmap='Oranges', shading='auto',
)
ax.set(yticks=[], ylabel='User', title='Activity each hour (coverage matrix)')
plt.show()

In [None]:
# Save the Jan-Feb coverage heatmap (the current `fig`) as an SVG in the working directory.
fig.savefig('completeness_matrix_jan_feb.svg', format='svg')

In [None]:
# Completeness of the Jan-Feb pings at three temporal resolutions, re-indexed onto the
# baseline row labels (`order.index`); entries with no Jan-Feb value are set to 0.
q_15m = filters.completeness(df, periods=15, freq='min', traj_cols=tc).reindex(order.index).fillna(0)
q_hourly = filters.completeness(df, periods=1, freq='h', traj_cols=tc).reindex(order.index).fillna(0)
q_daily = filters.completeness(df, periods=1, freq='d', traj_cols=tc).reindex(order.index).fillna(0)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(12, 3))

# One KDE panel per resolution, each with a dashed vertical line at the median.
panels = (
    (ax1, q_15m, 'Completeness (15 min)'),
    (ax2, q_hourly, 'Completeness (Hourly)'),
    (ax3, q_daily, 'Completeness (Daily)'),
)
for panel, values, panel_title in panels:
    sns.kdeplot(values, ax=panel, bw_adjust=0.25, fill=True, color="orange")
    panel.axvline(values.median(), color='k', linestyle='--', linewidth=1)
    panel.set_title(panel_title)
    panel.set_xlim((0,1))
    panel.grid()

# The y axis is shared, so only the left-most panel is labelled.
ax1.set_ylabel('Density')
plt.show()

In [None]:
#fig.savefig('completeness_density_jan_feb.svg', format='svg')

### feb-apr

In [None]:
# Read back the Feb-Apr pings dataset written in section 1.
df = loader.from_file('feb_apr_pings_h3', format='parquet', traj_cols=tc)

In [None]:
# Feb-Apr hourly coverage heatmap, laid out on the baseline's rows.
Q = filters.coverage_matrix(df, periods=1, freq='h', traj_cols=tc, str_from_time=True)

# Align rows to the baseline row labels (`order.index`), zero-fill rows missing from this
# period, then apply the baseline sort order so both heatmaps share the same row layout.
Q = Q.reindex(order.index)
Q = Q.fillna(0)
Q = Q.iloc[order]
Q.columns = pd.to_datetime(Q.columns)  # string column labels -> datetimes for the x axis

fig, ax = plt.subplots(figsize=(15, 4))
ax.pcolormesh(
    Q.columns, range(len(Q)), Q.values,
    cmap='Greens', shading='auto',
)
ax.set(yticks=[], ylabel='User', title='Activity each hour (coverage matrix)')
plt.show()

In [None]:
# Save the Feb-Apr coverage heatmap (the current `fig`) as an SVG in the working directory.
fig.savefig('completeness_matrix_feb_apr.svg', format='svg')

In [None]:
# Completeness of the Feb-Apr pings at three temporal resolutions, re-indexed onto the
# baseline row labels (`order.index`); entries with no Feb-Apr value are set to 0.
q_15m = filters.completeness(df, periods=15, freq='min', traj_cols=tc).reindex(order.index).fillna(0)
q_hourly = filters.completeness(df, periods=1, freq='h', traj_cols=tc).reindex(order.index).fillna(0)
q_daily = filters.completeness(df, periods=1, freq='d', traj_cols=tc).reindex(order.index).fillna(0)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(12, 3))

# One KDE panel per resolution, each with a dashed vertical line at the median.
panels = (
    (ax1, q_15m, 'Completeness (15 min)'),
    (ax2, q_hourly, 'Completeness (Hourly)'),
    (ax3, q_daily, 'Completeness (Daily)'),
)
for panel, values, panel_title in panels:
    sns.kdeplot(values, ax=panel, bw_adjust=0.25, fill=True, color="green")
    panel.axvline(values.median(), color='k', linestyle='--', linewidth=1)
    panel.set_title(panel_title)
    panel.set_xlim((0,1))
    panel.grid()

# The y axis is shared, so only the left-most panel is labelled.
ax1.set_ylabel('Density')
plt.show()

In [None]:
# Save the Feb-Apr completeness density figure (the current `fig`) as an SVG in the working directory.
fig.savefig('completeness_density_feb_apr.svg', format='svg')

## 3. Stop detection & H3 attribution

In [4]:
from nomad.stop_detection import lachesis

# Column-name mapping passed as `traj_cols` to the loader and stop-detection calls below:
# generic trajectory field -> column name in this dataset.
tc = {
    'latitude': 'lat',
    'longitude': 'lng',
    'timestamp': 'event_timestamp',
    'datetime': 'event_zoned_datetime',
    'date': 'date',
    'user_id': 'cuebiq_id',
}

### baseline period

In [5]:
# Load the baseline pings dataset as input for stop detection.
df = loader.from_file('base_pings_h3', format='parquet', traj_cols=tc)



In [6]:
# Run Lachesis stop detection per user on the baseline pings.
# NOTE(review): the units of dt_max / delta_roam / dur_min are not visible in this
# notebook; confirm them against the nomad documentation.
stops = lachesis.lachesis_per_user(
    df,
    traj_cols=tc,
    dt_max=480,
    delta_roam=50,
    dur_min=5,
    complete_output=True,
    # Columns named here are requested to be carried over from the pings into `stops`.
    passthrough_cols=["classification_type", "tz_offset", "date"],
)

  return pd.concat(results, ignore_index=True)


In [7]:
# Write the baseline stops as parquet, partitioned on the `date` column.
loader.to_file(
    stops,
    "./base_stops/",
    format="parquet",
    partition_by=["date"],
    traj_cols=tc,
)



### jan-feb

In [8]:
# Load the Jan-Feb pings dataset as input for stop detection.
df = loader.from_file('jan_feb_pings_h3', format='parquet', traj_cols=tc)



In [9]:
%%time
stops = lachesis.lachesis_per_user(
            df,
            dt_max=480,
            delta_roam=50,
            dur_min=5,
            complete_output=True,
            passthrough_cols=["classification_type", "tz_offset", "date"],
            traj_cols=tc,
        )

CPU times: user 4min 25s, sys: 99.9 ms, total: 4min 25s
Wall time: 4min 25s


  return pd.concat(results, ignore_index=True)


In [10]:
# Write the Jan-Feb stops as parquet, partitioned on `date` so the on-disk layout
# matches the baseline stops and the pings datasets written earlier in this notebook.
loader.to_file(
    stops,
    "./jan_feb_stops/",
    format="parquet",
    partition_by=["date"],
    traj_cols=tc,
)



### feb-apr

In [11]:
# Load the Feb-Apr pings dataset as input for stop detection.
df = loader.from_file('feb_apr_pings_h3', format='parquet', traj_cols=tc)



In [12]:
%%time
stops = lachesis.lachesis_per_user(
            df,
            dt_max=480,
            delta_roam=50,
            dur_min=5,
            complete_output=True,
            passthrough_cols=["classification_type", "tz_offset", "date"],
            traj_cols=tc,
        )

CPU times: user 3min 47s, sys: 76 ms, total: 3min 47s
Wall time: 3min 47s


  return pd.concat(results, ignore_index=True)


In [13]:
# Write the Feb-Apr stops as parquet, partitioned on `date` so the on-disk layout
# matches the baseline stops and the pings datasets written earlier in this notebook.
loader.to_file(
    stops,
    "./feb_apr_stops/",
    format="parquet",
    partition_by=["date"],
    traj_cols=tc,
)

