In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
import matplotlib.pyplot as plt
from functools import partial

import nomad.io.base as loader
from nomad.filters import _compute_q_stat
import nomad.stop_detection.grid_based as sd
import seaborn as sns

In [None]:
def _generate_Q_matrix(traj, traj_cols):
    '''
    generate Q daily matrix
    '''
    df = traj.copy()
    #Compute number of complete users over each window with 1-day timestep
    df['date_hour'] = df[traj_cols['datetime']].dt.floor('h')
    df['date'] = df[traj_cols['datetime']].dt.date
    df_downsampled_1hour = df[[traj_cols['user_id'],'date','date_hour']].drop_duplicates()
    df_date_nhours = df_downsampled_1hour.groupby([traj_cols['user_id'],'date']).size().reset_index()
    df_date_nhours.rename(columns = {0:'nhours'}, inplace = True)
    df_date_nhours['perc_hours'] = df_date_nhours['nhours']/24
    Q = df_date_nhours.pivot(index = 'date', 
                             columns = traj_cols['user_id'], 
                             values = 'perc_hours').fillna(0)
    return Q


def _compute_mean_q(Q, date, SW_width_days):
    '''
    compute average q over a single sliding window iteration 
    that is for a given day, take the window [day, day + SW_width_days] and compute individual q mean for all users
    '''
    #select index corresponding to the date
    i = np.argwhere(Q.index==date).ravel()[0]
    #compute the q mean over the specific sliding window iteration
    return Q[i:i+SW_width_days].mean(axis=0)

def numerical_date_range(start_date: int, end_date: int):
    """Return every calendar day from start_date to end_date inclusive, each as an int YYYYMMDD.

    Parameters
    ----------
    start_date, end_date : int
        Dates written as eight digits, YYYYMMDD (anything whose ``str()`` is
        such an eight-digit string is accepted).

    Returns
    -------
    list of int
        One YYYYMMDD integer per day, in chronological order.

    Raises
    ------
    ValueError
        If either argument is not exactly eight digits, is not a real
        calendar date, or if start_date is after end_date.
    """
    def _parse(value, label):
        text = str(value)
        # Without the length check, slicing silently misreads malformed input
        # (e.g. 202411011 would be parsed as 2024-11-11).
        if len(text) != 8 or not text.isdigit():
            raise ValueError(f"{label} must be an 8-digit YYYYMMDD value, got {value!r}")
        return date(int(text[:4]), int(text[4:6]), int(text[6:]))

    start_dt = _parse(start_date, "start_date")
    end_dt = _parse(end_date, "end_date")
    if start_dt > end_dt:
        raise ValueError("start_date must not be after end_date")
    span = (end_dt - start_dt).days
    return [int((start_dt + timedelta(days=i)).strftime("%Y%m%d")) for i in range(span + 1)]

In [None]:
# Inspect the parquet table before any data is loaded; as the last expression
# of the cell, the return value is displayed.
# NOTE(review): presumably this lists the table's column names/schema — confirm
# against nomad.io.base.table_columns.
loader.table_columns('BASELINE_PINGS_BY_NIGHTS', format='parquet')

In [None]:
# Study period (YYYYMMDD) and the number of days covered by each load window.
START_DATE = 20241126
END_DATE = 20250113
CHUNK_DAYS = 5

# Integer dates, used for the `processing_date` filter.
num_dates = numerical_date_range(START_DATE, END_DATE)
# ISO date strings for the same days, derived from num_dates so the two
# representations cannot drift apart.
dates = pd.to_datetime([str(d) for d in num_dates], format='%Y%m%d').strftime('%Y-%m-%d').tolist()

# Window boundaries: every CHUNK_DAYS-th day plus the final day of the range.
boundary_idx = list(range(0, len(dates) - 1, CHUNK_DAYS)) + [len(dates) - 1]
skipped_dates = [dates[i] for i in boundary_idx]
skipped_dates_num = [num_dates[i] for i in boundary_idx]

# One filter list per window [lower, upper) on event_zoned_datetime.
# NOTE(review): the upper bound is exclusive, so no window appears to cover
# pings on END_DATE itself — confirm this is intended.
filter_lists = [
    [
        ("processing_date", ">=", lower_num),
        ("event_zoned_datetime", ">=", lower),
        ("event_zoned_datetime", "<", upper),
    ]
    for lower_num, lower, upper in zip(skipped_dates_num, skipped_dates, skipped_dates[1:])
]

In [None]:
# Build one completeness matrix per date window, loading the pings for one
# window at a time.
Q_matrices = []
for window_filters in filter_lists:
    df = loader.from_file(
        'BASELINE_PINGS_BY_NIGHTS',
        format='parquet',
        latitude='lat',
        longitude='lng',
        timestamp='event_timestamp',
        datetime='event_zoned_datetime',
        filters=window_filters,
    )
    window_Q = _generate_Q_matrix(
        df,
        traj_cols={'datetime': 'event_zoned_datetime', 'user_id': 'cuebiq_id'},
    )
    Q_matrices.append(window_Q)

In [None]:
# Stack the per-window matrices row-wise (dates) into one matrix. Users that
# are missing from a window come out of the concat as NaN in that window's
# rows; fillna(0) records them as zero completeness.
Q = pd.concat(Q_matrices).fillna(0)

In [None]:
# Number of date rows actually present in Q. Deriving it from Q (rather than
# hard-coding it) keeps the denominator consistent with Q.mean() below and
# guarantees q_daily cannot exceed 100%.
num_days = len(Q.index)

# Per-user mean hourly completeness over all days, in percent.
q_hourly = Q.mean(axis=0) * 100
# Per-user share of days with at least one hour of data, in percent.
q_daily = ((Q > 0.0).sum(axis=0) / num_days) * 100

In [None]:
# Q is the date x user completeness matrix; q_daily and q_hourly are the
# per-user percentages of days / hours with data. Users are ordered by q_daily
# (descending) so the heatmap rows run from most to least complete.
sorted_cols = q_daily.sort_values(ascending=False).index
Qs = Q[sorted_cols]
binary = Qs.gt(0).astype(int)
cmap = plt.cm.Blues

fig, axes = plt.subplots(
    2, 2,
    figsize=(8, 8),
    gridspec_kw={'height_ratios': [1, 2.5]},
    sharey='row'
)

# marginal histograms (top row), each with a dashed red line at the median
for ax, data, label in [
    (axes[0, 0], q_daily, '% days with data'),
    (axes[0, 1], q_hourly, '% hours with data'),
]:
    ax.hist(data, bins=46)
    ax.set_xlabel(label)
    ax.axvline(np.median(data), linestyle='--', color='red', lw=1)
# the top row shares its y axis, so one label on the left panel is enough
axes[0, 0].set_ylabel('Number of users')

# binary heatmap: does the user have any data on that day?
sns.heatmap(
    binary.T,
    ax=axes[1, 0],
    cmap=cmap,
    vmin=0, vmax=1,
    cbar=False,
    xticklabels=False,
    yticklabels=False
)
axes[1, 0].set_ylabel('User')

# continuous heatmap: fraction of hours with data on that day
sns.heatmap(
    Qs.T,
    ax=axes[1, 1],
    cmap=cmap,
    vmin=0, vmax=1,
    cbar_kws={'label': 'Completeness'},
    xticklabels=False,
    yticklabels=False
)
axes[1, 1].set_ylabel('')

# set date ticks on bottom row
dates = pd.to_datetime(Qs.index)
tick_locs = np.linspace(0, len(dates) - 1, 6, dtype=int)
tick_lbls = dates.strftime('%m-%d')[tick_locs]
for ax in [axes[1, 0], axes[1, 1]]:
    # heatmap column i spans [i, i + 1] on the x axis, so +0.5 puts the tick
    # at the centre of the column it labels instead of on its left edge
    ax.set_xticks(tick_locs + 0.5)
    ax.set_xticklabels(tick_lbls, rotation=45, ha='right')

plt.show()

# Home attribution

In [None]:
# Mapping from the generic trajectory field names to this table's columns.
traj_cols = dict(
    user_id='cuebiq_id',
    latitude='lat',
    longitude='lng',
    timestamp='event_timestamp',
    datetime='event_zoned_datetime',
)

# Sample users from the table (size=1.0) and split them into 10 chunks so the
# pings can be loaded chunk by chunk.
users = loader.sample_users(
    'BASELINE_PINGS_BY_NIGHTS',
    format='parquet',
    size=1.0,
    traj_cols=traj_cols,
)
user_chunks = np.array_split(users.values, 10)

# Peek at the first chunk.
user_chunks[0]

In [None]:
# Per-user stop-detection function. It is defined here so this cell runs on a
# fresh kernel; the notebook otherwise only assigns it in the cell after this
# one, which made the loop fail with NameError on Restart & Run All.
grid_based_sd = partial(sd.grid_based_labels, traj_cols=traj_cols, location_id='zipcode_id')

# Run stop detection one user chunk at a time and keep every chunk's result.
all_stops = []
for user_chunk in user_chunks:
    df = loader.sample_from_file(
        'BASELINE_PINGS_BY_NIGHTS',
        format='parquet',
        users=user_chunk,
        traj_cols=traj_cols)
    stops = df.groupby('cuebiq_id').apply(grid_based_sd)
    # Previously `stops` was overwritten on every iteration and `all_stops`
    # stayed empty, so only the last chunk's result survived.
    all_stops.append(stops)

In [None]:
# Per-user stop-detection callable: sd.grid_based_labels with the column
# mapping and location_id='zipcode_id' pre-bound, so it can be passed straight
# to groupby(...).apply().
# NOTE(review): the chunk loop in the preceding cell references
# `grid_based_sd`, so this assignment has to be executed before that loop.
grid_based_sd = partial(sd.grid_based_labels, traj_cols=traj_cols, location_id='zipcode_id')

In [None]:
# Apply stop detection per user and flatten the group index into columns.
# NOTE(review): `df` is whatever frame was loaded last (the final user chunk
# of the chunk loop), so this result covers only that chunk — confirm intended.
stops = df.groupby('cuebiq_id').apply(grid_based_sd).reset_index()

In [None]:
# Display the sampled users returned by loader.sample_users.
users