# This notebook lets a lab member clean the data summary CSV by removing dates on which participants were not registered

## This notebook uses the existing APIs to post-process data summary CSV results, truncating them to the dates on which participants were registered


In [None]:
from datetime import datetime
import sys
#!{sys.executable} -m pip install mano

# You need 3 third-party libraries installed to run this script, run `pip install orjson requests pandas`
# requests is a (fantastic) library for making http requests
# orjson is a highly optimized library for parsing json, I assure you, you will want to use it.
# pandas provides the DataFrame operations used in the cells below.
import orjson
import requests
import data_summaries
import pandas as pd
from pandas import json_normalize 
from helper_functions import call_api

In [None]:
# Read the keyring file; the access/secret keys are looked up from the result via .get() below.
keyring_file = "keyring_studies.py"
kr = data_summaries.read_keyring(keyring_file)

In [None]:
# Your Beiwe access keys, taken from the keyring entries "ACCESS_KEY" and "SECRET_KEY".
access_key, secret_key = (kr.get(entry) for entry in ("ACCESS_KEY", "SECRET_KEY"))

# Enter the ID of the study to summarize (left blank here), and
# the top-level component of your Beiwe server.
beiwe_study_id = ""
MY_BEIWE_SERVER = "https://studies.beiwe.org"

In [None]:
# Summary-statistics endpoint of the Beiwe server.
#
# The endpoint takes many parameters and has one required parameter, study_id.
# It is identical to the Tableau API endpoint. It returns a JSON list of
# dictionaries, based on the query parameters. The query parameters are:
#   `end_date`         a date of the form YYYY-MM-DD that specifies the last date to include in the search.
#   `start_date`       a date of the form YYYY-MM-DD that specifies the first date to include in the search.
#   `fields`           a comma-separated list of all specific summary statistic fields to return.
#                      Providing no value for fields will return all fields.
#   `limit`            an integer that specifies the maximum number of data points to return.
#   `ordered_by`       a field name that specifies the parameter to sort the output by.
#   `order_direction`  either "ascending" or "descending"; specifies the order to sort in.
TARGET_ENDPOINT_URL = MY_BEIWE_SERVER + "/get-summary-statistics/v1"

In [None]:
# Request the study's summary statistics through the call_api helper.
# NOTE(review): the following cells use the result as a pandas DataFrame with
# 'participant_id' and 'date' columns — confirm against helper_functions.call_api.
summary_data = call_api(
    TARGET_ENDPOINT_URL,
    beiwe_study_id,
    access_key,
    secret_key,
)

In [None]:
# Find the first and last data collection dates for each participant and truncate to those dates.
# A row counts as "data collected" when at least one column whose name contains
# 'bytes' is non-null for that row.

# Re-running this cell must not merge the date bounds a second time: that would
# create *_x / *_y suffixed columns and break the comparison further down.
# Dropping any stale bound columns first keeps the cell idempotent.
summary_data = summary_data.drop(
    columns=['first_valid_date', 'last_valid_date'], errors='ignore'
)

bytes_columns = summary_data.columns[summary_data.columns.str.contains('bytes')]
# Vectorized null check (notnull is just an alias of notna, so one call suffices).
valid_rows = summary_data[bytes_columns].notna().any(axis=1)
filtered_df = summary_data[valid_rows]
valid_dates = (
    filtered_df.groupby('participant_id')['date']
    .agg(['min', 'max'])
    .reset_index()
    .rename(columns={'min': 'first_valid_date', 'max': 'last_valid_date'})
)

# Filter out any dates outside of the acceptable range for each individual.
# The default inner merge also removes participants that have no valid rows at all.
summary_data = pd.merge(summary_data, valid_dates, on='participant_id')
summary_filtered = summary_data[
    (summary_data['date'] >= summary_data['first_valid_date']) &
    (summary_data['date'] <= summary_data['last_valid_date'])
]


In [None]:
# Tidy the truncated summary for display and export.
# errors='ignore' keeps this cell re-runnable: on a second run the helper
# columns have already been removed and a plain drop would raise KeyError.
summary_filtered = summary_filtered.drop(
    columns=['first_valid_date', 'last_valid_date'], errors='ignore'
)
# Sort first and renumber afterwards so the index runs 0..n-1 in sorted order;
# renumbering before the sort would leave the displayed index shuffled.
summary_filtered = summary_filtered.sort_values(['participant_id', 'date'], ascending=[True, True])
summary_filtered = summary_filtered.reset_index(drop=True)
summary_filtered

In [None]:
# Save the truncated summary as a CSV named after the study ID, without the index column.
output_csv = f"data_summary_{beiwe_study_id}.csv"
summary_filtered.to_csv(output_csv, index=False)