In [None]:
### This notebook is intended to let a new lab member quickly create the checkerboard plots that we use in the lab. 

## Once you enter the correct Beiwe ID, you should be able to create a somewhat decent looking pdf output by running the following in shell:

# jupyter nbconvert --to pdf --TemplateExporter.exclude_input=True  "data_volume_summaries_template.ipynb"

## Make sure you replace "data_volume_summaries_template.ipynb" with the name of the notebook if you changed the notebook's name.

## (the notebook name is the title of the notebook, so the report looks nicer if you change the name to something better)

In [None]:
import sys
#Use this cell if you've moved this notebook somewhere else
#sys.path.insert(0, "/path/to/repo/beiwe/code")

In [None]:
import data_summaries

In [None]:
# Load the keyring from "keyring_studies.py"; `kr` is later passed to
# get_data_summaries and download_data.
# NOTE(review): presumably this holds the Beiwe API credentials — confirm.
kr = data_summaries.read_keyring("keyring_studies.py")

In [None]:
# CSV path handed to get_data_summaries as its output file and read back by the
# later cells (pd.read_csv, get_num_users, data_volume_plots).
data_summaries_file_path = "data_volume.csv"

In [None]:
import os
study_id = "" ## put the study ID here

# Fail fast with a clear message when the placeholder above was never filled in,
# instead of passing a blank study ID on to get_data_summaries.
if not study_id:
    raise ValueError("study_id is empty: put the study ID in this cell before running the notebook")

# Request the data-volume summaries for this study; the result goes to
# data_summaries_file_path, which later cells read back.
data_summaries.get_data_summaries(study_id,
        output_file_path = data_summaries_file_path,
        keyring = kr)

In [None]:
# Folder passed to download_data and then searched recursively for CSV files.
download_folder = "raw_data"

In [None]:
# Request only the "identifiers" data stream for this study, using download_folder
# as the target; the following cell reads the CSVs found there to pick each
# participant's first registration record.
# NOTE(review): "#1st time" presumably means this only needs to run once per study — confirm.
from helper_functions import download_data
download_data(kr, study_id, download_folder, data_streams = ["identifiers"]) #1st time

In [None]:
import pandas as pd
import glob
import warnings
warnings.filterwarnings("ignore")

# Collect every CSV under the download folder. Sorting makes the read order
# (and therefore the tie-breaking below) independent of filesystem ordering.
all_files = sorted(glob.glob(download_folder + "/**/*.csv", recursive=True))

# pd.concat on an empty list raises an opaque "No objects to concatenate";
# report the actual problem instead.
if not all_files:
    raise FileNotFoundError(
        "No CSV files found under '" + str(download_folder) + "'; "
        "download the identifiers data stream first."
    )

lst = [pd.read_csv(filename, index_col=False) for filename in all_files]

# concatenate all identifier files, oldest record first; mergesort is stable,
# so records sharing a timestamp stay in file order
df = pd.concat(lst, axis=0, ignore_index=True).sort_values(by="timestamp", kind="mergesort")
# keep only the first registration record
df_wo_dup = df.drop_duplicates(subset=['patient_id'], keep = "first")

In [None]:
# Build {patient_id: registration date}. The date is the part of 'UTC time'
# before the "T" separator, kept as a string at this point.
utc_dates = df_wo_dup['UTC time'].map(lambda stamp: stamp.split('T')[0])
df_wo_dup['UTC time'] = utc_dates.tolist()
registration_date_dict = {
    patient: utc_date
    for patient, utc_date in zip(df_wo_dup['patient_id'], df_wo_dup['UTC time'])
}

In [None]:
# Read the summaries CSV and parse its 'date' column into datetimes so it can
# be compared against the registration dates.
data_summary = pd.read_csv(data_summaries_file_path).assign(
    date=lambda summary: pd.to_datetime(summary['date'])
)

In [None]:
## this will remove all the data before registration date
# Registration dates are stored as date strings; convert them to Timestamps so
# they compare against the datetime 'date' column. filter_data also does
# Timestamp arithmetic on these values.
registration_date_dict = {
    p_id: pd.to_datetime(registration_date)
    for p_id, registration_date in registration_date_dict.items()
}

# Collect each participant's rows on/after their registration date and
# concatenate once. Growing a DataFrame with concat inside the loop is
# quadratic and starts from a column-less empty frame.
rows_after_registration = [
    data_summary[(data_summary['participant_id'] == p_id) & (data_summary['date'] >= earliest_date)]
    for p_id, earliest_date in registration_date_dict.items()
]

# With no registered participants, keep data_summary's columns on the empty
# result so downstream column lookups still work.
if rows_after_registration:
    filtered_df = pd.concat(rows_after_registration)
else:
    filtered_df = data_summary.iloc[0:0]

filtered_df = filtered_df.reset_index(drop=True)

In [None]:
def filter_data(df, end_date = True, study_period_days = None, registration_dates = None):

    '''
        Trim each participant's rows so they stop at the end of their data collection.

        df: the "filtered_df" from above, already truncated to start at each
            participant's registration date. Needs 'participant_id' and 'date'
            columns; the columns at positions 2 through 68 are treated as the
            metric columns when deciding whether a day has any data.
        end_date: if True, keep rows up to each participant's last date with at
            least one non-missing metric; "study_period_days" is then ignored.
        study_period_days: integer number of days to keep, counting the
            registration date as day 1. Only used when "end_date" is False. The
            window is cut short at the participant's last date with data if
            that comes first; a participant with no data at all keeps the full
            window.
        registration_dates: optional dict of participant id -> registration
            Timestamp. Defaults to the notebook-level "registration_date_dict".

        Returns a new DataFrame with a fresh 0..n-1 index. When end_date is
        False and study_period_days is None, prints a notice and returns "df"
        unchanged.
    '''

    # A row "has data" when at least one metric column is non-missing.
    rows_with_data = df.dropna(subset=df.columns[2:69], how="all")
    last_date_by_participant = rows_with_data.groupby('participant_id')['date'].max()

    if end_date:
        last_dates = last_date_by_participant.rename('last_dates_with_data').reset_index()
        merged = df.merge(last_dates, on='participant_id', how='left')
        # Participants without any data get NaT here, so the comparison is
        # False and all of their rows are dropped.
        kept = merged[merged['date'] <= merged['last_dates_with_data']]
        return kept.drop(columns=['last_dates_with_data']).reset_index(drop=True)

    if study_period_days is None:
        print("No modification occurred.")
        return df

    if registration_dates is None:
        # Fall back to the dictionary built earlier in the notebook.
        registration_dates = registration_date_dict

    windows = []
    for p_id, earliest_date in registration_dates.items():
        participant_rows = df[df['participant_id'] == p_id]
        if participant_rows.empty:
            # Registered participant with no summary rows: nothing to keep.
            continue

        window_end = earliest_date + pd.Timedelta(days=study_period_days - 1)

        # consider if there are participants not finishing study collection yet
        last_date = last_date_by_participant.get(p_id, pd.NaT)
        if pd.notna(last_date):
            window_end = min(window_end, last_date)

        in_window = (participant_rows['date'] >= earliest_date) & (participant_rows['date'] <= window_end)
        windows.append(participant_rows[in_window])

    if not windows:
        # Keep df's columns even when no participant contributed any rows.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(windows).reset_index(drop=True)

In [None]:
# test case
# Example call: keep at most the first 2 days starting from each participant's
# registration date (filter_data also caps the window at the participant's last
# date with data). Change study_period_days to the real study length.
filtered_df_new = filter_data(filtered_df, end_date=False, study_period_days=2)
## or filtered_df_new = filter_data(filtered_df) if you just need the end date of data ever recorded

In [None]:
# saving file
# NOTE(review): the file name is spelled "truncted" (sic); left unchanged in case
# something else reads it — confirm before renaming.
# NOTE(review): the summary and plotting cells below read data_summaries_file_path,
# not this truncated file — confirm that is intended.
filtered_df_new.to_csv("data_volume_truncted.csv", index=False)

### Data Volume Summaries for Study

##### Here is the number of users with at least one day of data, by data stream

In [None]:
# Print the per-data-type user counts computed from the summaries CSV at
# data_summaries_file_path, using "Data Type" as the row index.
print(data_summaries.get_num_users(summaries_path = data_summaries_file_path).set_index("Data Type", drop = True))

### Summary Plots: X axis is time since study entry

In [None]:
# Data-volume plots with time since study entry on the x axis.
data_plots_path = "data_volume_plots"
study_time_plot_options = {
    "data_summaries_path": data_summaries_file_path,
    "output_dir": data_plots_path,
    # this needs to be true for the notebook to run
    "display_plots": True,
    # if this is False, a continuous data volume measurement will be put on the heatmaps
    "binary_heatmap": True,
    # whether to use study time instead of calendar time
    "plot_study_time": True,
    # whether to overlay survey submissions on top of data
    "overlay_surveys": True,
    # if you have a ton of users, don't include y labels so you can fit it on one page.
    "include_y_labels": False,
}
data_summaries.data_volume_plots(**study_time_plot_options)

### Summary Plots: X axis is calendar time

In [None]:
# Same plots as the study-time version, but with plot_study_time switched off
# so the x axis is calendar time.
data_plots_path = "data_volume_plots"
calendar_time_plot_options = {
    "data_summaries_path": data_summaries_file_path,
    "output_dir": data_plots_path,
    "display_plots": True,
    "binary_heatmap": True,
    "plot_study_time": False,
    "overlay_surveys": True,
    "include_y_labels": False,
}
data_summaries.data_volume_plots(**calendar_time_plot_options)