In [None]:
%%capture
import os
from pathlib import Path

from dj_notebook import activate

# Runtime configuration comes from environment variables; a missing variable
# raises KeyError immediately rather than failing later in the notebook.
env_file = os.environ["META_ENV"]
reports_folder, analysis_folder, pharmacy_folder = (
    Path(os.environ[var_name])
    for var_name in (
        "META_REPORTS_FOLDER",
        "META_ANALYSIS_FOLDER",
        "META_PHARMACY_FOLDER",
    )
)
# Activate dj_notebook using the dotenv file named by META_ENV.
plus = activate(dotenv_file=env_file)

In [None]:
from datetime import date

import numpy as np
import pandas as pd
from django.utils import timezone
from django_pandas.io import read_frame
from edc_model_to_dataframe.read_frame_edc import read_frame_edc
from edc_registration.models import RegisteredSubject

from meta_prn.models import EndOfStudy, OffSchedule
from meta_rando.models import RandomizationList
from meta_subject.models import FollowupVitals

In [None]:
# Registered subjects: identifier, gender and date of birth.
df_rs = read_frame(
    RegisteredSubject.objects.values("subject_identifier", "gender", "dob").all(),
    verbose=False,
)
df_rs["dob"] = pd.to_datetime(df_rs["dob"])

# Age in completed years as of today. A plain difference of calendar years
# overstates the age by one for anyone whose birthday has not yet occurred
# this year, so subtract one in that case. Rows with a missing dob stay NaN.
today = pd.to_datetime(date.today())
birthday_pending = (df_rs["dob"].dt.month > today.month) | (
    (df_rs["dob"].dt.month == today.month) & (df_rs["dob"].dt.day > today.day)
)
age = today.year - df_rs["dob"].dt.year - birthday_pending.astype(int)
df_rs["age_in_years"] = age

# Follow-up vitals, end-of-study and off-schedule records, and the randomization list.
df_vitals = read_frame_edc(FollowupVitals.objects.all())
df_eos = read_frame_edc(EndOfStudy.objects.all())
df_offschedule = read_frame_edc(OffSchedule.objects.all())
df_rando = read_frame(
    RandomizationList.objects.values(
        "subject_identifier", "sid", "assignment", "randomizer_name"
    ).all(),
    verbose=False,
)

In [None]:
from meta_spfq.constants import GTE_35__LTE_49, GTE_50, LT_35

# Attach the randomization sid and assignment to each registered subject.
# Dropping any previously merged copies first keeps this cell re-runnable
# (a second merge would otherwise produce sid_x/sid_y, assignment_x/assignment_y).
df_rs = df_rs.drop(columns=["sid", "assignment"], errors="ignore").merge(
    df_rando[["subject_identifier", "sid", "assignment"]],
    on="subject_identifier",
    how="left",
)

# merge registered subject with vitals (inner join: only subjects with vitals remain)
df = pd.merge(df_rs, df_vitals, on="subject_identifier")

# select the last record for each subject
df["report_datetime"] = pd.to_datetime(df["report_datetime"])
df = df.sort_values(by=["subject_identifier", "report_datetime"])
df = df.drop_duplicates(subset="subject_identifier", keep="last")

# drop any subjects off study
df = df[~df["subject_identifier"].isin(df_eos["subject_identifier"].unique())]

# drop any subjects off-schedule (but still on study); take an explicit copy so
# the column assignments below do not raise SettingWithCopyWarning
df = df[
    ~df["subject_identifier"].isin(df_offschedule["subject_identifier"].unique())
].copy()

# cut/bin weight into left-closed intervals: [-inf, 35), [35, 50), [50, inf)
df["weight"] = pd.to_numeric(df["weight"], errors="coerce")
bins = [-np.inf, 35, 50, np.inf]
labels = [LT_35, GTE_35__LTE_49, GTE_50]
df["weight_bin"] = pd.cut(df["weight"], bins=bins, labels=labels, right=False)


In [None]:
# preview the identifying columns and the stratification variables
df[
    [
        "subject_identifier",
        "assignment",
        "visit_code",
        "report_datetime",
        "site_id",
        "gender",
        "weight",
        "weight_bin",
    ]
]


In [None]:
# sample 30% of subjects within each stratum (fixed seed so the draw is repeatable)
stratified_sample = (
    df.groupby(
        ["assignment", "site_id", "gender", "weight_bin", "age_in_years"],
        group_keys=False,
        observed=True,
    )
    .sample(frac=0.30, random_state=981)
)

In [None]:
# inspect the stratified sample drawn above
stratified_sample

In [None]:
# randomize the stratified sample by site: draw 20 rows per site with a fixed seed,
# then replace the index with a fresh 0..n-1 range (the old index is kept as a column)
# NOTE(review): sampling n=20 without replacement needs at least 20 rows per
# site in stratified_sample -- confirm every site has enough.
randomized_df = (
    stratified_sample.groupby("site_id", group_keys=False, observed=True)
    .sample(n=20, random_state=876)
    .reset_index()
)

In [None]:
# add sid column: list ids start at 42000 and follow the fresh positional index;
# this overwrites the sid merged in from the randomization list
randomized_df["sid"] = randomized_df.index + 42000

# timezone-aware UTC stamp, consistent with the other list-generation cells
# (a naive local timestamp is ambiguous once exported)
randomized_df["date_generated"] = pd.Timestamp.now(tz="UTC")

randomized_df = randomized_df.rename(
    columns={"visit_code": "last_visit_code", "report_datetime": "last_appt_datetime"}
)

# NOTE(review): assumes naive report datetimes are UTC, as the method 2 cell does -- confirm.
# The guard keeps this cell re-runnable once the column is already tz-aware.
if randomized_df["last_appt_datetime"].dt.tz is None:
    randomized_df["last_appt_datetime"] = randomized_df[
        "last_appt_datetime"
    ].dt.tz_localize("UTC")


In [None]:
# review the generated list
randomized_df[
    [
        "sid",
        "subject_identifier",
        "assignment",
        "last_visit_code",
        "last_appt_datetime",
        "site_id",
        "gender",
        "age_in_years",
        "weight_bin",
        "date_generated",
    ]
]

In [None]:
# review: proportion of the list contributed by each site
randomized_df["site_id"].value_counts(normalize=True)

In [None]:
# review: number of subjects on the list per site
randomized_df["site_id"].value_counts()

In [None]:
# review: number of subjects on the list per assignment
randomized_df["assignment"].value_counts()


In [None]:
# review: share of all registered subjects per site, for comparison with the list
(
    read_frame_edc(
        RegisteredSubject.objects.values("subject_identifier", "site_id").all()
    )["site_id"].value_counts(normalize=True)
)

In [None]:
# export the list to a timestamped CSV in the analysis folder;
# assignment is not among the exported columns
dte_string = timezone.now().strftime('%Y%m%d%H%M%S')
randomized_df[
    [
        "sid",
        "subject_identifier",
        "last_visit_code",
        "last_appt_datetime",
        "site_id",
        "gender",
        "age_in_years",
        "weight_bin",
        "date_generated",
    ]
].to_csv(
    analysis_folder / f"nanda_list_frac_{dte_string}.csv",
    # match the other exports in this notebook: do not write the positional
    # index as a stray unnamed first column
    index=False,
)

In [None]:
# method 2: take exactly one subject from every stratum (fixed seed)
stratified_sample = df.groupby(
    ["site_id", "assignment", "gender", "weight_bin", "age_in_years"],
    group_keys=False,
    observed=True,
).sample(n=1, random_state=22102)

# randomize the stratified sample by site: 20 subjects per site, fresh 0..n-1 index
# NOTE(review): sampling n=20 without replacement needs at least 20 strata
# per site -- confirm every site has enough.
randomized_df = (
    stratified_sample.groupby(["site_id"], group_keys=False, observed=True)
    .sample(n=20, random_state=43221)
    .reset_index()
)

# add sid column: list ids start at 42000 and follow the positional index;
# this overwrites the sid merged in from the randomization list
randomized_df["sid"] = randomized_df.index + 42000
# Timestamp.utcnow() is deprecated since pandas 2.2; now(tz="UTC") yields the
# same timezone-aware UTC timestamp
randomized_df["date_generated"] = pd.Timestamp.now(tz="UTC")
# mark the naive report datetimes as UTC (tz_localize raises if already tz-aware)
randomized_df["report_datetime"] = randomized_df["report_datetime"].dt.tz_localize("UTC")
randomized_df = randomized_df.rename(
    columns={"visit_code": "last_visit_code", "report_datetime": "last_appt_datetime"}
)

In [None]:
# write the method 2 list to a timestamped CSV (no index column);
# assignment is not among the exported columns
dte_string = timezone.now().strftime('%Y%m%d%H%M%S')
randomized_df[
    [
        "sid",
        "subject_identifier",
        "last_visit_code",
        "last_appt_datetime",
        "site_id",
        "gender",
        "age_in_years",
        "weight_bin",
        "date_generated",
    ]
].to_csv(
    analysis_folder / f"nanda_list_n_{dte_string}.csv",
    index=False,
)


In [None]:
# proportion of the method 2 list contributed by each site
randomized_df["site_id"].value_counts(normalize=True)


In [None]:
# number of subjects on the method 2 list per site
randomized_df["site_id"].value_counts()


In [None]:
# number of subjects on the method 2 list per assignment
randomized_df["assignment"].value_counts()


In [None]:
# cross-check the balance of assignment within each site
randomized_df.groupby(
    ["site_id", "assignment"],
).size()


In [None]:
# cross-check the balance of gender within each site
randomized_df.groupby(
    ["site_id", "gender"],
).size()


In [None]:
# number of distinct subjects on the list; compare with the row count
randomized_df["subject_identifier"].nunique()

In [None]:
# total number of rows on the list
randomized_df.shape[0]

In [None]:
# review the method 2 list, including assignment
randomized_df[
    [
        "sid",
        "assignment",
        "subject_identifier",
        "last_visit_code",
        "last_appt_datetime",
        "site_id",
        "gender",
        "age_in_years",
        "weight_bin",
        "date_generated",
    ]
]

In [156]:
# for tests
# method 2 with different seeds and a smaller per-site draw (6 instead of 20)
stratified_sample = df.groupby(
    ["site_id", "assignment", "gender", "weight_bin", "age_in_years"],
    group_keys=False,
    observed=True,
).sample(n=1, random_state=111)

# randomize the stratified sample by site: 6 subjects per site, fresh 0..n-1 index
# NOTE(review): sampling n=6 without replacement needs at least 6 strata
# per site -- confirm every site has enough.
randomized_df = (
    stratified_sample.groupby(["site_id"], group_keys=False, observed=True)
    .sample(n=6, random_state=222)
    .reset_index()
)

# add sid column: list ids start at 42000 and follow the positional index;
# this overwrites the sid merged in from the randomization list
randomized_df["sid"] = randomized_df.index + 42000
# Timestamp.utcnow() is deprecated since pandas 2.2; now(tz="UTC") yields the
# same timezone-aware UTC timestamp
randomized_df["date_generated"] = pd.Timestamp.now(tz="UTC")
# mark the naive report datetimes as UTC (tz_localize raises if already tz-aware)
randomized_df["report_datetime"] = randomized_df["report_datetime"].dt.tz_localize("UTC")
randomized_df = randomized_df.rename(
    columns={"visit_code": "last_visit_code", "report_datetime": "last_appt_datetime"}
)

# write the test list to a timestamped CSV (no index column, no assignment)
dte_string = timezone.now().strftime('%Y%m%d%H%M%S')
randomized_df[
    [
        "sid",
        "subject_identifier",
        "last_visit_code",
        "last_appt_datetime",
        "site_id",
        "gender",
        "age_in_years",
        "weight_bin",
        "date_generated",
    ]
].to_csv(analysis_folder / f"nanda_test_list_{dte_string}.csv", index=False)