In [130]:
%%capture
import os
from pathlib import Path

from dj_notebook import activate

env_file = os.environ["META_ENV"]
reports_folder = Path(os.environ["META_REPORTS_FOLDER"])
analysis_folder = Path(os.environ["META_ANALYSIS_FOLDER"])
pharmacy_folder = Path(os.environ["META_PHARMACY_FOLDER"])
plus = activate(dotenv_file=env_file)

In [131]:
import numpy as np
import pandas as pd
from django.utils import timezone

from datetime import date
from django_pandas.io import read_frame

from edc_model_to_dataframe.read_frame_edc import read_frame_edc
from edc_registration.models import RegisteredSubject
from meta_rando.models import RandomizationList
from meta_prn.models import EndOfStudy, OffSchedule
from meta_subject.models import FollowupVitals

In [132]:
df_rs = read_frame(RegisteredSubject.objects.values("subject_identifier", "gender", "dob").all(), verbose=False)
df_rs["dob"] = pd.to_datetime(df_rs["dob"])
today = pd.to_datetime(date.today())
age = today.year - df_rs['dob'].dt.year
df_rs['age_in_years'] = age

df_vitals = read_frame_edc(FollowupVitals.objects.all())
df_eos = read_frame_edc(EndOfStudy.objects.all())
df_offschedule = read_frame_edc(OffSchedule.objects.all())
df_rando =  read_frame(RandomizationList.objects.values("subject_identifier", "sid", "assignment", "randomizer_name").all(), verbose=False)

In [133]:
from meta_spfq.constants import GTE_35__LTE_49, GTE_50, LT_35

df_rs = df_rs.merge(df_rando[["subject_identifier", "sid", "assignment"]], on="subject_identifier", how="left")

# merge vitals, registered subject
df = pd.merge(df_rs, df_vitals, on="subject_identifier")

# merge vitals, registered subject
df = pd.merge(df_rs, df_vitals, on="subject_identifier")

# select the last record for each subject
df['report_datetime'] = pd.to_datetime(df['report_datetime'])
df = df.sort_values(by=['subject_identifier', 'report_datetime'])
df = df.drop_duplicates(subset='subject_identifier', keep='last')

# drop any subjects off study
df = df[~df['subject_identifier'].isin(df_eos['subject_identifier'].unique())]

# drop any subjects off-schedule (but still on study)
df = df[~df['subject_identifier'].isin(df_offschedule['subject_identifier'].unique())]

# cut/bin weight
df['weight'] = pd.to_numeric(df['weight'], errors='coerce')
bins = [-np.inf, 35, 50, np.inf]
labels = [LT_35, GTE_35__LTE_49, GTE_50]
df['weight_bin'] = pd.cut(df['weight'], bins=bins, labels=labels, right=False)


In [134]:
df[['subject_identifier', 'assignment', 'visit_code', 'report_datetime', 'site_id', 'gender', 'weight', 'weight_bin']]


Unnamed: 0,subject_identifier,assignment,visit_code,report_datetime,site_id,gender,weight,weight_bin
7589,105-10-0001-4,placebo,1450,2025-08-21 07:48:25,10,F,85.3,gte_50
15731,105-10-0004-8,placebo,1450,2025-09-11 06:21:42,10,F,66.1,gte_50
17144,105-10-0005-5,active,1420,2025-06-02 07:30:00,10,F,99.7,gte_50
17343,105-10-0007-1,placebo,1330,2024-09-03 08:31:30,10,M,87.0,gte_50
8640,105-10-0008-9,active,1420,2025-06-14 06:47:50,10,F,75.0,gte_50
...,...,...,...,...,...,...,...,...
16314,105-60-0226-2,active,1150,2025-02-24 07:22:24,60,F,56.4,gte_50
15518,105-60-0227-0,active,1210,2025-09-01 07:30:00,60,F,65.0,gte_50
13243,105-60-0228-8,placebo,1180,2025-06-23 07:30:00,60,F,65.0,gte_50
5979,105-60-0229-6,active,1180,2025-07-02 07:33:37,60,F,70.5,gte_50


In [135]:
# sample 20% subjects
stratified_sample = df.groupby(
    ['assignment','site_id', 'gender', 'weight_bin', 'age_in_years'],
    group_keys=False,
    observed=True
).sample(
    frac=0.30,
    random_state=981
)

In [136]:
stratified_sample

Unnamed: 0,subject_identifier,gender,dob,age_in_years,sid,assignment,id,site_id,consent_model,consent_version,...,temperature,weight_determination,appointment_datetime,visit_code,visit_code_sequence,visit_datetime,visit_reason,site_name,site,weight_bin
6096,105-10-0177-2,F,1983-10-10,42,12130,active,cd752ced-15e5-4422-8e8f-1a9bcada0abd,10,meta_consent.subjectconsentv1,1,...,36.8,,2025-07-24 07:12:01,1210,0,2025-07-24 07:12:20,scheduled,hindu_mandal,10,gte_50
1091,105-10-0128-5,F,1979-02-13,46,12094,active,a7d30b92-78e3-4777-b5de-ab1db7a06417,10,meta_consent.subjectconsentv1,1,...,36.7,,2025-05-28 07:19:05,1270,0,2025-05-28 07:19:05,scheduled,hindu_mandal,10,gte_50
7965,105-10-0113-7,F,1978-09-23,47,12083,active,653ef26d-f519-436b-a4c7-4622aa151056,10,meta_consent.subjectconsentv1,1,...,36.3,,2025-08-06 06:13:35,1300,0,2025-08-06 06:13:35,scheduled,hindu_mandal,10,gte_50
3570,105-10-0121-0,F,1977-02-10,48,12089,active,747dece4-7bc3-4ec8-8c72-ff1dfe35556b,10,meta_consent.subjectconsentv1,1,...,36.4,,2025-09-02 07:22:13,1300,0,2025-09-02 07:22:13,scheduled,hindu_mandal,10,gte_50
11112,105-10-0028-7,F,1975-06-26,50,12022,active,ec3a8203-5815-455d-8042-811bdd4f5aa8,10,meta_consent.subjectconsentv1,1.1,...,36.7,,2025-06-04 07:00:46,1390,0,2025-06-04 07:00:46,scheduled,hindu_mandal,10,gte_50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4155,105-60-0086-0,M,1977-05-28,48,61020,placebo,fc760783-a8de-420a-9a07-b81f3aa0d7c5,60,meta_consent.subjectconsentv1,1,...,37.0,,2025-08-25 05:02:14,1330,0,2025-08-25 05:02:14,scheduled,mnazi_moja,60,gte_50
16462,105-60-0067-0,M,1971-12-31,54,61018,placebo,73e914f1-e3f0-46c2-ba54-8f9e01a55195,60,meta_consent.subjectconsentv1,1,...,37.0,,2025-07-28 06:52:06,1330,0,2025-07-28 06:52:06,scheduled,mnazi_moja,60,gte_50
13342,105-60-0095-1,M,1968-03-02,57,61021,placebo,89f29d9f-1a07-4450-8b9b-a7b153141a4d,60,meta_consent.subjectconsentv1,1,...,36.5,,2025-03-07 05:36:37,1270,0,2025-03-07 05:36:37,scheduled,mnazi_moja,60,gte_50
8345,105-60-0159-5,M,1966-07-15,59,61036,placebo,bc5078a0-919c-4298-8f9a-64fb25adae98,60,meta_consent.subjectconsentv1,1,...,36.5,,2025-04-14 06:50:44,1240,0,2025-04-14 06:50:44,scheduled,mnazi_moja,60,gte_50


In [137]:
# randomize the stratified sample by site
randomized_df = stratified_sample.groupby(
    'site_id',
    group_keys=False,
    observed=True
).sample(n=20, random_state=876).reset_index()

In [138]:
# add sid column, ...
randomized_df["sid"] = randomized_df.index+42000
randomized_df["date_generated"] = pd.Timestamp.now()
randomized_df = randomized_df.rename(columns={"visit_code": "last_visit_code", "report_datetime": "last_appt_datetime"})


In [139]:
# review
randomized_df[['sid', 'subject_identifier', 'assignment', 'last_visit_code', 'last_appt_datetime', 'site_id', 'gender', 'age_in_years', 'weight_bin', 'date_generated']]

Unnamed: 0,sid,subject_identifier,assignment,last_visit_code,last_appt_datetime,site_id,gender,age_in_years,weight_bin,date_generated
0,42000,105-10-0110-3,placebo,1300,2025-07-23 07:30:00,10,F,51,gte_50,2025-10-02 00:04:27.166172
1,42001,105-10-0052-7,active,1360,2025-07-14 05:54:08,10,F,54,gte_50,2025-10-02 00:04:27.166172
2,42002,105-10-0041-0,placebo,1360,2025-06-13 06:30:56,10,F,53,gte_50,2025-10-02 00:04:27.166172
3,42003,105-10-0113-7,active,1300,2025-08-06 06:13:35,10,F,47,gte_50,2025-10-02 00:04:27.166172
4,42004,105-10-0135-0,placebo,1300,2025-09-08 04:45:09,10,M,57,gte_50,2025-10-02 00:04:27.166172
...,...,...,...,...,...,...,...,...,...,...
95,42095,105-60-0095-1,placebo,1270,2025-03-07 05:36:37,60,M,57,gte_50,2025-10-02 00:04:27.166172
96,42096,105-60-0158-7,active,1240,2025-05-12 09:57:17,60,F,56,gte_50,2025-10-02 00:04:27.166172
97,42097,105-60-0137-1,active,1300,2025-09-03 08:37:35,60,F,44,gte_50,2025-10-02 00:04:27.166172
98,42098,105-60-0189-2,placebo,1210,2025-03-25 06:47:41,60,F,37,gte_50,2025-10-02 00:04:27.166172


In [140]:
# review
randomized_df.site_id.value_counts(normalize=True)

site_id
10    0.2
20    0.2
30    0.2
40    0.2
60    0.2
Name: proportion, dtype: float64

In [141]:
# review
randomized_df.site_id.value_counts()

site_id
10    20
20    20
30    20
40    20
60    20
Name: count, dtype: int64

In [142]:
randomized_df.assignment.value_counts()


assignment
placebo    51
active     49
Name: count, dtype: int64

In [143]:
# review
read_frame_edc(RegisteredSubject.objects.values("subject_identifier", "site_id").all()).site_id.value_counts(normalize=True)

site_id
40    0.322295
20    0.231224
30    0.201064
60    0.136014
10    0.109403
Name: proportion, dtype: float64

In [144]:
# export
dte_string = timezone.now().strftime('%Y%m%d%H%M%S')
randomized_df[['sid', 'subject_identifier','last_visit_code', 'last_appt_datetime', 'site_id', 'gender', 'age_in_years', 'weight_bin', 'date_generated']].to_csv(analysis_folder / f"nanda_list_frac_{dte_string}.csv")

In [145]:
# method 2
stratified_sample = df.groupby(
    ['site_id', 'assignment', 'gender', 'weight_bin', 'age_in_years'],
    group_keys=False,
    observed=True
).sample(
    n=1,
    random_state=22102
)
# randomize the stratified sample by site
randomized_df = stratified_sample.groupby(
    ['site_id'],
    group_keys=False,
    observed=True
).sample(n=20, random_state=43221).reset_index()

# add sid column, ...
randomized_df["sid"] = randomized_df.index + 42000
randomized_df["date_generated"] = pd.Timestamp.utcnow()
randomized_df['report_datetime'] = randomized_df['report_datetime'].dt.tz_localize('UTC')
randomized_df = randomized_df.rename(columns={"visit_code": "last_visit_code",
                                              "report_datetime": "last_appt_datetime"})

In [146]:
dte_string = timezone.now().strftime('%Y%m%d%H%M%S')
randomized_df[['sid', 'subject_identifier', 'last_visit_code', 'last_appt_datetime', 'site_id', 'gender', 'age_in_years', 'weight_bin', 'date_generated']].to_csv(analysis_folder / f"nanda_list_n_{dte_string}.csv", index=False)


In [147]:
randomized_df.site_id.value_counts(normalize=True)


site_id
10    0.2
20    0.2
30    0.2
40    0.2
60    0.2
Name: proportion, dtype: float64

In [148]:
randomized_df.site_id.value_counts()


site_id
10    20
20    20
30    20
40    20
60    20
Name: count, dtype: int64

In [149]:
randomized_df.assignment.value_counts()


assignment
active     52
placebo    48
Name: count, dtype: int64

In [150]:
randomized_df.groupby(["site_id", "assignment"]).size()


site_id  assignment
10       active         8
         placebo       12
20       active         7
         placebo       13
30       active         9
         placebo       11
40       active        15
         placebo        5
60       active        13
         placebo        7
dtype: int64

In [151]:
randomized_df.groupby(["site_id", "gender"]).size()


site_id  gender
10       F          8
         M         12
20       F         14
         M          6
30       F         15
         M          5
40       F         10
         M         10
60       F         12
         M          8
dtype: int64

In [152]:
randomized_df.subject_identifier.nunique()

100

In [153]:
len(randomized_df)

100

In [154]:
randomized_df[['sid', 'assignment', 'subject_identifier', 'last_visit_code', 'last_appt_datetime', 'site_id', 'gender', 'age_in_years', 'weight_bin', 'date_generated']]

Unnamed: 0,sid,assignment,subject_identifier,last_visit_code,last_appt_datetime,site_id,gender,age_in_years,weight_bin,date_generated
0,42000,placebo,105-10-0030-3,1390,2025-06-19 08:13:32+00:00,10,F,65,gte_50,2025-10-01 21:04:28.094512+00:00
1,42001,active,105-10-0140-0,1270,2025-06-23 08:37:50+00:00,10,M,56,gte_50,2025-10-01 21:04:28.094512+00:00
2,42002,placebo,105-10-0123-6,1300,2025-08-14 06:14:02+00:00,10,F,52,gte_50,2025-10-01 21:04:28.094512+00:00
3,42003,placebo,105-10-0145-9,1270,2025-08-04 05:56:59+00:00,10,F,46,gte_50,2025-10-01 21:04:28.094512+00:00
4,42004,active,105-10-0079-0,1330,2025-07-14 06:15:16+00:00,10,M,66,gte_50,2025-10-01 21:04:28.094512+00:00
...,...,...,...,...,...,...,...,...,...,...
95,42095,active,105-60-0096-9,1330,2025-09-08 07:46:33+00:00,60,M,44,gte_50,2025-10-01 21:04:28.094512+00:00
96,42096,active,105-60-0084-5,1330,2025-08-11 09:16:39+00:00,60,F,35,gte_50,2025-10-01 21:04:28.094512+00:00
97,42097,active,105-60-0043-1,1330,2025-07-01 08:15:12+00:00,60,M,58,gte_50,2025-10-01 21:04:28.094512+00:00
98,42098,active,105-60-0065-4,1330,2025-07-24 06:57:57+00:00,60,F,54,gte_50,2025-10-01 21:04:28.094512+00:00
