In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.stats import norm, normaltest

In [20]:
def normality_test(x, alpha):
    """Run ``scipy.stats.normaltest`` on ``x`` and describe the outcome as text.

    Parameters
    ----------
    x : array-like
        Sample handed unchanged to ``normaltest``.
    alpha : float
        Significance level that the p-value is compared against.

    Returns
    -------
    tuple
        ``(summary, p)``: ``summary`` is a two-line string holding the p-value
        rounded to 6 decimals, how it compares with ``alpha``, and the
        resulting verdict on the null hypothesis; ``p`` is the unrounded
        p-value returned by ``normaltest``.
    """
    _, p = normaltest(x)

    # Null hypothesis: x comes from a normal distribution.
    if p < alpha:
        relation = 'p < '
        verdict = 'The null hypothesis (that data come from normal dist) CAN be rejected.'
    else:
        relation = 'p >= '
        verdict = 'The null hypothesis (that data come from normal dist) CANNOT be rejected.'

    pieces = ['p = ', str(round(p, 6)), ' --> ', relation, str(alpha), '\n', verdict]
    return ''.join(pieces), p

def create_plot(val, group, min_duration, write_fn, data=None):
    """Save a density histogram of one column with a fitted normal curve.

    The histogram of the selected values is overlaid with the pdf of a normal
    distribution using the sample mean and (population, ``np.std``) standard
    deviation. The mean, std and the ``normality_test`` summary at
    ``alpha = 0.05`` are written below the axes, and the p-value is printed.

    Parameters
    ----------
    val : str
        Name of the column to plot.
    group : str
        Which rows to use: ``'combined'`` (all rows), ``'treatment'`` (rows
        where the ``control`` column is False) or ``'control'`` (rows where it
        is True).
    min_duration : int
        Only used to label the title (``days >= ...``); this function does
        not filter rows on it.
    write_fn : str
        Path the figure is saved to.
    data : pandas.DataFrame, optional
        Frame holding ``val`` and a boolean ``control`` column. Defaults to
        the notebook-level ``df`` so existing four-argument calls keep working.

    Raises
    ------
    ValueError
        If ``group`` is not one of the three accepted names, or if the
        selection contains no rows.
    """
    # Fail fast: without a valid group there is nothing to select or plot.
    if group not in ('combined', 'treatment', 'control'):
        raise ValueError(
            'Value group must be one of: combined, treatment, control. Got: ' + repr(group)
        )

    if data is None:
        data = df  # fall back to the frame built earlier in the notebook

    if group == 'combined':
        rows = data
    elif group == 'treatment':
        rows = data.loc[~data['control']]
    else:
        rows = data.loc[data['control']]

    x = list(rows[val])
    if not x:
        raise ValueError('No rows to plot for group ' + repr(group) + ' and column ' + repr(val) + '.')

    # Compute every statistic before opening a figure, so a failure here
    # cannot leave a stray figure behind.
    mean = np.mean(x)
    std = np.std(x)
    x_axis = np.arange(np.min(x), np.max(x), 0.001)
    s_normality, p = normality_test(x, .05)
    s = 'mean: ' + str(round(mean, 3)) + ', std: ' + str(round(std, 3)) + '\n'
    s += s_normality
    print(val + ', ' + group + ', p: ' + str(p))

    fig, ax = plt.subplots()
    try:
        sns.histplot(x, stat='density', ax=ax)
        ax.plot(x_axis, norm.pdf(x_axis, loc=mean, scale=std), color='r')
        ax.set_title(val + ', ' + group + ', days >= ' + str(min_duration) + ', N = ' + str(len(x)))
        # Negative y places the caption below the axes; bbox_inches='tight'
        # keeps it inside the saved image.
        fig.text(.5, -.05, s, ha='center')
        fig.savefig(write_fn, bbox_inches='tight')
    finally:
        # Always release the figure, even if plotting or saving raised.
        plt.close(fig)

In [16]:
# Build the long-format measurement table used by the plots.
# NOTE: pd.read_pickle can execute arbitrary code while loading; only point
# this at files you trust.
read_fn = '../results/2023-05-26/clean_and_split_data/split/train.pkl'
df = (
    pd.read_pickle(read_fn)[['sample', 'drug', 'log(V_V0+1)_obs']]
    # one row per observation instead of one list per (sample, drug) row
    .explode('log(V_V0+1)_obs')
    .rename(columns={'log(V_V0+1)_obs': 'log(V_V0+1)'})
    # attach the per-(sample, drug) mean of the observations to every row
    .pipe(lambda long_df: long_df.merge(
        long_df.groupby(['sample', 'drug'])['log(V_V0+1)'].mean().reset_index(name='log(V_V0+1)_sm'),
        on=['sample', 'drug'],
        validate='many_to_one',
    ))
    .assign(**{
        # mean-centered measurement: observation minus its group mean
        'log(V_V0+1)_cen': lambda d: d['log(V_V0+1)'] - d['log(V_V0+1)_sm'],
        # rows whose drug is 'Vehicle' are flagged as controls
        'control': lambda d: d['drug'] == 'Vehicle',
    })
)

Unnamed: 0,sample,drug,log(V_V0+1),log(V_V0+1)_sm,log(V_V0+1)_cen,control
0,HCI-019,Birinapant,4.249973,3.733228,0.516746,False
1,HCI-019,Birinapant,3.774035,3.733228,0.040807,False
2,HCI-019,Birinapant,4.703832,3.733228,0.970604,False
3,HCI-019,Birinapant,3.214331,3.733228,-0.518896,False
4,HCI-019,Birinapant,2.723967,3.733228,-1.009261,False


In [21]:
# Plot the mean-centered measurements for every group.
val = 'log(V_V0+1)_cen'
min_duration = 21
# `group` and `write_fn` are not referenced by the calls below; each call
# passes its own group name and output path.
group = 'combined'
write_fn = '../test_out.png'
for plot_group, out_path in (
    ('combined', '../tt_combined.png'),
    ('treatment', '../tt_treatment.png'),
    ('control', '../tt_control.png'),
):
    create_plot(val, plot_group, min_duration, out_path)

log(V_V0+1)_cen, combined, p: 0.01881645328921245
log(V_V0+1)_cen, treatment, p: 0.0744831071620139
log(V_V0+1)_cen, control, p: 0.5695087974291817
