In [1]:
import numpy as np
import pandas as pd

In [8]:
df = pd.read_csv('data/welm_pdx_clean_mid.csv')

In [9]:
df['test'] = df.Volume.div(df.Day)

In [10]:
df.head()

Unnamed: 0,MID,Sample,Drug,Day,Volume,test
0,0,HCI-010,Navitoclax,1.0,163.9208,163.9208
1,0,HCI-010,Navitoclax,4.0,158.374381,39.593595
2,0,HCI-010,Navitoclax,8.0,197.154048,24.644256
3,0,HCI-010,Navitoclax,11.0,158.582177,14.416562
4,0,HCI-010,Navitoclax,15.0,176.645,11.776333


In [11]:
def add_duration(df):
    num_mids = df.MID.nunique()
    # create columns with earliest day + size
    start_day = df.loc[df.groupby('MID')['Day'].idxmin()]
    start_day = start_day.rename(columns = {'Day': 'start', 'Volume': 'start_vol'})
    # create columns with latest day + size
    end_day = df.loc[df.groupby('MID')['Day'].idxmax()]
    end_day = end_day.rename(columns = {'Day': 'end', 'Volume': 'end_vol'})
    d = df[['MID', 'Sample', 'Drug']].drop_duplicates()
    # verify that all frames have the same length
    assert len(start_day) == num_mids
    assert len(end_day) == num_mids
    assert len(d) == num_mids
    # merge frames and take difference to find duration
    d = d.merge(start_day[['MID', 'start', 'start_vol']], on='MID', validate='one_to_one')
    d = d.merge(end_day[['MID', 'end', 'end_vol']], on='MID', validate='one_to_one')
    assert len(d) == num_mids
    d['duration'] = d['end'] - d['start']
    return d[['MID', 'Sample', 'Drug', 'start_vol', 'end_vol', 'duration']]

df = add_duration(df)
df

Unnamed: 0,MID,Sample,Drug,start_vol,end_vol,duration
0,0,HCI-010,Navitoclax,163.920800,230.687446,21.0
1,1,HCI-010,Navitoclax,119.794563,111.587111,21.0
2,2,HCI-010,Navitoclax,132.027026,110.372796,21.0
3,3,HCI-024,Navitoclax,285.770000,367.335000,21.0
4,4,HCI-024,Navitoclax,176.157000,213.016400,21.0
...,...,...,...,...,...,...
259,259,HCI-017,Fulvestrant (200 mg/kg),79.768000,1.000000,46.0
260,260,HCI-017,Fulvestrant (200 mg/kg),80.190000,13.500000,46.0
261,261,HCI-017,Vehicle,43.218000,249.444000,46.0
262,262,HCI-017,Vehicle,55.016000,172.826500,46.0


In [12]:
# select for duration above minimum
def threshold_duration(df, min_duration):
    return df.loc[df.duration >= min_duration]

# compute log(V_V0)
def compute_log_volume(df):
    df['log(V_V0)'] = np.log2(df['end_vol'] / df['start_vol'])
    return df

# compute log(V_V0)_cen

In [13]:
df = df.loc[df.duration >= 21]
df['V_V0'] = df['end_vol'].div(df['start_vol'])
df['log(V_V0+1)'] = np.log2(df['V_V0'] + 1)
df = df.merge(df.groupby(['Sample', 'Drug'])['log(V_V0+1)'].mean().reset_index(name='log(V_V0+1)_sm'), 
              on=['Sample', 'Drug'], 
              validate='many_to_one')
df['log(V_V0+1)_cen'] = df['log(V_V0+1)'] - df['log(V_V0+1)_sm']
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['V_V0'] = df['end_vol'].div(df['start_vol'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['log(V_V0+1)'] = np.log2(df['V_V0'] + 1)


Unnamed: 0,MID,Sample,Drug,start_vol,end_vol,duration,V_V0,log(V_V0+1),log(V_V0+1)_sm,log(V_V0+1)_cen
0,0,HCI-010,Navitoclax,163.9208,230.687446,21.0,1.40731,1.267422,1.03123,0.236192
1,1,HCI-010,Navitoclax,119.794563,111.587111,21.0,0.931487,0.949712,1.03123,-0.081518
2,2,HCI-010,Navitoclax,132.027026,110.372796,21.0,0.835986,0.876555,1.03123,-0.154675
3,3,HCI-024,Navitoclax,285.77,367.335,21.0,1.285422,1.19246,1.169276,0.023184
4,4,HCI-024,Navitoclax,176.157,213.0164,21.0,1.209242,1.143551,1.169276,-0.025725


In [16]:
cols = ['MID', 'Sample', 'Drug', 'log(V_V0+1)', 'log(V_V0+1)_cen']
df[cols].to_csv('data/welm_pdx_clean_mid_volume.csv', index=False)