# Summary

Create an all-by-all matrix.


| | **drug1 / drug2 / cell line** | **drug1 / drug2 / cell line** | **drug1 / drug2 / cell line** | ... |
|-|-|-|-|-|
| **drug1 / drug2 / cell line** | *synergy change* | *synergy change* | *synergy change* | ... |
| **drug1 / drug2 / cell line** | *synergy change* | *synergy change* | *synergy change* | ... |
| **drug1 / drug2 / cell line** | *synergy change* | *synergy change* | *synergy change* | ... |

# Notes

#  ToDo

- Excluding everything with `QA != 1`. Maybe there is a smart way to include these values?



# Imports

In [1]:
from biodata import *



In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
import functions as fn

# Functions

In [4]:
def convert_features_to_differences(df):
    """Replace paired `_a` / `_b` feature columns with `_mean` / `_diff` columns.

    Column names are lower-cased first. For every column whose name starts with
    one of the feature prefixes (`max_conc`, `ic50`, `h`, `einf`) and contains an
    `a` token (e.g. `ic50_a`, `ic50_a_gbdd_min`), the matching `b` column is
    looked up. If it exists, the pair is replaced by:

    * `<base>_mean` -- row-wise mean of the two columns
    * `<base>_diff` -- absolute row-wise difference of the two columns

    where `<base>` is the column name with the `a` token removed. Finally
    `compound_a`, `compound_b` and `cell_line` are renamed to `d_1`, `d_2`
    and `c`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. NOTE: it is modified in place (columns renamed,
        added and dropped).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after modification.
    """
    df.columns = [c.lower() for c in df.columns]

    def _split_pair(column):
        """Return (`b` column name, base name) for an `a` column, else None."""
        tokens = column.split('_')
        # Only a whole `a` token after the first token marks a drug-A column.
        # A plain substring replace of '_a' would also rewrite names such as
        # 'h_a_abs' -> 'h_b_bbs' and silently miss the real partner column.
        if 'a' not in tokens[1:]:
            return None
        position = tokens.index('a', 1)
        head, tail = tokens[:position], tokens[position + 1:]
        return '_'.join(head + ['b'] + tail), '_'.join(head + tail)

    prefixes = ('max_conc', 'ic50', 'h', 'einf')
    columns_to_drop = []
    # Iterate over a snapshot: new columns are added to `df` inside the loop.
    for column in list(df.columns):
        if not column.startswith(prefixes):
            continue
        pair = _split_pair(column)
        if pair is None:
            continue
        column_pair, base_name = pair
        if column_pair not in df.columns:
            continue
        df[base_name + '_mean'] = df[[column, column_pair]].mean(axis=1)
        df[base_name + '_diff'] = (df[column_pair] - df[column]).abs()
        columns_to_drop.extend([column, column_pair])

    print('dropping the following columns: {}'.format(columns_to_drop))
    df.drop(pd.Index(columns_to_drop), axis=1, inplace=True)
    assert not any(c in df for c in columns_to_drop)

    df.rename(columns={
        'compound_a': 'd_1',
        'compound_b': 'd_2',
        'cell_line': 'c',
    }, inplace=True)

    return df

# Load data

## Training data

In [5]:
# Challenge 1 training set (combination + monotherapy), tagged with its origin.
ch1_train_combination_and_monotherapy = (
    pd.read_csv(
        '../downloads/challenge_data/drug_synergy_data/ch1_train_combination_and_monotherapy.csv/'
        'ch1_train_combination_and_monoTherapy.csv',
        sep=','
    )
    .assign(source='train')
)

In [6]:
# Challenge 1 leaderboard monotherapy data, tagged with its origin.
ch1_leaderboard_monotherapy = (
    pd.read_csv(
        '../downloads/challenge_data/drug_synergy_data/ch1_leaderboard_monotherapy.csv/'
        'ch1_leaderBoard_monoTherapy.csv',
        sep=','
    )
    .assign(source='ch1_validate')
)

In [7]:
# Challenge 1 test monotherapy data, tagged with its origin.
ch1_test_monotherapy = (
    pd.read_csv(
        '../downloads/challenge_data/drug_synergy_data/ch1_test_monotherapy.csv/'
        'ch1_test_monoTherapy.csv',
        sep=','
    )
    .assign(source='ch1_test')
)

In [8]:
# Challenge 2 leaderboard monotherapy data, tagged with its origin.
ch2_leaderboard_monotherapy = (
    pd.read_csv(
        '../downloads/challenge_data/drug_synergy_data/ch2_leaderboard_monotherapy.csv/'
        'ch2_leaderBoard_monoTherapy.csv',
        sep=','
    )
    .assign(source='ch2_validate')
)

In [9]:
# Challenge 2 test monotherapy data, tagged with its origin.
ch2_test_monotherapy = (
    pd.read_csv(
        '../downloads/challenge_data/drug_synergy_data/ch2_test_monotherapy.csv/'
        'ch2_test_monoTherapy.csv',
        sep=','
    )
    .assign(source='ch2_test')
)

#### New data

In [10]:
# Challenge 1 leaderboard set that ships with synergy scores.
ch1_leaderboard_monotherapy_wsyn = (
    pd.read_csv(
        '../downloads/challenge_data/drug_synergy_data/leaderboard_set/'
        'ch1_LB.csv',
        sep=','
    )
    .assign(source='ch1_validate')
)

# Sanity check: merging the older leaderboard frame (minus its SYNERGY_SCORE
# column) with the new one on all shared columns must give a frame of exactly
# the new frame's shape.
assert (
    ch1_leaderboard_monotherapy
    .drop('SYNERGY_SCORE', axis=1)
    .merge(ch1_leaderboard_monotherapy_wsyn)
    .shape
) == ch1_leaderboard_monotherapy_wsyn.shape

In [11]:
# Challenge 2 leaderboard set that ships with synergy scores.
ch2_leaderboard_monotherapy_wsyn = (
    pd.read_csv(
        '../downloads/challenge_data/drug_synergy_data/leaderboard_set/'
        'ch2_LB.csv',
        sep=','
    )
    .assign(source='ch2_validate')
)

# Sanity check: merging the older leaderboard frame (minus its SYNERGY_SCORE
# column) with the new one on all shared columns must give a frame of exactly
# the new frame's shape.
assert (
    ch2_leaderboard_monotherapy
    .drop('SYNERGY_SCORE', axis=1)
    .merge(ch2_leaderboard_monotherapy_wsyn)
    .shape
) == ch2_leaderboard_monotherapy_wsyn.shape

#### Combine

In [12]:
# Stack the training, leaderboard and test frames into one table with a fresh
# 0..n-1 index. The leaderboard rows come from the `*_wsyn` frames (the ones
# loaded from `leaderboard_set/`), not from the older leaderboard monotherapy
# frames loaded above.
ALL_TRAINING_DATA = pd.concat([
        ch1_train_combination_and_monotherapy,
        ch1_leaderboard_monotherapy_wsyn,
        ch1_test_monotherapy,
        ch2_leaderboard_monotherapy_wsyn,
        ch2_test_monotherapy
    ], ignore_index=True)

In [13]:
# COMBINATION_ID is not carried forward; the guard makes the cell safe to re-run.
if 'COMBINATION_ID' in ALL_TRAINING_DATA.columns:
    ALL_TRAINING_DATA = ALL_TRAINING_DATA.drop('COMBINATION_ID', axis=1)

In [14]:
ALL_TRAINING_DATA[ALL_TRAINING_DATA['SYNERGY_SCORE'].isnull()].head()

Unnamed: 0,CELL_LINE,COMPOUND_A,COMPOUND_B,MAX_CONC_A,MAX_CONC_B,IC50_A,H_A,Einf_A,IC50_B,H_B,Einf_B,SYNERGY_SCORE,QA,source
2790,CAMA-1,ADAM17,AKT,1,75,0.126186,0.993706,3.688533,1.0,0.0,100.0,,1,ch1_test
2791,HCC1395,ADAM17,AKT,1,75,0.910569,0.552451,0.0,10.90541,0.239641,82.740876,,1,ch1_test
2792,Hs-578-T,ADAM17,AKT,1,75,1.0,0.258989,17.701706,0.54088,4.851464,73.495692,,1,ch1_test
2793,MDA-MB-157,ADAM17,AKT,1,75,1.0,0.215696,5.095356,0.010219,0.227937,52.12146,,1,ch1_test
2794,MDA-MB-468,ADAM17,AKT,1,75,0.50942,0.941494,26.827759,7.328496,10.0,94.799897,,1,ch1_test


### Copy synergy data

...and do everything else on the data without synergy scores.

In [15]:
# Keep a full copy (with SYNERGY_SCORE) and strip the score from the working
# frame. The guard means a re-run leaves both frames as they are.
if 'SYNERGY_SCORE' in ALL_TRAINING_DATA.columns:
    ALL_TRAINING_DATA_WSYNERGY = ALL_TRAINING_DATA.copy()
    ALL_TRAINING_DATA = ALL_TRAINING_DATA.drop('SYNERGY_SCORE', axis=1)

# With synergy scores.
display(ALL_TRAINING_DATA_WSYNERGY.head(2))
print(ALL_TRAINING_DATA_WSYNERGY.shape)

# Without synergy scores.
display(ALL_TRAINING_DATA.head(2))
print(ALL_TRAINING_DATA.shape)

Unnamed: 0,CELL_LINE,COMPOUND_A,COMPOUND_B,MAX_CONC_A,MAX_CONC_B,IC50_A,H_A,Einf_A,IC50_B,H_B,Einf_B,SYNERGY_SCORE,QA,source
0,BT-20,ADAM17,AKT,1,75,1.0,0.809002,59.122436,9.639714,0.757977,91.593425,29.54039,1,train
1,CAL-120,ADAM17,AKT,1,75,0.183214,2.503678,60.411999,1.0,0.0,100.0,4.40141,-1,train


(11575, 14)


Unnamed: 0,CELL_LINE,COMPOUND_A,COMPOUND_B,MAX_CONC_A,MAX_CONC_B,IC50_A,H_A,Einf_A,IC50_B,H_B,Einf_B,QA,source
0,BT-20,ADAM17,AKT,1,75,1.0,0.809002,59.122436,9.639714,0.757977,91.593425,1,train
1,CAL-120,ADAM17,AKT,1,75,0.183214,2.503678,60.411999,1.0,0.0,100.0,-1,train


(11575, 13)


### Keep only QA == 1

In [16]:
# Keep only rows with QA == 1, then drop the now-constant QA column.
# Guarded in the same way as the COMBINATION_ID / SYNERGY_SCORE cells, so that
# re-running this cell is a no-op instead of a KeyError on the missing column.
# The non-inplace drop also avoids mutating a filtered slice in place (which
# triggers pandas' SettingWithCopyWarning).
if 'QA' in ALL_TRAINING_DATA:
    ALL_TRAINING_DATA = (
        ALL_TRAINING_DATA[ALL_TRAINING_DATA['QA'] == 1]
        .drop('QA', axis=1)
    )
print(ALL_TRAINING_DATA.shape)

(11171, 12)


### D

Per-drug (D) summary statistics. These are not really needed or used, but are kept for legacy reasons.

In [17]:
# Long format: one row per (row, drug side). The `_A` columns and the `_B`
# columns are each selected, stripped of their two-character suffix, and the
# two halves stacked under a fresh index.
compound_data = pd.concat(
    [
        ALL_TRAINING_DATA[
            [col for col in ALL_TRAINING_DATA.columns if col.endswith(side)]
        ].rename(columns=lambda col: col[:-2])
        for side in ('_A', '_B')
    ],
    ignore_index=True
)

In [18]:
compound_data.head()

Unnamed: 0,COMPOUND,MAX_CONC,IC50,H,Einf
0,ADAM17,1,1.0,0.809002,59.122436
1,ADAM17,1,1.0,0.726984,11.150843
2,ADAM17,1,0.410029,1.496901,51.921265
3,ADAM17,1,0.313992,10.0,63.399635
4,ADAM17,1,1.0,0.77183,41.529554


In [19]:
# Per-drug summary of the long-format compound data.
# NOTE(review): `fn.my_groupby` lives in the project-local `functions` module.
# Judging by the displayed result it yields `<col>_gbd_{min,max,mean,std}` for
# each value column plus `count_gbd` -- confirm against its source.
ALL_TRAINING_DATA_GBD = fn.my_groupby(
    compound_data.rename(columns={'COMPOUND': 'd'}), by='d', name='gbd')

In [20]:
ALL_TRAINING_DATA_GBD.head()

Unnamed: 0,d,max_conc_gbd_min,max_conc_gbd_max,max_conc_gbd_mean,max_conc_gbd_std,ic50_gbd_min,ic50_gbd_max,ic50_gbd_mean,ic50_gbd_std,h_gbd_min,h_gbd_max,h_gbd_mean,h_gbd_std,einf_gbd_min,einf_gbd_max,einf_gbd_mean,einf_gbd_std,count_gbd
0,ADAM17,1.0,3,1.333333,0.746996,0.0001,3,0.678982,0.670769,0,10,2.359189,3.263725,0,100,49.203516,33.978373,228
1,AKT,0.003,75,2.977014,9.948914,1e-06,75,0.878123,3.40208,0,10,3.124647,3.847184,0,100,53.261517,35.90222,1041
2,AKT_1,0.1,75,4.175816,13.133638,1e-05,75,1.203137,5.241192,0,10,3.104658,3.896415,0,100,56.36519,35.167022,674
3,AKT_PIK3C,1.0,10,1.885246,2.702458,0.0001,10,0.624808,1.284514,0,10,2.808945,3.615563,0,100,59.542931,35.384669,61
4,AKT_SGK,1.0,1,1.0,0.0,0.0001,1,0.490824,0.371681,0,10,3.062068,3.647373,0,100,51.176495,31.941165,180


### DC

In [21]:
# Long format keyed by cell line: CELL_LINE plus the `_A` columns, and CELL_LINE
# plus the `_B` columns, with the two-character side suffix stripped. The
# original row index is kept, so each index label appears once per side.
compound_cell_line_data = pd.concat(
    [
        ALL_TRAINING_DATA[
            ['CELL_LINE'] + [col for col in ALL_TRAINING_DATA.columns if col.endswith(side)]
        ].rename(columns=lambda col: col[:-2] if col.endswith(side) else col)
        for side in ('_A', '_B')
    ]
)

In [22]:
compound_cell_line_data.head()

Unnamed: 0,CELL_LINE,COMPOUND,MAX_CONC,IC50,H,Einf
0,BT-20,ADAM17,1,1.0,0.809002,59.122436
2,CAL-51,ADAM17,1,1.0,0.726984,11.150843
5,HCC1187,ADAM17,1,0.410029,1.496901,51.921265
7,HCC1806,ADAM17,1,0.313992,10.0,63.399635
8,HCC1937,ADAM17,1,1.0,0.77183,41.529554


In [23]:
# Summary per (drug, cell line) pair.
# NOTE(review): `fn.my_groupby` is project-local. The displayed result shows
# `<col>_gbdc_{min,max,mean,std}` columns plus `count_gbdc` -- confirm against
# its source.
ALL_TRAINING_DATA_GBDCL = fn.my_groupby(
    compound_cell_line_data.rename(columns={'COMPOUND': 'd', 'CELL_LINE': 'c'}), 
    by=['d', 'c'], 
    name='gbdc'
)
ALL_TRAINING_DATA_GBDCL.head()

Unnamed: 0,d,c,max_conc_gbdc_min,max_conc_gbdc_max,max_conc_gbdc_mean,max_conc_gbdc_std,ic50_gbdc_min,ic50_gbdc_max,ic50_gbdc_mean,ic50_gbdc_std,h_gbdc_min,h_gbdc_max,h_gbdc_mean,h_gbdc_std,einf_gbdc_min,einf_gbdc_max,einf_gbdc_mean,einf_gbdc_std,count_gbdc
0,ADAM17,A549,1,3,2.333333,1.154701,1.0,3.0,1.666667,1.154701,0.0,1.140817,0.380272,0.658651,10.136795,100.0,70.045598,51.882545,3
1,ADAM17,BT-20,1,1,1.0,0.0,0.10344,1.0,0.764106,0.30637,0.383972,1.490744,0.973217,0.403961,0.0,82.067345,39.904304,29.518787,8
2,ADAM17,BT-549,1,1,1.0,0.0,0.0001,1.0,0.321746,0.324553,0.1,4.159118,1.323676,1.281327,0.0,81.024292,49.267057,26.993345,8
3,ADAM17,CAL-120,1,1,1.0,0.0,0.09451,1.0,0.415736,0.506823,0.0,3.681096,1.737925,1.849111,44.192363,100.0,70.709807,28.006949,3
4,ADAM17,CAL-148,1,1,1.0,0.0,0.011726,0.331856,0.204963,0.131471,1.500265,10.0,3.61638,3.596905,1.100779,92.371276,40.196914,40.613068,5


### C

In [24]:
# Summary per cell line only: the drug column is removed before grouping so it
# is not treated as a value column.
ALL_TRAINING_DATA_GBCL = fn.my_groupby(
    compound_cell_line_data
    .drop('COMPOUND', axis=1)
    .rename(columns={'CELL_LINE': 'c'}),
    by='c',
    name='gbc'
)
ALL_TRAINING_DATA_GBCL.head()

Unnamed: 0,c,max_conc_gbc_min,max_conc_gbc_max,max_conc_gbc_mean,max_conc_gbc_std,ic50_gbc_min,ic50_gbc_max,ic50_gbc_mean,ic50_gbc_std,h_gbc_min,h_gbc_max,h_gbc_mean,h_gbc_std,einf_gbc_min,einf_gbc_max,einf_gbc_mean,einf_gbc_std,count_gbc
0,22RV1,1.0,10,4.5,4.229526,0.0003,10,1.659355,2.561437,0,10,4.110479,4.284195,23.235716,100,64.816692,22.818861,28
1,647-V,0.003,3,2.064613,1.269595,1e-05,3,1.028795,0.989565,0,10,3.777782,3.694869,0.0,100,43.452267,37.343984,382
2,A549,0.003,10,2.181102,1.673714,0.0003,3,1.139836,1.127212,0,10,3.039925,3.570279,0.0,100,44.840635,38.620013,166
3,BFTC-905,0.003,3,2.493151,1.039579,0.0003,3,0.873172,1.003254,0,10,2.850973,3.232453,0.0,100,28.370663,28.84384,372
4,BT-20,0.01,75,2.950508,10.701853,8.8e-05,75,1.800594,8.697959,0,10,2.381551,3.097089,0.0,100,47.513079,30.121828,374


### DD

In [25]:
# Summary per drug pair (d_1, d_2), computed on the wide frame, so the `_a` and
# `_b` features are summarised separately (see the `*_a_gbdd_*` / `*_b_gbdd_*`
# columns in the displayed result).
# NOTE(review): `fn.my_groupby` is project-local -- confirm its aggregation set.
ALL_TRAINING_DATA_GBDP = fn.my_groupby(
    ALL_TRAINING_DATA.rename(columns={'COMPOUND_A': 'd_1', 'COMPOUND_B': 'd_2'}), 
    by=['d_1', 'd_2'], 
    name='gbdd'
)
ALL_TRAINING_DATA_GBDP.head()

Unnamed: 0,d_1,d_2,max_conc_a_gbdd_min,max_conc_a_gbdd_max,max_conc_a_gbdd_mean,max_conc_a_gbdd_std,max_conc_b_gbdd_min,max_conc_b_gbdd_max,max_conc_b_gbdd_mean,max_conc_b_gbdd_std,ic50_a_gbdd_min,ic50_a_gbdd_max,ic50_a_gbdd_mean,ic50_a_gbdd_std,h_a_gbdd_min,h_a_gbdd_max,h_a_gbdd_mean,h_a_gbdd_std,einf_a_gbdd_min,einf_a_gbdd_max,einf_a_gbdd_mean,einf_a_gbdd_std,ic50_b_gbdd_min,ic50_b_gbdd_max,ic50_b_gbdd_mean,ic50_b_gbdd_std,h_b_gbdd_min,h_b_gbdd_max,h_b_gbdd_mean,h_b_gbdd_std,einf_b_gbdd_min,einf_b_gbdd_max,einf_b_gbdd_mean,einf_b_gbdd_std,count_gbdd
0,ADAM17,AKT,1,1,1,0,10,75,71.75,14.534442,0.126186,1,0.575453,0.367001,0.215696,10,2.839292,3.280994,0.0,75.76808,32.685664,25.946161,0.0075,75,9.661006,22.561773,0,10,0.963896,2.391543,48.650054,100,88.44709,16.163458,20
1,ADAM17,AKT_1,1,1,1,0,75,75,75.0,0.0,0.033958,1,0.522182,0.384989,0.0,10,1.849612,2.718638,0.0,100.0,45.95228,32.631481,0.0075,75,14.136863,25.992584,0,10,1.116272,2.919348,35.104195,100,86.956282,20.800173,22
2,ADAM17,BCL2_BCL2L1,1,1,1,0,75,75,75.0,0.0,0.0001,1,0.56254,0.433605,0.0,10,2.474769,3.688161,0.074954,100.0,55.634417,31.277824,0.0075,75,19.323405,31.866789,0,10,1.395987,2.95993,7.400386,100,85.174268,24.30123,22
3,ADAM17,EGFR_2,1,1,1,0,10,10,10.0,0.0,0.082335,1,0.690256,0.380578,0.0,10,3.474409,4.153039,7.122209,100.0,79.522193,22.151552,0.001,10,5.794356,4.573942,0,10,2.133355,3.663599,0.0,100,73.957455,27.071963,19
4,ADAM17,ERBB,3,3,3,0,10,10,10.0,0.0,0.0003,3,1.443599,1.198901,0.0,10,1.188095,2.145512,0.0,100.0,55.431472,31.679955,0.064802,10,5.869894,4.50135,0,10,2.4887,3.88479,0.0,100,69.868643,26.82428,20


In [26]:
ALL_TRAINING_DATA_GBDP = convert_features_to_differences(ALL_TRAINING_DATA_GBDP)
ALL_TRAINING_DATA_GBDP.head()

dropping the following columns: ['max_conc_a_gbdd_min', 'max_conc_b_gbdd_min', 'max_conc_a_gbdd_max', 'max_conc_b_gbdd_max', 'max_conc_a_gbdd_mean', 'max_conc_b_gbdd_mean', 'max_conc_a_gbdd_std', 'max_conc_b_gbdd_std', 'ic50_a_gbdd_min', 'ic50_b_gbdd_min', 'ic50_a_gbdd_max', 'ic50_b_gbdd_max', 'ic50_a_gbdd_mean', 'ic50_b_gbdd_mean', 'ic50_a_gbdd_std', 'ic50_b_gbdd_std', 'h_a_gbdd_min', 'h_b_gbdd_min', 'h_a_gbdd_max', 'h_b_gbdd_max', 'h_a_gbdd_mean', 'h_b_gbdd_mean', 'h_a_gbdd_std', 'h_b_gbdd_std', 'einf_a_gbdd_min', 'einf_b_gbdd_min', 'einf_a_gbdd_max', 'einf_b_gbdd_max', 'einf_a_gbdd_mean', 'einf_b_gbdd_mean', 'einf_a_gbdd_std', 'einf_b_gbdd_std']


Unnamed: 0,d_1,d_2,count_gbdd,max_conc_gbdd_min_mean,max_conc_gbdd_min_diff,max_conc_gbdd_max_mean,max_conc_gbdd_max_diff,max_conc_gbdd_mean_mean,max_conc_gbdd_mean_diff,max_conc_gbdd_std_mean,max_conc_gbdd_std_diff,ic50_gbdd_min_mean,ic50_gbdd_min_diff,ic50_gbdd_max_mean,ic50_gbdd_max_diff,ic50_gbdd_mean_mean,ic50_gbdd_mean_diff,ic50_gbdd_std_mean,ic50_gbdd_std_diff,h_gbdd_min_mean,h_gbdd_min_diff,h_gbdd_max_mean,h_gbdd_max_diff,h_gbdd_mean_mean,h_gbdd_mean_diff,h_gbdd_std_mean,h_gbdd_std_diff,einf_gbdd_min_mean,einf_gbdd_min_diff,einf_gbdd_max_mean,einf_gbdd_max_diff,einf_gbdd_mean_mean,einf_gbdd_mean_diff,einf_gbdd_std_mean,einf_gbdd_std_diff
0,ADAM17,AKT,20,5.5,9,38.0,74,36.375,70.75,7.267221,14.534442,0.066843,0.118686,38.0,74,5.11823,9.085554,11.464387,22.194771,0.107848,0.215696,10,0,1.901594,1.875396,2.836268,0.889451,24.325027,48.650054,87.88404,24.23192,60.566377,55.761426,21.05481,9.782703
1,ADAM17,AKT_1,22,38.0,74,38.0,74,38.0,74.0,0.0,0.0,0.020729,0.026458,38.0,74,7.329522,13.614682,13.188787,25.607595,0.0,0.0,10,0,1.482942,0.733341,2.818993,0.20071,17.552098,35.104195,100.0,0.0,66.454281,41.004002,26.715827,11.831308
2,ADAM17,BCL2_BCL2L1,22,38.0,74,38.0,74,38.0,74.0,0.0,0.0,0.0038,0.0074,38.0,74,9.942972,18.760865,16.150197,31.433183,0.0,0.0,10,0,1.935378,1.078782,3.324046,0.728231,3.73767,7.325433,100.0,0.0,70.404342,29.539851,27.789527,6.976594
3,ADAM17,EGFR_2,19,5.5,9,5.5,9,5.5,9.0,0.0,0.0,0.041668,0.081335,5.5,9,3.242306,5.104101,2.47726,4.193364,0.0,0.0,10,0,2.803882,1.341054,3.908319,0.48944,3.561105,7.122209,100.0,0.0,76.739824,5.564739,24.611758,4.920411
4,ADAM17,ERBB,20,6.5,7,6.5,7,6.5,7.0,0.0,0.0,0.032551,0.064502,6.5,7,3.656746,4.426296,2.850126,3.302449,0.0,0.0,10,0,1.838397,1.300605,3.015151,1.739278,0.0,0.0,100.0,0.0,62.650057,14.437171,29.252118,4.855676


### To differences 

#### ALL_TRAINING_DATA

In [27]:
# Collapse the paired `_a` / `_b` features into `_mean` / `_diff` columns.
# `convert_features_to_differences` modifies its argument in place and returns
# it, so this cell is not idempotent: a second run finds no `_a` / `_b` pairs
# left and only re-applies the lower-casing and renaming.
ALL_TRAINING_DATA = convert_features_to_differences(ALL_TRAINING_DATA)
ALL_TRAINING_DATA.head()

dropping the following columns: ['max_conc_a', 'max_conc_b', 'ic50_a', 'ic50_b', 'h_a', 'h_b', 'einf_a', 'einf_b']


Unnamed: 0,c,d_1,d_2,source,max_conc_mean,max_conc_diff,ic50_mean,ic50_diff,h_mean,h_diff,einf_mean,einf_diff
0,BT-20,ADAM17,AKT,train,38,74,5.319857,8.639714,0.78349,0.051025,75.35793,32.470988
2,CAL-51,ADAM17,AKT,train,38,74,38.0,74.0,0.551013,0.351941,43.903661,65.505636
5,HCC1187,ADAM17,AKT,train,38,74,0.705015,0.589971,0.74845,1.496901,75.960632,48.078735
7,HCC1806,ADAM17,AKT,train,38,74,37.656996,74.686008,5.196285,9.60743,56.024845,14.749581
8,HCC1937,ADAM17,AKT,train,38,74,1.338962,0.677924,0.945659,0.347657,66.858792,50.658475


In [28]:
assert not (ALL_TRAINING_DATA['d_1'].str.lower() > ALL_TRAINING_DATA['d_2'].str.lower()).any()

In [29]:
ALL_TRAINING_DATA[ALL_TRAINING_DATA['d_1'] == ALL_TRAINING_DATA['d_2']]

Unnamed: 0,c,d_1,d_2,source,max_conc_mean,max_conc_diff,ic50_mean,ic50_diff,h_mean,h_diff,einf_mean,einf_diff
8333,C32,CHEK1,CHEK1,ch2_test,1.0,0,0.594893,0.465442,10.0,0.0,7.198338,14.396676
8334,COLO-205,CHEK1,CHEK1,ch2_test,1.0,0,0.368795,0.116022,2.582486,0.83928,0.0,0.0
8335,HT-29,CHEK1,CHEK1,ch2_test,1.0,0,0.138658,0.00442,3.463446,0.651387,3.395651,1.124138
8336,LS-513,CHEK1,CHEK1,ch2_test,0.3,0,0.3,0.0,10.0,0.0,19.300204,5.400285
8337,SW48,CHEK1,CHEK1,ch2_test,0.3,0,0.112245,0.014646,2.618789,0.810626,5.403952,7.946475
8338,SW620,CHEK1,CHEK1,ch2_test,0.3,0,0.181262,0.025201,1.840922,0.999279,0.0,0.0
8339,SW837,CHEK1,CHEK1,ch2_test,0.3,0,0.062493,0.000585,1.60761,0.345784,1.26003,2.52006
8340,SW948,CHEK1,CHEK1,ch2_test,1.0,0,0.311202,0.002641,10.0,0.0,3.149091,0.70008
9712,HCC1187,MTOR_1,MTOR_1,ch2_test,1.0,0,1.0,0.0,5.0,10.0,75.900005,48.199991


In [30]:
ALL_TRAINING_DATA.shape

(11171, 12)

#### ALL_TRAINING_DATA_WSYNERGY

In [31]:
ALL_TRAINING_DATA_WSYNERGY = convert_features_to_differences(ALL_TRAINING_DATA_WSYNERGY)

dropping the following columns: ['max_conc_a', 'max_conc_b', 'ic50_a', 'ic50_b', 'h_a', 'h_b', 'einf_a', 'einf_b']


In [32]:
ALL_TRAINING_DATA_WSYNERGY.head()

Unnamed: 0,c,d_1,d_2,synergy_score,qa,source,max_conc_mean,max_conc_diff,ic50_mean,ic50_diff,h_mean,h_diff,einf_mean,einf_diff
0,BT-20,ADAM17,AKT,29.54039,1,train,38,74,5.319857,8.639714,0.78349,0.051025,75.35793,32.470988
1,CAL-120,ADAM17,AKT,4.40141,-1,train,38,74,0.591607,0.816786,1.251839,2.503678,80.206,39.588001
2,CAL-51,ADAM17,AKT,0.315422,1,train,38,74,38.0,74.0,0.551013,0.351941,43.903661,65.505636
3,DU-4475,ADAM17,AKT,-41.73409,-1,train,38,74,37.660767,74.678467,5.997933,8.004134,32.385247,52.428481
4,HCC1143,ADAM17,AKT,35.53277,-1,train,38,74,0.699337,0.601327,5.0,10.0,94.549447,10.901106


In [33]:
assert not (ALL_TRAINING_DATA_WSYNERGY['d_1'].str.lower() > ALL_TRAINING_DATA_WSYNERGY['d_2'].str.lower()).any()

In [34]:
ALL_TRAINING_DATA_WSYNERGY[ALL_TRAINING_DATA_WSYNERGY['d_1'] == ALL_TRAINING_DATA_WSYNERGY['d_2']]

Unnamed: 0,c,d_1,d_2,synergy_score,qa,source,max_conc_mean,max_conc_diff,ic50_mean,ic50_diff,h_mean,h_diff,einf_mean,einf_diff
1905,VCaP,PIK3CB_PIK3CD,PIK3CB_PIK3CD,-128.0491,-3,train,1.0,0,0.505312,0.989375,5.112702,9.774596,28.317023,56.634046
8333,C32,CHEK1,CHEK1,,1,ch2_test,1.0,0,0.594893,0.465442,10.0,0.0,7.198338,14.396676
8334,COLO-205,CHEK1,CHEK1,,1,ch2_test,1.0,0,0.368795,0.116022,2.582486,0.83928,0.0,0.0
8335,HT-29,CHEK1,CHEK1,,1,ch2_test,1.0,0,0.138658,0.00442,3.463446,0.651387,3.395651,1.124138
8336,LS-513,CHEK1,CHEK1,,1,ch2_test,0.3,0,0.3,0.0,10.0,0.0,19.300204,5.400285
8337,SW48,CHEK1,CHEK1,,1,ch2_test,0.3,0,0.112245,0.014646,2.618789,0.810626,5.403952,7.946475
8338,SW620,CHEK1,CHEK1,,1,ch2_test,0.3,0,0.181262,0.025201,1.840922,0.999279,0.0,0.0
8339,SW837,CHEK1,CHEK1,,1,ch2_test,0.3,0,0.062493,0.000585,1.60761,0.345784,1.26003,2.52006
8340,SW948,CHEK1,CHEK1,,1,ch2_test,1.0,0,0.311202,0.002641,10.0,0.0,3.149091,0.70008
9712,HCC1187,MTOR_1,MTOR_1,,1,ch2_test,1.0,0,1.0,0.0,5.0,10.0,75.900005,48.199991


In [35]:
ALL_TRAINING_DATA_WSYNERGY.shape

(11575, 14)

## DD-DD

In [36]:
# Rows that have a synergy score and qa == 1.
tmp_df = ALL_TRAINING_DATA_WSYNERGY.loc[
    ALL_TRAINING_DATA_WSYNERGY['synergy_score'].notnull()
    & ALL_TRAINING_DATA_WSYNERGY['qa'].eq(1)
]

# Self-join on cell line: every ordered pair of rows sharing a cell line,
# including each row paired with itself.
ALL_TRAINING_DATA_PAIR = tmp_df.merge(tmp_df, on='c', suffixes=('_x', '_y'))

In [37]:
ALL_TRAINING_DATA_PAIR = fn.get_differences(ALL_TRAINING_DATA_PAIR, keep=0)

Skipping column 'd_1_x' because it appears to be a string...
Skipping column 'd_2_x' because it appears to be a string...
Skipping column 'source_x' because it appears to be a string...


In [38]:
display(ALL_TRAINING_DATA_PAIR.head())
print(ALL_TRAINING_DATA_PAIR.shape)

Unnamed: 0,c,d_1_x,d_2_x,synergy_score_x,source_x,d_1_y,d_2_y,synergy_score_y,source_y,synergy_score_diff,qa_diff,max_conc_mean_diff,max_conc_diff_diff,ic50_mean_diff,ic50_diff_diff,h_mean_diff,h_diff_diff,einf_mean_diff,einf_diff_diff
0,BT-20,ADAM17,AKT,29.54039,train,ADAM17,AKT,29.54039,train,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,BT-20,ADAM17,AKT,29.54039,train,ADAM17,MTOR_1,18.30981,train,-11.23058,0,0,0,32.482987,65.754599,0.273133,0.366647,-41.24452,35.755832
2,BT-20,ADAM17,AKT,29.54039,train,AKT,PIK3CA_4,12.91164,train,-16.62875,0,-37,-74,-5.120333,-8.620869,1.468704,1.927025,-6.587349,-12.521772
3,BT-20,ADAM17,AKT,29.54039,train,AKT,PIK3C_2,3.443067,train,-26.097323,0,-37,-74,-4.389948,-8.499532,0.22663,0.501371,-53.339294,11.566284
4,BT-20,ADAM17,AKT,29.54039,train,AKT,AKT_1,20.26494,train,-9.27545,0,-37,-74,-4.961664,-8.440988,1.523976,0.638628,-2.985506,-15.592117


(650078, 19)


In [39]:
# Summarise the pairwise differences per (drug pair x, drug pair y).
# The raw synergy scores and `qa_diff` are dropped first so they are not
# aggregated. `synergy_score_diff` is kept.
# NOTE(review): `fn.my_groupby` is project-local -- confirm its aggregation set.
ALL_TRAINING_DATA_PAIR_GBDDDD = fn.my_groupby(
    ALL_TRAINING_DATA_PAIR
    .drop(pd.Index(['synergy_score_x', 'synergy_score_y', 'qa_diff']), axis=1),
    by=['d_1_x', 'd_2_x', 'd_1_y', 'd_2_y'],
    name='gbdddd',
)

In [40]:
display(ALL_TRAINING_DATA_PAIR_GBDDDD.head())
print(ALL_TRAINING_DATA_PAIR_GBDDDD.shape)

Unnamed: 0,d_1_x,d_2_x,d_1_y,d_2_y,synergy_score_diff_gbdddd_min,synergy_score_diff_gbdddd_max,synergy_score_diff_gbdddd_mean,synergy_score_diff_gbdddd_std,max_conc_mean_diff_gbdddd_min,max_conc_mean_diff_gbdddd_max,max_conc_mean_diff_gbdddd_mean,max_conc_mean_diff_gbdddd_std,max_conc_diff_diff_gbdddd_min,max_conc_diff_diff_gbdddd_max,max_conc_diff_diff_gbdddd_mean,max_conc_diff_diff_gbdddd_std,ic50_mean_diff_gbdddd_min,ic50_mean_diff_gbdddd_max,ic50_mean_diff_gbdddd_mean,ic50_mean_diff_gbdddd_std,ic50_diff_diff_gbdddd_min,ic50_diff_diff_gbdddd_max,ic50_diff_diff_gbdddd_mean,ic50_diff_diff_gbdddd_std,h_mean_diff_gbdddd_min,h_mean_diff_gbdddd_max,h_mean_diff_gbdddd_mean,h_mean_diff_gbdddd_std,h_diff_diff_gbdddd_min,h_diff_diff_gbdddd_max,h_diff_diff_gbdddd_mean,h_diff_diff_gbdddd_std,einf_mean_diff_gbdddd_min,einf_mean_diff_gbdddd_max,einf_mean_diff_gbdddd_mean,einf_mean_diff_gbdddd_std,einf_diff_diff_gbdddd_min,einf_diff_diff_gbdddd_max,einf_diff_diff_gbdddd_mean,einf_diff_diff_gbdddd_std,count_gbdddd
0,ADAM17,AKT,ADAM17,AKT,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14
1,ADAM17,AKT,ADAM17,AKT_1,-13.727708,59.61906,10.69852,18.063062,0,0,0,0,0,0,0,0,-37.109566,36.844716,3.425643,18.461227,-74.017703,74.310569,7.137066,37.010933,-3.979602,5.0,-0.20068,2.03174,-10.0,0.923857,-2.241657,3.752806,-24.997457,23.789659,2.053373,13.966635,-43.539781,56.783258,-6.807273,26.576543,12
2,ADAM17,AKT,ADAM17,BCL2_BCL2L1,-30.838276,29.270519,-8.928698,19.555292,0,0,0,0,0,0,0,0,-36.656996,32.680143,-0.577546,20.041916,-74.686008,65.360286,-1.659157,40.481845,-5.196285,3.452462,-1.313444,2.979203,-9.60743,6.904924,-2.576442,5.765,-20.31812,43.975155,8.347151,25.161015,-82.558103,28.429751,-14.350618,34.703481,7
3,ADAM17,AKT,ADAM17,FGFR,-47.0803,8.028154,-17.597569,15.725436,0,0,0,0,0,0,0,0,-0.560949,36.180508,5.555523,12.666529,-0.768083,74.283136,11.117236,26.10908,-5.0,8.868272,-0.29892,4.145075,-10.0,-0.13917,-3.550664,3.224377,-10.965267,41.279052,12.626308,17.611581,-82.558103,-1.497775,-31.971915,26.481145,8
4,ADAM17,AKT,ADAM17,MAP2K_1,-57.92772,10.094198,-14.3823,20.876312,0,0,0,0,0,0,0,0,-37.367642,36.173713,-0.133243,19.657553,-73.264716,74.296727,0.097005,39.439729,-5.0,4.448987,-0.461334,2.573877,-10.0,9.648058,-0.686826,5.392154,-15.992621,52.045898,18.212607,24.991134,-91.270496,31.985242,-31.438697,43.512866,8


(119531, 41)


## C-C

In [41]:
# Rows that have a synergy score and qa == 1.
tmp_df = ALL_TRAINING_DATA_WSYNERGY.loc[
    ALL_TRAINING_DATA_WSYNERGY['synergy_score'].notnull()
    & ALL_TRAINING_DATA_WSYNERGY['qa'].eq(1)
]

# Self-join on the drug pair: every ordered pair of rows (cell lines) that
# share the same (d_1, d_2), including each row paired with itself.
# NOTE: this rebinds ALL_TRAINING_DATA_PAIR, replacing the cell-line-joined
# table built in the DD-DD section.
ALL_TRAINING_DATA_PAIR = tmp_df.merge(
    tmp_df, on=['d_1', 'd_2'], suffixes=('_x', '_y')
)

In [42]:
ALL_TRAINING_DATA_PAIR = fn.get_differences(ALL_TRAINING_DATA_PAIR, keep=0)

Skipping column 'c_x' because it appears to be a string...
Skipping column 'd_1' because it appears to be a string...
Skipping column 'source_x' because it appears to be a string...


In [43]:
display(ALL_TRAINING_DATA_PAIR.head())
print(ALL_TRAINING_DATA_PAIR.shape)

Unnamed: 0,c_x,d_1,d_2,synergy_score_x,source_x,c_y,synergy_score_y,source_y,synergy_score_diff,qa_diff,max_conc_mean_diff,max_conc_diff_diff,ic50_mean_diff,ic50_diff_diff,h_mean_diff,h_diff_diff,einf_mean_diff,einf_diff_diff
0,BT-20,ADAM17,AKT,29.54039,train,BT-20,29.54039,train,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,BT-20,ADAM17,AKT,29.54039,train,CAL-51,0.315422,train,-29.224968,0,0,0,32.680143,65.360286,-0.232476,0.300916,-31.45427,33.034648
2,BT-20,ADAM17,AKT,29.54039,train,HCC1187,-0.801993,train,-30.342383,0,0,0,-4.614842,-8.049743,-0.035039,1.445876,0.602702,15.607747
3,BT-20,ADAM17,AKT,29.54039,train,HCC1806,-26.73325,train,-56.27364,0,0,0,32.337139,66.046294,4.412795,9.556405,-19.333086,-17.721407
4,BT-20,ADAM17,AKT,29.54039,train,HCC1937,32.16552,train,2.62513,0,0,0,-3.980895,-7.96179,0.162169,0.296632,-8.499139,18.187487


(98266, 18)


In [44]:
# Summarise the pairwise differences per (cell line x, cell line y).
# The raw synergy scores and `qa_diff` are dropped first so they are not
# aggregated. `synergy_score_diff` is kept.
# NOTE(review): `fn.my_groupby` is project-local -- confirm its aggregation set.
ALL_TRAINING_DATA_PAIR_GBCC = fn.my_groupby(
    ALL_TRAINING_DATA_PAIR
    .drop(pd.Index(['synergy_score_x', 'synergy_score_y', 'qa_diff']), axis=1),
    by=['c_x', 'c_y'],
    name='gbcc',
)

In [45]:
display(ALL_TRAINING_DATA_PAIR_GBCC.head())
print(ALL_TRAINING_DATA_PAIR_GBCC.shape)

Unnamed: 0,c_x,c_y,synergy_score_diff_gbcc_min,synergy_score_diff_gbcc_max,synergy_score_diff_gbcc_mean,synergy_score_diff_gbcc_std,max_conc_mean_diff_gbcc_min,max_conc_mean_diff_gbcc_max,max_conc_mean_diff_gbcc_mean,max_conc_mean_diff_gbcc_std,max_conc_diff_diff_gbcc_min,max_conc_diff_diff_gbcc_max,max_conc_diff_diff_gbcc_mean,max_conc_diff_diff_gbcc_std,ic50_mean_diff_gbcc_min,ic50_mean_diff_gbcc_max,ic50_mean_diff_gbcc_mean,ic50_mean_diff_gbcc_std,ic50_diff_diff_gbcc_min,ic50_diff_diff_gbcc_max,ic50_diff_diff_gbcc_mean,ic50_diff_diff_gbcc_std,h_mean_diff_gbcc_min,h_mean_diff_gbcc_max,h_mean_diff_gbcc_mean,h_mean_diff_gbcc_std,h_diff_diff_gbcc_min,h_diff_diff_gbcc_max,h_diff_diff_gbcc_mean,h_diff_diff_gbcc_std,einf_mean_diff_gbcc_min,einf_mean_diff_gbcc_max,einf_mean_diff_gbcc_mean,einf_mean_diff_gbcc_std,einf_diff_diff_gbcc_min,einf_diff_diff_gbcc_max,einf_diff_diff_gbcc_mean,einf_diff_diff_gbcc_std,count_gbcc
0,22RV1,22RV1,-10.434812,10.434812,0.0,5.577642,0.0,0,0.0,0.0,0,0,0.0,0.0,-0.073434,0.073434,0.0,0.039252,-0.146868,0.146868,0.0,0.078504,-7.425779,7.425779,0.0,3.969246,-2.923574,2.923574,0.0,1.562716,-16.218178,16.218178,0.0,8.668981,-10.697618,10.697618,0.0,5.718117,8
1,22RV1,647-V,-38.495162,-0.911056,-21.443179,15.9788,-2.5,2,0.375,1.973787,-9,0,-3.25,3.947573,-0.792704,0.427882,-0.3107,0.567997,-2.56146,0.334965,-1.320819,1.406851,-9.109026,-1.683247,-4.922672,3.125422,-9.078683,1.781947,-3.900027,5.077758,-55.023462,7.237651,-21.246165,26.778663,-48.481514,89.156957,40.925158,61.168521,4
2,22RV1,BFTC-905,-97.788232,25.303103,-47.05829,57.101456,-2.5,2,0.375,1.973787,-9,0,-3.25,3.947573,-0.507477,-0.006623,-0.182412,0.223045,-0.604303,0.160113,-0.076305,0.357542,-8.929053,0.016018,-3.683094,3.930894,-9.533634,0.170093,-3.037265,4.531442,-41.104218,-24.886039,-30.445163,7.41111,-44.594786,23.102822,-14.972879,28.402871,4
3,22RV1,BT-20,-40.450913,13.351193,-3.691292,24.881288,-4.5,0,-1.625,1.973787,-9,0,-3.25,3.947573,-0.622884,0.208182,-0.354822,0.381565,-2.901099,0.774364,-1.350787,1.786615,-9.552197,0.190321,-3.964428,4.164429,-7.041037,0.038093,-2.567267,3.249877,-19.015786,25.408437,1.169447,19.448251,-48.418107,26.369906,-13.238823,30.948683,4
4,22RV1,BT-549,-13.89004,51.14379,19.860594,31.038996,-4.5,0,-1.625,1.973787,-9,0,-3.25,3.947573,-1.487993,0.311025,-0.6723,0.915319,-2.833588,0.568679,-1.32177,1.702479,-7.904045,-0.478266,-4.536142,3.064348,-9.301178,2.03535,-4.135657,5.579639,-58.022301,11.644696,-22.977385,32.087677,-50.755073,7.962792,-15.359549,25.343474,4


(5755, 39)


## All data

### Drugs

In [46]:
# Load the drug annotation table shipped with the challenge data.
drug_info_release = pd.read_csv(
    '../downloads/challenge_data/drug_synergy_data/drug_info_release.csv/'
    'Drug_info_release.csv',
    sep=','
)

# Snapshot of the original column names as a plain list.
drug_info_release_columns = drug_info_release.columns.tolist()

In [47]:
display(drug_info_release.head())
print(drug_info_release.shape)

Unnamed: 0,ChallengeName,Target(Official Symbol),HBA,cLogP,HBD,Lipinski,SMILES or PubChem ID,MW
0,ADAM17,ADAM17,,,,,,
1,AKT,"AKT1, AKT2, AKT3",8.0,1.18,5.0,0.0,c1cc(ccc1[C@H](CCO)NC(=O)C2(CCN(CC2)c3c4cc[nH]...,428.9
2,AKT_1,AKT*,6.0,3.24,3.0,0.0,c1ccc(cc1)c2cc3c(ccn4c3n[nH]c4=O)nc2c5ccc(cc5)...,407.5
3,AKT_PIK3C,"AKT*,PIK3C*",,,,,,
4,AKT_SGK,"AKT*,SGK*",,,,,,


(119, 8)


In [48]:
DRUGS = drug_info_release['ChallengeName'].drop_duplicates()

In [49]:
print(DRUGS.shape)
assert len(DRUGS) == 119

(119,)


### Drug pairs

In [50]:
# Cross join of the drug list with itself: give every row the same dummy key
# (`index` == 1) and merge on that key.
DRUGS_tmp = DRUGS.reset_index().assign(index=1)
DRUG_PAIRS = (
    DRUGS_tmp
    .merge(DRUGS_tmp, on='index')
    .drop('index', axis=1)
    .rename(columns={'ChallengeName_x': 'd_1', 'ChallengeName_y': 'd_2'})
)
# Keep one orientation of each pair (case-insensitive ordering). Self-pairs are
# kept on purpose (<= instead of <): the data sometimes contains a synergy
# score where drug A and drug B are the same.
DRUG_PAIRS = DRUG_PAIRS.loc[
    DRUG_PAIRS['d_1'].str.lower() <= DRUG_PAIRS['d_2'].str.lower()
]

In [51]:
display(DRUG_PAIRS.head())
print(DRUG_PAIRS.shape)
# Unordered pairs including self-pairs: n * (n + 1) / 2.
# Computed in exact integer arithmetic: n * (n + 1) is always even, so `//` is
# lossless. The previous `n * n / 2 + n / 2` compared against a float and gives
# the wrong count under integer-division semantics.
assert len(DRUG_PAIRS) == len(DRUGS) * (len(DRUGS) + 1) // 2

Unnamed: 0,d_1,d_2
0,ADAM17,ADAM17
1,ADAM17,AKT
2,ADAM17,AKT_1
3,ADAM17,AKT_PIK3C
4,ADAM17,AKT_SGK


(7140, 2)


### Cell lines

In [52]:
# Re-read the challenge 1 training CSV to derive the list of cell lines.
# NOTE(review): this rebinds `ch1_train_combination_and_monotherapy`, replacing
# the frame loaded in the "Training data" section that carried the extra
# `source` column. Re-running the "Combine" cell after this one would therefore
# produce training rows without a `source` value.
ch1_train_combination_and_monotherapy = pd.read_csv(
    '../downloads/challenge_data/drug_synergy_data/ch1_train_combination_and_monotherapy.csv/'
    'ch1_train_combination_and_monoTherapy.csv', sep=','
)

In [53]:
display(ch1_train_combination_and_monotherapy.head())
print(ch1_train_combination_and_monotherapy.shape)

Unnamed: 0,CELL_LINE,COMPOUND_A,COMPOUND_B,MAX_CONC_A,MAX_CONC_B,IC50_A,H_A,Einf_A,IC50_B,H_B,Einf_B,SYNERGY_SCORE,QA,COMBINATION_ID
0,BT-20,ADAM17,AKT,1,75,1.0,0.809002,59.122436,9.639714,0.757977,91.593425,29.54039,1,ADAM17.AKT
1,CAL-120,ADAM17,AKT,1,75,0.183214,2.503678,60.411999,1.0,0.0,100.0,4.40141,-1,ADAM17.AKT
2,CAL-51,ADAM17,AKT,1,75,1.0,0.726984,11.150843,75.0,0.375043,76.656479,0.315422,1,ADAM17.AKT
3,DU-4475,ADAM17,AKT,1,75,0.321533,10.0,58.599487,75.0,1.995866,6.171007,-41.73409,-1,ADAM17.AKT
4,HCC1143,ADAM17,AKT,1,75,0.398673,10.0,89.098894,1.0,0.0,100.0,35.53277,-1,ADAM17.AKT


(2199, 14)


In [54]:
CELL_LINES = ch1_train_combination_and_monotherapy['CELL_LINE'].drop_duplicates()

In [55]:
display(CELL_LINES.head())
print(CELL_LINES.shape)
assert len(CELL_LINES) == 85

0      BT-20
1    CAL-120
2     CAL-51
3    DU-4475
4    HCC1143
Name: CELL_LINE, dtype: object

(85,)


### DDC

In [56]:
# Temporary copies that carry a constant dummy key (`index` == 1), so that a
# merge on that key produces the drug-pair x cell-line cross product.
DRUG_PAIRS_tmp = DRUG_PAIRS.assign(index=1)
CELL_LINES_tmp = (
    CELL_LINES
    .reset_index()
    .rename(columns={'CELL_LINE': 'c'})
    .assign(index=1)
)

In [57]:
DRUG_PAIRS_CL = DRUG_PAIRS_tmp.merge(CELL_LINES_tmp, on=['index']).drop('index', axis=1)

In [58]:
DRUG_PAIRS_CL.head()

Unnamed: 0,d_1,d_2,c
0,ADAM17,ADAM17,BT-20
1,ADAM17,ADAM17,CAL-120
2,ADAM17,ADAM17,CAL-51
3,ADAM17,ADAM17,DU-4475
4,ADAM17,ADAM17,HCC1143


In [59]:
print(DRUG_PAIRS_CL.shape)
assert len(DRUG_PAIRS_CL) == len(DRUG_PAIRS) * len(CELL_LINES)

(606900, 3)


In [60]:
assert not (DRUG_PAIRS_CL['d_1'].str.lower() > DRUG_PAIRS_CL['d_2'].str.lower()).any()

In [61]:
# Every training unique_id ("d_1.d_2.c") must also be present in the all-by-all table.
assert set(
    ALL_TRAINING_DATA[['d_1', 'd_2', 'c']].apply('.'.join, axis=1)
).issubset(
    DRUG_PAIRS_CL[['d_1', 'd_2', 'c']].apply('.'.join, axis=1)
)

In [62]:
ALL_TRAINING_DATA.head()

Unnamed: 0,c,d_1,d_2,source,max_conc_mean,max_conc_diff,ic50_mean,ic50_diff,h_mean,h_diff,einf_mean,einf_diff
0,BT-20,ADAM17,AKT,train,38,74,5.319857,8.639714,0.78349,0.051025,75.35793,32.470988
2,CAL-51,ADAM17,AKT,train,38,74,38.0,74.0,0.551013,0.351941,43.903661,65.505636
5,HCC1187,ADAM17,AKT,train,38,74,0.705015,0.589971,0.74845,1.496901,75.960632,48.078735
7,HCC1806,ADAM17,AKT,train,38,74,37.656996,74.686008,5.196285,9.60743,56.024845,14.749581
8,HCC1937,ADAM17,AKT,train,38,74,1.338962,0.677924,0.945659,0.347657,66.858792,50.658475


In [63]:
# Preview DRUG_PAIRS_CL again (same preview as shown right after the cross join).
DRUG_PAIRS_CL.head()

Unnamed: 0,d_1,d_2,c
0,ADAM17,ADAM17,BT-20
1,ADAM17,ADAM17,CAL-120
2,ADAM17,ADAM17,CAL-51
3,ADAM17,ADAM17,DU-4475
4,ADAM17,ADAM17,HCC1143


## Submission data

### DRUG_PAIRS_CL_SUBMISSION

In [64]:
# Root directory of the demo submissions and the four files read from it
# (paths are relative to DEMO_SUBMISSIONS_DIR).
DEMO_SUBMISSIONS_DIR = '../downloads/challenge_resources/demo_submissions'

X1 = './subchallenge_1/final_submission/combination_priority.csv/combination_priority.csv'
X2 = './subchallenge_1/final_submission/prediction.csv/prediction.csv'
X3 = './subchallenge_2/final_submission/confidence_matrix.csv/confidence_matrix.csv'
X4 = './subchallenge_2/final_submission/synergy_matrix.csv/synergy_matrix.csv'

# Load every file into its matching `*_data` frame, in the same order as above.
X1_data, X2_data, X3_data, X4_data = [
    pd.read_csv(op.join(DEMO_SUBMISSIONS_DIR, relative_path))
    for relative_path in (X1, X2, X3, X4)
]

In [65]:
# Peek at the subchallenge 1 demo `prediction.csv` file.
X2_data.head(2)

Unnamed: 0,CELL_LINE,COMBINATION_ID,PREDICTION
0,CAMA-1,ADAM17.AKT,0
1,HCC1395,ADAM17.AKT,0


In [66]:
# Turn the wide matrix (one row per combination, one column per cell line) into
# long form. After stack() + reset_index() the former column labels end up in
# the automatically named 'level_1' column; the value column keeps the name 0.
_long_form_names = {'Unnamed: 0': 'COMBINATION_ID', 'level_1': 'CELL_LINE'}
X4_data_stack = X4_data.set_index('Unnamed: 0').stack().reset_index()
X4_data_stack = X4_data_stack.rename(columns=_long_form_names)
X4_data_stack.head(2)

Unnamed: 0,COMBINATION_ID,CELL_LINE,0
0,AKT.AKT,BT-20,0
1,AKT.AKT,BT-549,0


In [67]:
# Collect every distinct (combination, cell line) pair that appears in either
# the prediction file or the stacked synergy matrix.
_key_columns = ['COMBINATION_ID', 'CELL_LINE']
_submission_keys = [frame[_key_columns] for frame in (X2_data, X4_data_stack)]
DRUG_PAIRS_CL_SUBMISSION = pd.concat(_submission_keys, ignore_index=True).drop_duplicates()
DRUG_PAIRS_CL_SUBMISSION.head()

Unnamed: 0,COMBINATION_ID,CELL_LINE
0,ADAM17.AKT,CAMA-1
1,ADAM17.AKT,HCC1395
2,ADAM17.AKT,Hs-578-T
3,ADAM17.AKT,MDA-MB-157
4,ADAM17.AKT,MDA-MB-468


In [68]:
# Split COMBINATION_ID ('<drug 1>.<drug 2>') into its two drug names.
_combination_parts = (
    DRUG_PAIRS_CL_SUBMISSION['COMBINATION_ID']
    .apply(lambda combination_id: combination_id.split('.'))
)
# Fail loudly on malformed ids: unpacking through zip(*) would silently drop
# everything after the shortest split instead of reporting the problem.
assert (_combination_parts.str.len() == 2).all(), (
    'every COMBINATION_ID must contain exactly two "."-separated drug names'
)

# `reindex(columns=...)` replaces `reindex_axis(..., axis=1)`, which was removed
# in pandas 1.0. It keeps only the listed columns (so COMBINATION_ID is dropped)
# and fixes their order.
DRUG_PAIRS_CL_SUBMISSION = (
    DRUG_PAIRS_CL_SUBMISSION
    .assign(d_1=_combination_parts.str[0], d_2=_combination_parts.str[1])
    .rename(columns={'CELL_LINE': 'c'})
    .reindex(columns=['d_1', 'd_2', 'c'])
)
DRUG_PAIRS_CL_SUBMISSION.head()

Unnamed: 0,d_1,d_2,c
0,ADAM17,AKT,CAMA-1
1,ADAM17,AKT,HCC1395
2,ADAM17,AKT,Hs-578-T
3,ADAM17,AKT,MDA-MB-157
4,ADAM17,AKT,MDA-MB-468


### Missing (combinations required for submission but absent from the training data)

In [69]:
# Show the first rows and the overall (rows, columns) shape of ALL_TRAINING_DATA_WSYNERGY.
display(ALL_TRAINING_DATA_WSYNERGY.head())
print(ALL_TRAINING_DATA_WSYNERGY.shape)

Unnamed: 0,c,d_1,d_2,synergy_score,qa,source,max_conc_mean,max_conc_diff,ic50_mean,ic50_diff,h_mean,h_diff,einf_mean,einf_diff
0,BT-20,ADAM17,AKT,29.54039,1,train,38,74,5.319857,8.639714,0.78349,0.051025,75.35793,32.470988
1,CAL-120,ADAM17,AKT,4.40141,-1,train,38,74,0.591607,0.816786,1.251839,2.503678,80.206,39.588001
2,CAL-51,ADAM17,AKT,0.315422,1,train,38,74,38.0,74.0,0.551013,0.351941,43.903661,65.505636
3,DU-4475,ADAM17,AKT,-41.73409,-1,train,38,74,37.660767,74.678467,5.997933,8.004134,32.385247,52.428481
4,HCC1143,ADAM17,AKT,35.53277,-1,train,38,74,0.699337,0.601327,5.0,10.0,94.549447,10.901106


(11575, 14)


In [70]:
# Number of rows per `source` label.
Counter(ALL_TRAINING_DATA_WSYNERGY['source'])

Counter({'ch1_test': 1089,
         'ch1_validate': 591,
         'ch2_test': 3826,
         'ch2_validate': 3870,
         'train': 2199})

In [71]:
def _join_unique_id(frame):
    """Return the '.'-joined (d_1, d_2, c) identifier for every row of `frame`."""
    return frame[['d_1', 'd_2', 'c']].apply('.'.join, axis=1)


# Ids available in the training data vs. ids needed for the submission files.
# (`unque_ids_required` keeps its original spelling so other references still work.)
unique_ids_present = set(_join_unique_id(ALL_TRAINING_DATA_WSYNERGY))
unque_ids_required = set(_join_unique_id(DRUG_PAIRS_CL_SUBMISSION))
unique_ids_missing = unque_ids_required.difference(unique_ids_present)

In [72]:
# Build the '<d_1>.<d_2>.<c>' identifier with vectorized string concatenation.
# A row-wise `.apply('.'.join, axis=1)` produces the same ids for string columns
# but calls Python once per row, which is slow for the ~600k rows of DRUG_PAIRS_CL.
# Note: this adds the column to DRUG_PAIRS_CL in place.
DRUG_PAIRS_CL['unique_id'] = (
    DRUG_PAIRS_CL['d_1'] + '.' + DRUG_PAIRS_CL['d_2'] + '.' + DRUG_PAIRS_CL['c']
)

In [73]:
# Keep the all-by-all rows whose id is required but not present, drop the
# helper id column and label the rows with their own `source` value.
# NOTE(review): required ids that do not occur in DRUG_PAIRS_CL at all are not
# selected here — confirm that none are expected.
_is_missing = DRUG_PAIRS_CL['unique_id'].isin(unique_ids_missing)
DRUG_PAIRS_CL_MISSING = (
    DRUG_PAIRS_CL
    .loc[_is_missing]
    .drop(['unique_id'], axis=1)
    .assign(source='ch2_validate_extra')
)

In [74]:
# Preview the combinations that still have to be added.
DRUG_PAIRS_CL_MISSING.head()

Unnamed: 0,d_1,d_2,c,source
3485,ADAM17,ERBB,BT-20,ch2_validate_extra
3486,ADAM17,ERBB,CAL-120,ch2_validate_extra
3487,ADAM17,ERBB,CAL-51,ch2_validate_extra
3488,ADAM17,ERBB,DU-4475,ch2_validate_extra
3489,ADAM17,ERBB,HCC1143,ch2_validate_extra


In [75]:
# Add the missing combinations to the table. DRUG_PAIRS_CL_MISSING only has the
# four key columns, so rows that come solely from it get NaN everywhere else.
ALL_TRAINING_DATA_WSYNERGY = pd.merge(
    ALL_TRAINING_DATA_WSYNERGY,
    DRUG_PAIRS_CL_MISSING,
    how='outer',
    on=['d_1', 'd_2', 'c', 'source'],
)

# Save results

In [80]:
import os
from importlib import reload

# Reload the project helper modules so local edits are picked up without
# restarting the kernel.
import common
reload(common)
common.configure_logging(level='warning')
import csv2sql
reload(csv2sql)

# The connection string can be overridden through the AZ_DREAM_DB_URL environment
# variable, so host/user details need not be edited (or committed) in the
# notebook; the default is the previously hard-coded value.
DB_CONNECTION_STRING = os.environ.get(
    'AZ_DREAM_DB_URL',
    'mysql://strokach:@192.168.6.19:3306/az_dream_2015',
)

# NOTE(review): the meaning of the 2nd ('all_by_all') and 3rd ('192.168.233.20')
# positional arguments is defined by csv2sql.DataFrameToMySQL — confirm there.
db = csv2sql.DataFrameToMySQL(
    DB_CONNECTION_STRING,
    'all_by_all',
    '192.168.233.20',
    echo=False
)

#### ALL_TRAINING_DATA

In [81]:
# Inspect the last two rows of ALL_TRAINING_DATA before it is exported.
ALL_TRAINING_DATA.tail(2)

Unnamed: 0,c,d_1,d_2,source,max_conc_mean,max_conc_diff,ic50_mean,ic50_diff,h_mean,h_diff,einf_mean,einf_diff
11573,TCCSUP,ATR_4,Gemcitabine,ch2_test,1.55,2.9,0.371314,0.737354,2.448096,0.891287,11.292497,7.308207
11574,UM-UC-3,ATR_4,Gemcitabine,ch2_test,1.55,2.9,0.519804,1.019486,6.498083,7.003834,25.21861,48.560481


In [82]:
# (rows, columns) of ALL_TRAINING_DATA.
ALL_TRAINING_DATA.shape

(11171, 12)

In [83]:
# Export ALL_TRAINING_DATA to the database table of the same name.
# NOTE(review): the (columns, bool) pairs look like index specs (bool = unique?)
# — confirm in csv2sql.
_all_training_data_indexes = [
    (('d_1', 'd_2', 'c', 'ic50_mean'), True),
    (('c', 'd_1', 'd_2'), False),
]
db.import_table(ALL_TRAINING_DATA, 'ALL_TRAINING_DATA', _all_training_data_indexes)

FML
FML
FML
FML


#### ALL_TRAINING_DATA_WSYNERGY

In [84]:
# Preview ALL_TRAINING_DATA_WSYNERGY before it is exported.
ALL_TRAINING_DATA_WSYNERGY.head()

Unnamed: 0,c,d_1,d_2,synergy_score,qa,source,max_conc_mean,max_conc_diff,ic50_mean,ic50_diff,h_mean,h_diff,einf_mean,einf_diff
0,BT-20,ADAM17,AKT,29.54039,1,train,38,74,5.319857,8.639714,0.78349,0.051025,75.35793,32.470988
1,CAL-120,ADAM17,AKT,4.40141,-1,train,38,74,0.591607,0.816786,1.251839,2.503678,80.206,39.588001
2,CAL-51,ADAM17,AKT,0.315422,1,train,38,74,38.0,74.0,0.551013,0.351941,43.903661,65.505636
3,DU-4475,ADAM17,AKT,-41.73409,-1,train,38,74,37.660767,74.678467,5.997933,8.004134,32.385247,52.428481
4,HCC1143,ADAM17,AKT,35.53277,-1,train,38,74,0.699337,0.601327,5.0,10.0,94.549447,10.901106


In [88]:
# Number of rows with a known synergy score. An earlier version of this check
# expected 2199, which is the number of 'train' rows in the source counts.
# NOTE(review): 6660 equals train (2199) + ch1_validate (591) + ch2_validate (3870)
# in those counts — presumably the sources that carry scores; confirm.
assert ALL_TRAINING_DATA_WSYNERGY['synergy_score'].count() == 6660

In [89]:
# Export ALL_TRAINING_DATA_WSYNERGY to the database table of the same name.
# NOTE(review): the (columns, bool) pairs look like index specs (bool = unique?)
# — confirm in csv2sql.
_wsynergy_indexes = [
    (('d_1', 'd_2', 'c'), False),
    (('c', 'd_1', 'd_2'), False),
    (('source', 'd_1', 'd_2', 'c'), False),
    (('source', 'qa', 'd_1', 'd_2', 'c'), False),
]
db.import_table(ALL_TRAINING_DATA_WSYNERGY, 'ALL_TRAINING_DATA_WSYNERGY', _wsynergy_indexes)

FML
FML
FML
FML


#### ALL_TRAINING_DATA_GBD

In [90]:
# Use the short column name `d` for the compound in the exported copy.
_gbd_column_names = {'COMPOUND': 'd'}
ALL_TRAINING_DATA_GBD_tmp = ALL_TRAINING_DATA_GBD.rename(columns=_gbd_column_names)

In [91]:
# Preview the renamed table (keyed by `d`).
ALL_TRAINING_DATA_GBD_tmp.head(2)

Unnamed: 0,d,max_conc_gbd_min,max_conc_gbd_max,max_conc_gbd_mean,max_conc_gbd_std,ic50_gbd_min,ic50_gbd_max,ic50_gbd_mean,ic50_gbd_std,h_gbd_min,h_gbd_max,h_gbd_mean,h_gbd_std,einf_gbd_min,einf_gbd_max,einf_gbd_mean,einf_gbd_std,count_gbd
0,ADAM17,1.0,3,1.333333,0.746996,0.0001,3,0.678982,0.670769,0,10,2.359189,3.263725,0,100,49.203516,33.978373,228
1,AKT,0.003,75,2.977014,9.948914,1e-06,75,0.878123,3.40208,0,10,3.124647,3.847184,0,100,53.261517,35.90222,1041


In [92]:
# Export the renamed frame to the 'ALL_TRAINING_DATA_GBD' table.
# NOTE(review): the (columns, bool) pair looks like an index spec (bool = unique?)
# — confirm in csv2sql.
_gbd_indexes = [(('d',), True)]
db.import_table(ALL_TRAINING_DATA_GBD_tmp, 'ALL_TRAINING_DATA_GBD', _gbd_indexes)

#### ALL_TRAINING_DATA_GBDCL

In [93]:
# Use the short column names `d` (compound) and `c` (cell line) in the exported copy.
_gbdcl_column_names = {'COMPOUND': 'd', 'CELL_LINE': 'c'}
ALL_TRAINING_DATA_GBDCL_tmp = ALL_TRAINING_DATA_GBDCL.rename(columns=_gbdcl_column_names)

In [94]:
# Preview the renamed table (keyed by `d` and `c`).
ALL_TRAINING_DATA_GBDCL_tmp.head(2)

Unnamed: 0,d,c,max_conc_gbdc_min,max_conc_gbdc_max,max_conc_gbdc_mean,max_conc_gbdc_std,ic50_gbdc_min,ic50_gbdc_max,ic50_gbdc_mean,ic50_gbdc_std,h_gbdc_min,h_gbdc_max,h_gbdc_mean,h_gbdc_std,einf_gbdc_min,einf_gbdc_max,einf_gbdc_mean,einf_gbdc_std,count_gbdc
0,ADAM17,A549,1,3,2.333333,1.154701,1.0,3,1.666667,1.154701,0.0,1.140817,0.380272,0.658651,10.136795,100.0,70.045598,51.882545,3
1,ADAM17,BT-20,1,1,1.0,0.0,0.10344,1,0.764106,0.30637,0.383972,1.490744,0.973217,0.403961,0.0,82.067345,39.904304,29.518787,8


In [95]:
# Export the renamed frame. Note the table is called 'ALL_TRAINING_DATA_GBDC'
# (matching the `_gbdc_` column suffix), not ..._GBDCL like the variable.
# NOTE(review): the (columns, bool) pair looks like an index spec (bool = unique?)
# — confirm in csv2sql.
_gbdc_indexes = [(('d', 'c'), True)]
db.import_table(ALL_TRAINING_DATA_GBDCL_tmp, 'ALL_TRAINING_DATA_GBDC', _gbdc_indexes)

#### ALL_TRAINING_DATA_GBCL

In [96]:
# Preview the table keyed by `c`.
ALL_TRAINING_DATA_GBCL.head(2)

Unnamed: 0,c,max_conc_gbc_min,max_conc_gbc_max,max_conc_gbc_mean,max_conc_gbc_std,ic50_gbc_min,ic50_gbc_max,ic50_gbc_mean,ic50_gbc_std,h_gbc_min,h_gbc_max,h_gbc_mean,h_gbc_std,einf_gbc_min,einf_gbc_max,einf_gbc_mean,einf_gbc_std,count_gbc
0,22RV1,1.0,10,4.5,4.229526,0.0003,10,1.659355,2.561437,0,10,4.110479,4.284195,23.235716,100,64.816692,22.818861,28
1,647-V,0.003,3,2.064613,1.269595,1e-05,3,1.028795,0.989565,0,10,3.777782,3.694869,0.0,100,43.452267,37.343984,382


In [97]:
# Export ALL_TRAINING_DATA_GBCL. Note the table is called 'ALL_TRAINING_DATA_GBC'
# (matching the `_gbc_` column suffix), not ..._GBCL like the variable.
# NOTE(review): the (columns, bool) pair looks like an index spec (bool = unique?)
# — confirm in csv2sql.
_gbc_indexes = [(('c', ), True)]
db.import_table(ALL_TRAINING_DATA_GBCL, 'ALL_TRAINING_DATA_GBC', _gbc_indexes)

#### ALL_TRAINING_DATA_GBDP

In [98]:
# Preview the table keyed by the drug pair (`d_1`, `d_2`).
ALL_TRAINING_DATA_GBDP.head(2)

Unnamed: 0,d_1,d_2,count_gbdd,max_conc_gbdd_min_mean,max_conc_gbdd_min_diff,max_conc_gbdd_max_mean,max_conc_gbdd_max_diff,max_conc_gbdd_mean_mean,max_conc_gbdd_mean_diff,max_conc_gbdd_std_mean,max_conc_gbdd_std_diff,ic50_gbdd_min_mean,ic50_gbdd_min_diff,ic50_gbdd_max_mean,ic50_gbdd_max_diff,ic50_gbdd_mean_mean,ic50_gbdd_mean_diff,ic50_gbdd_std_mean,ic50_gbdd_std_diff,h_gbdd_min_mean,h_gbdd_min_diff,h_gbdd_max_mean,h_gbdd_max_diff,h_gbdd_mean_mean,h_gbdd_mean_diff,h_gbdd_std_mean,h_gbdd_std_diff,einf_gbdd_min_mean,einf_gbdd_min_diff,einf_gbdd_max_mean,einf_gbdd_max_diff,einf_gbdd_mean_mean,einf_gbdd_mean_diff,einf_gbdd_std_mean,einf_gbdd_std_diff
0,ADAM17,AKT,20,5.5,9,38,74,36.375,70.75,7.267221,14.534442,0.066843,0.118686,38,74,5.11823,9.085554,11.464387,22.194771,0.107848,0.215696,10,0,1.901594,1.875396,2.836268,0.889451,24.325027,48.650054,87.88404,24.23192,60.566377,55.761426,21.05481,9.782703
1,ADAM17,AKT_1,22,38.0,74,38,74,38.0,74.0,0.0,0.0,0.020729,0.026458,38,74,7.329522,13.614682,13.188787,25.607595,0.0,0.0,10,0,1.482942,0.733341,2.818993,0.20071,17.552098,35.104195,100.0,0.0,66.454281,41.004002,26.715827,11.831308


In [99]:
# Export ALL_TRAINING_DATA_GBDP. Note the table is called 'ALL_TRAINING_DATA_GBDD'
# (matching the `_gbdd_` column suffix), not ..._GBDP like the variable.
# NOTE(review): the (columns, bool) pair looks like an index spec (bool = unique?)
# — confirm in csv2sql.
_gbdd_indexes = [(('d_1', 'd_2'), True)]
db.import_table(ALL_TRAINING_DATA_GBDP, 'ALL_TRAINING_DATA_GBDD', _gbdd_indexes)

#### DRUGS

In [100]:
# Wrap DRUGS in a frame and use the short column name `d` for the export.
_drug_column_names = {'ChallengeName': 'd'}
DRUGS_tmp = pd.DataFrame(DRUGS).rename(columns=_drug_column_names)

In [101]:
# Preview the single-column drug table.
DRUGS_tmp.head()

Unnamed: 0,d
0,ADAM17
1,AKT
2,AKT_1
3,AKT_PIK3C
4,AKT_SGK


In [102]:
# Export the drug list to the 'DRUGS' table.
# NOTE(review): the (columns, bool) pair looks like an index spec (bool = unique?)
# — confirm in csv2sql.
_drugs_indexes = [(('d',), True)]
db.import_table(DRUGS_tmp, 'DRUGS', _drugs_indexes)

#### DRUG_PAIRS

In [103]:
# Preview the drug pair table.
DRUG_PAIRS.head(2)

Unnamed: 0,d_1,d_2
0,ADAM17,ADAM17
1,ADAM17,AKT


In [104]:
# Export the drug pairs to the 'DRUG_PAIRS' table.
# NOTE(review): the (columns, bool) pairs look like index specs (bool = unique?)
# — confirm in csv2sql.
_drug_pairs_indexes = [
    (('d_1', 'd_2'), True),
    (('d_2', 'd_1'), False),
]
db.import_table(DRUG_PAIRS, 'DRUG_PAIRS', _drug_pairs_indexes)

#### CELL_LINES

In [105]:
# Single-column frame of cell lines for the export. This rebinds CELL_LINES_tmp,
# which earlier held the cross-join helper frame.
CELL_LINES_tmp = CELL_LINES.to_frame().rename(columns={'CELL_LINE': 'c'})

In [106]:
# Preview the single-column cell line table.
CELL_LINES_tmp.head(2)

Unnamed: 0,c
0,BT-20
1,CAL-120


In [107]:
# Export the cell lines to the 'CELL_LINES' table.
# NOTE(review): the (columns, bool) pair looks like an index spec (bool = unique?)
# — confirm in csv2sql.
_cell_lines_indexes = [(('c',), True)]
db.import_table(CELL_LINES_tmp, 'CELL_LINES', _cell_lines_indexes)

#### DRUG_PAIRS_CL

In [108]:
# Preview DRUG_PAIRS_CL before export. It now also carries the `unique_id`
# column that was added to it in the 'Missing' section.
DRUG_PAIRS_CL.head(2)

Unnamed: 0,d_1,d_2,c,unique_id
0,ADAM17,ADAM17,BT-20,ADAM17.ADAM17.BT-20
1,ADAM17,ADAM17,CAL-120,ADAM17.ADAM17.CAL-120


In [109]:
# Export the all-by-all table (including its `unique_id` column) to 'DRUG_PAIRS_CL'.
# NOTE(review): the (columns, bool) pairs look like index specs (bool = unique?)
# — confirm in csv2sql.
_drug_pairs_cl_indexes = [
    (('d_1', 'd_2', 'c'), True),
    (('c', 'd_1', 'd_2'), False),
]
db.import_table(DRUG_PAIRS_CL, 'DRUG_PAIRS_CL', _drug_pairs_cl_indexes)

#### ALL_TRAINING_DATA_PAIR_GBCC

In [110]:
# Preview the table keyed by a pair of cell lines (`c_x`, `c_y`).
ALL_TRAINING_DATA_PAIR_GBCC.head(2)

Unnamed: 0,c_x,c_y,synergy_score_diff_gbcc_min,synergy_score_diff_gbcc_max,synergy_score_diff_gbcc_mean,synergy_score_diff_gbcc_std,max_conc_mean_diff_gbcc_min,max_conc_mean_diff_gbcc_max,max_conc_mean_diff_gbcc_mean,max_conc_mean_diff_gbcc_std,max_conc_diff_diff_gbcc_min,max_conc_diff_diff_gbcc_max,max_conc_diff_diff_gbcc_mean,max_conc_diff_diff_gbcc_std,ic50_mean_diff_gbcc_min,ic50_mean_diff_gbcc_max,ic50_mean_diff_gbcc_mean,ic50_mean_diff_gbcc_std,ic50_diff_diff_gbcc_min,ic50_diff_diff_gbcc_max,ic50_diff_diff_gbcc_mean,ic50_diff_diff_gbcc_std,h_mean_diff_gbcc_min,h_mean_diff_gbcc_max,h_mean_diff_gbcc_mean,h_mean_diff_gbcc_std,h_diff_diff_gbcc_min,h_diff_diff_gbcc_max,h_diff_diff_gbcc_mean,h_diff_diff_gbcc_std,einf_mean_diff_gbcc_min,einf_mean_diff_gbcc_max,einf_mean_diff_gbcc_mean,einf_mean_diff_gbcc_std,einf_diff_diff_gbcc_min,einf_diff_diff_gbcc_max,einf_diff_diff_gbcc_mean,einf_diff_diff_gbcc_std,count_gbcc
0,22RV1,22RV1,-10.434812,10.434812,0.0,5.577642,0.0,0,0.0,0.0,0,0,0.0,0.0,-0.073434,0.073434,0.0,0.039252,-0.146868,0.146868,0.0,0.078504,-7.425779,7.425779,0.0,3.969246,-2.923574,2.923574,0.0,1.562716,-16.218178,16.218178,0.0,8.668981,-10.697618,10.697618,0.0,5.718117,8
1,22RV1,647-V,-38.495162,-0.911056,-21.443179,15.9788,-2.5,2,0.375,1.973787,-9,0,-3.25,3.947573,-0.792704,0.427882,-0.3107,0.567997,-2.56146,0.334965,-1.320819,1.406851,-9.109026,-1.683247,-4.922672,3.125422,-9.078683,1.781947,-3.900027,5.077758,-55.023462,7.237651,-21.246165,26.778663,-48.481514,89.156957,40.925158,61.168521,4


In [111]:
# Export ALL_TRAINING_DATA_PAIR_GBCC to the database table of the same name.
# NOTE(review): the (columns, bool) pair looks like an index spec (bool = unique?)
# — confirm in csv2sql.
_pair_gbcc_indexes = [(('c_x', 'c_y'), True), ]
db.import_table(ALL_TRAINING_DATA_PAIR_GBCC, 'ALL_TRAINING_DATA_PAIR_GBCC', _pair_gbcc_indexes)

In [112]:
# (rows, columns) of ALL_TRAINING_DATA_PAIR_GBCC.
ALL_TRAINING_DATA_PAIR_GBCC.shape

(5755, 39)

#### ALL_TRAINING_DATA_PAIR_GBDDDD

In [113]:
# Preview the table keyed by two drug pairs (`d_1_x`, `d_2_x`, `d_1_y`, `d_2_y`).
ALL_TRAINING_DATA_PAIR_GBDDDD.head(2)

Unnamed: 0,d_1_x,d_2_x,d_1_y,d_2_y,synergy_score_diff_gbdddd_min,synergy_score_diff_gbdddd_max,synergy_score_diff_gbdddd_mean,synergy_score_diff_gbdddd_std,max_conc_mean_diff_gbdddd_min,max_conc_mean_diff_gbdddd_max,max_conc_mean_diff_gbdddd_mean,max_conc_mean_diff_gbdddd_std,max_conc_diff_diff_gbdddd_min,max_conc_diff_diff_gbdddd_max,max_conc_diff_diff_gbdddd_mean,max_conc_diff_diff_gbdddd_std,ic50_mean_diff_gbdddd_min,ic50_mean_diff_gbdddd_max,ic50_mean_diff_gbdddd_mean,ic50_mean_diff_gbdddd_std,ic50_diff_diff_gbdddd_min,ic50_diff_diff_gbdddd_max,ic50_diff_diff_gbdddd_mean,ic50_diff_diff_gbdddd_std,h_mean_diff_gbdddd_min,h_mean_diff_gbdddd_max,h_mean_diff_gbdddd_mean,h_mean_diff_gbdddd_std,h_diff_diff_gbdddd_min,h_diff_diff_gbdddd_max,h_diff_diff_gbdddd_mean,h_diff_diff_gbdddd_std,einf_mean_diff_gbdddd_min,einf_mean_diff_gbdddd_max,einf_mean_diff_gbdddd_mean,einf_mean_diff_gbdddd_std,einf_diff_diff_gbdddd_min,einf_diff_diff_gbdddd_max,einf_diff_diff_gbdddd_mean,einf_diff_diff_gbdddd_std,count_gbdddd
0,ADAM17,AKT,ADAM17,AKT,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14
1,ADAM17,AKT,ADAM17,AKT_1,-13.727708,59.61906,10.69852,18.063062,0,0,0,0,0,0,0,0,-37.109566,36.844716,3.425643,18.461227,-74.017703,74.310569,7.137066,37.010933,-3.979602,5,-0.20068,2.03174,-10,0.923857,-2.241657,3.752806,-24.997457,23.789659,2.053373,13.966635,-43.539781,56.783258,-6.807273,26.576543,12


In [114]:
# Export ALL_TRAINING_DATA_PAIR_GBDDDD to the database table of the same name.
# NOTE(review): the (columns, bool) pair looks like an index spec (bool = unique?)
# — confirm in csv2sql.
_pair_gbdddd_indexes = [(('d_1_x', 'd_2_x', 'd_1_y', 'd_2_y'), True), ]
db.import_table(
    ALL_TRAINING_DATA_PAIR_GBDDDD, 'ALL_TRAINING_DATA_PAIR_GBDDDD', _pair_gbdddd_indexes
)

FML
FML
FML
FML


# Finalize

In [115]:
# Print the current timestamp (shows when this cell was last run).
print(datetime.datetime.now())

2016-03-03 18:36:32.532869
