# Summary

- Methylation data is provided in chromosome coordinates.
  - We need to find a way to map cancer gene census / target genes to chromosome locations before we can do any sort of GSEA.

# Intro

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import common
common.configure_logging(level='debug')

DEBUG:root:Done configuring logging!


In [4]:
from biodata import *



In [5]:
import functions as fn

In [6]:
%matplotlib inline

# Load data

## Cell line

In [7]:
cell_info = pd.read_csv(
    '../downloads/challenge_data/sanger_molecular_data/'
    'cell_info.csv/cell_info.csv',
    sep=',',
)

In [8]:
cell_info = cell_info.rename(columns={
    'Sanger.Name': 'cell_line',
    'CCLE.Name': 'cell_line_ccle',
    'Alternative.Name': 'cell_line_alternative',
    'Disease.Area': 'cell_line_disease_area',
    'Tissue..General.': 'cell_line_tissue',
    'COSMIC': 'cell_line_cosmic',
})

In [9]:
cell_info.head(2)

Unnamed: 0,cell_line,cell_line_ccle,cell_line_alternative,cell_line_disease_area,cell_line_tissue,cell_line_cosmic
0,22RV1,22RV1_PROSTATE,22RV1,Urology,male genital system,924100
1,647-V,647V_URINARY_TRACT,647V,Urology,urinary tract,906797


## CSS

In [10]:
css = pd.read_sql_query("""\
SELECT c, avg(synergy_score) synergy_score
FROM az_dream_2015.ALL_TRAINING_DATA_WSYNERGY
where synergy_score is not null
group by c;
""", sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream_2015'))

In [11]:
css = css.set_index('c')['synergy_score']

In [12]:
css.head()

c
22RV1       -8.601674
647-V       13.834063
A549        21.671942
BFTC-905    21.839952
BT-20       22.283808
Name: synergy_score, dtype: float64

## Methylation

In [13]:
methylation = dict()

### methyl_ilse_beta

In [14]:
methylation['methyl_ilse_beta'] = pd.read_csv(
    '../downloads/challenge_data/sanger_molecular_data/methyl/'
    'cpg_isle_level/methyl_ilse_beta.csv/methyl_ilse_beta.csv',
    sep=',', low_memory=False, index_col=0,
)

In [15]:
display(methylation['methyl_ilse_beta'].head())
print(methylation['methyl_ilse_beta'].shape)

Unnamed: 0,C32,HT-29,HCT-116,NCI-H23,MDA-MB-231,COLO-205,MCF7,T47D,BT-549,NCI-H226,A549,MDA-MB-468,SW48,NCI-H1437,DMS-114,NCI-H1299,NCI-H1975,Calu-3,RKO,MDA-MB-436,HCC38,BT-474,SW837,UACC-812,NCI-H1703,NCI-H3122,NCI-H2291,NCI-H520,NCI-H2228,NCI-H358,SW900,NCI-H522,NCI-H2170,NCI-H2085,Calu-6,LS-513,SW948,NCI-H747,CAL-148,MDA-MB-453,HCC1428,CAL-120,CAMA-1,MFM-223,CAL-51,BT-20,HCC1806,HCC70,HCC1954,HCC1187,EVSA-T,MDA-MB-157,MDA-MB-361,HCC1569,HCC1500,DU-4475,TCCSUP,UM-UC-3,HT-1197,647-V,BFTC-905,KU-19-19,VM-CUB-1,HT-1376,SW780,J82,T-24,RT4,VCaP,NCI-H1563,NCI-H838,KATOIII,HCC1419,HCC1395,NCI-H1793,NCI-SNU-16,MDA-MB-415,Hs-578-T,22RV1,HCC1143,M14,HCC1937
chr1:91190489-91192804,0.818692,0.747434,0.770135,0.831321,0.752044,0.738565,0.793441,0.644643,0.631357,0.57186,0.757174,0.621555,0.780302,0.652129,0.518821,0.772011,0.769333,0.781309,0.775392,0.811639,0.621215,0.595175,0.777035,0.821878,0.83207,0.84456,0.814596,0.857496,0.812744,0.765576,0.838035,0.744727,0.794033,0.713938,0.835898,0.709889,0.693786,0.776606,0.636477,0.73593,0.748294,0.70836,0.758822,0.777161,0.768753,0.776307,0.648416,0.533339,0.709706,0.448094,0.5675,0.792164,0.735302,0.800499,0.723169,0.832142,0.584588,0.761377,0.598622,0.79485,0.670391,0.72765,0.752399,0.44208,0.715816,0.629161,0.77252,0.64852,0.634385,0.797586,0.715364,0.700697,0.661265,0.695466,0.634773,0.690834,0.479305,0.6108,0.721448,0.852441,0.738772,0.721409
chr1:230561103-230562702,0.529199,0.629131,0.638518,0.352502,0.406421,0.674365,0.493063,0.377423,0.487595,0.399472,0.37954,0.419318,0.667205,0.409336,0.459739,0.521404,0.574174,0.317646,0.811931,0.462392,0.476604,0.560515,0.352446,0.532013,0.40696,0.379695,0.500116,0.428532,0.397898,0.512607,0.551876,0.214435,0.375646,0.408757,0.409111,0.760019,0.309111,0.499633,0.473723,0.769252,0.506791,0.503532,0.611617,0.522693,0.457915,0.599632,0.388905,0.368648,0.570484,0.370431,0.620084,0.528289,0.552287,0.418511,0.500443,0.274403,0.319003,0.422074,0.377268,0.438499,0.397155,0.48342,0.654325,0.386625,0.389635,0.467959,0.415686,0.478564,0.352268,0.529152,0.500194,0.781096,0.359598,0.522585,0.355272,0.507066,0.42036,0.489846,0.548634,0.456865,0.442124,0.477455
chr1:5937157-5937392,0.829281,0.855925,0.787249,0.848706,0.869549,0.861002,0.840902,0.891672,0.867833,0.814065,0.865125,0.846108,0.634343,0.837588,0.843177,0.810742,0.831324,0.859321,0.819267,0.836294,0.855802,0.82953,0.827683,0.847513,0.829978,0.833436,0.796635,0.818446,0.868557,0.846995,0.843012,0.859087,0.882207,0.855609,0.842565,0.815215,0.87633,0.849579,0.841697,0.830429,0.868263,0.863972,0.835195,0.830093,0.89259,0.799932,0.891277,0.912636,0.866775,0.856623,0.874383,0.8816,0.851847,0.834086,0.89748,0.828288,0.912338,0.815598,0.802033,0.882623,0.833689,0.861152,0.873546,0.903786,0.884518,0.880131,0.871788,0.846568,0.824353,0.820083,0.874672,0.813249,0.871405,0.825545,0.886077,0.881931,0.91053,0.871004,0.855258,0.829174,0.879268,0.821178
chr1:166958220-166958683,0.706106,0.772909,0.811876,0.862066,0.838579,0.820489,0.798494,0.670178,0.815781,0.864727,0.75553,0.764795,0.797445,0.712789,0.858336,0.792503,0.787871,0.845121,0.779418,0.778065,0.875629,0.817331,0.835246,0.811266,0.811324,0.825212,0.810689,0.86425,0.776804,0.473341,0.866354,0.806389,0.897688,0.726421,0.752364,0.830673,0.769079,0.787532,0.846426,0.683228,0.854746,0.815127,0.832747,0.711017,0.822913,0.511313,0.719431,0.730314,0.748209,0.825043,0.625049,0.786015,0.764594,0.785406,0.620309,0.836876,0.662619,0.666448,0.760309,0.843234,0.821604,0.887138,0.774044,0.558672,0.757272,0.855675,0.763839,0.755742,0.863199,0.760776,0.781905,0.644047,0.745478,0.79017,0.687767,0.792714,0.678349,0.571026,0.735134,0.803246,0.68383,0.745292
chr1:43832814-43833073,0.334399,0.276667,0.250471,0.300225,0.300426,0.287454,0.315132,0.352019,0.344933,0.357982,0.274094,0.353612,0.250904,0.336054,0.263514,0.27719,0.317749,0.265515,0.267579,0.238946,0.327718,0.34005,0.285086,0.276487,0.223273,0.355733,0.251709,0.48586,0.311383,0.306318,0.295572,0.379071,0.354547,0.329652,0.346192,0.284396,0.356639,0.341252,0.340005,0.30358,0.341565,0.279333,0.350952,0.292434,0.365615,0.266673,0.285719,0.354566,0.387627,0.347668,0.402443,0.34091,0.299553,0.270068,0.295559,0.252653,0.391685,0.293689,0.293474,0.342766,0.291829,0.25683,0.326064,0.361737,0.33247,0.382653,0.392305,0.373568,0.374557,0.337011,0.304708,0.26389,0.352861,0.371149,0.400604,0.258033,0.448773,0.394566,0.373835,0.357936,0.347671,0.36944


(26313, 82)


### methyl_ilse_m

In [16]:
methylation['methyl_ilse_m'] = pd.read_csv(
    '../downloads/challenge_data/sanger_molecular_data/methyl/'
    'cpg_isle_level/methyl_ilse_m.csv/methyl_ilse_m.csv',
    sep=',', low_memory=False, index_col=0,
)

In [17]:
display(methylation['methyl_ilse_m'].head())
print(methylation['methyl_ilse_m'].shape)

Unnamed: 0,C32,HT-29,HCT-116,NCI-H23,MDA-MB-231,COLO-205,MCF7,T47D,BT-549,NCI-H226,A549,MDA-MB-468,SW48,NCI-H1437,DMS-114,NCI-H1299,NCI-H1975,Calu-3,RKO,MDA-MB-436,HCC38,BT-474,SW837,UACC-812,NCI-H1703,NCI-H3122,NCI-H2291,NCI-H520,NCI-H2228,NCI-H358,SW900,NCI-H522,NCI-H2170,NCI-H2085,Calu-6,LS-513,SW948,NCI-H747,CAL-148,MDA-MB-453,HCC1428,CAL-120,CAMA-1,MFM-223,CAL-51,BT-20,HCC1806,HCC70,HCC1954,HCC1187,EVSA-T,MDA-MB-157,MDA-MB-361,HCC1569,HCC1500,DU-4475,TCCSUP,UM-UC-3,HT-1197,647-V,BFTC-905,KU-19-19,VM-CUB-1,HT-1376,SW780,J82,T-24,RT4,VCaP,NCI-H1563,NCI-H838,KATOIII,HCC1419,HCC1395,NCI-H1793,NCI-SNU-16,MDA-MB-415,Hs-578-T,22RV1,HCC1143,M14,HCC1937
chr1:91190489-91192804,2.353701,1.790654,1.931536,2.619265,1.814634,1.759176,2.129647,1.002813,0.943968,0.493091,1.911764,0.854247,1.991149,1.084373,0.227093,1.929129,1.964763,2.167263,1.984686,2.406965,0.957164,0.705533,2.119034,2.387922,2.53312,2.683866,2.323575,2.788748,2.341939,1.936658,2.523791,1.743635,2.421964,1.572417,2.522083,1.529139,1.479127,2.056281,0.982828,1.712921,1.854292,1.613024,1.913041,2.019972,2.038517,1.977719,1.228223,0.254822,1.555834,-0.26507,0.553806,2.201262,1.642951,2.308031,1.715844,2.565367,0.59103,1.89079,0.756525,2.256518,1.274308,1.752123,1.919246,-0.438285,1.544783,1.025284,1.968314,1.117171,1.089377,2.175671,1.644683,1.543806,1.171372,1.41283,1.004504,1.41902,-0.149904,0.783752,1.729559,2.780557,1.735849,1.583468
chr1:230561103-230562702,0.210043,0.938507,0.923477,-1.060885,-0.611479,1.229674,-0.130173,-0.952859,-0.092966,-0.628429,-0.863419,-0.524698,1.119186,-0.683625,-0.324717,0.111335,0.535146,-1.294266,2.184292,-0.289234,-0.213569,0.456949,-1.055667,0.300644,-0.643884,-0.834863,0.013011,-0.580105,-0.69021,0.032921,0.296655,-1.9793,-0.845963,-0.70871,-0.666048,1.789656,-1.369382,-0.004235,-0.234442,1.83754,-0.015654,0.000946,0.764769,0.138586,-0.34661,0.661619,-0.862985,-0.91509,0.51311,-0.90765,0.926707,0.184584,0.393681,-0.657165,0.064181,-1.779332,-1.25781,-0.520087,-0.846949,-0.426593,-0.666536,-0.10861,1.004322,-0.769245,-0.75391,-0.277252,-0.539438,-0.197077,-1.081279,0.208855,0.09525,2.182155,-0.984887,0.142603,-0.998274,0.080612,-0.52273,-0.080695,0.446791,-0.257238,-0.280136,-0.139851
chr1:5937157-5937392,2.467365,2.770209,2.028302,2.652915,2.854169,2.786662,2.578192,3.173538,2.9032,2.380995,2.7995,2.715967,0.936713,2.4928,2.606962,2.189508,2.506775,2.756015,2.318485,2.479487,2.758492,2.412377,2.430869,2.590877,2.4027,2.433792,2.105848,2.37761,2.903531,2.601308,2.611812,2.720215,3.064915,2.715731,2.583208,2.35283,2.914015,2.6788,2.60759,2.434069,2.854185,2.776785,2.441792,2.421838,3.191292,2.141854,3.143992,3.499969,2.803569,2.685408,2.938246,2.987608,2.671008,2.520553,3.239831,2.4291,3.466138,2.315925,2.208422,2.995538,2.490346,2.728808,2.910831,3.364515,3.034931,2.990238,2.911815,2.599768,2.375654,2.330885,2.911892,2.427881,2.878608,2.413554,3.120092,3.070323,3.459577,2.903062,2.716392,2.414762,2.980085,2.322514
chr1:166958220-166958683,1.502515,1.907049,2.173114,2.739207,2.466914,2.325486,2.06725,1.115511,2.364693,2.822421,1.832792,1.852776,2.062907,1.564941,2.799496,2.036086,2.025165,2.510714,1.935119,1.872041,2.956686,2.264142,2.433699,2.317339,2.327725,2.421781,2.215849,2.857335,2.100173,-0.146655,2.793871,2.278093,3.228557,1.73891,1.832489,2.45705,1.954957,2.094514,2.566057,1.248527,2.636879,2.399871,2.386079,1.402172,2.363389,0.142125,1.734153,1.733647,1.681457,2.429019,0.886949,2.098846,1.801081,2.106774,0.912082,2.45193,1.068544,1.131592,1.836202,2.506029,2.296823,3.033071,1.939997,0.38733,1.761429,2.631229,1.893921,1.803446,2.778579,1.752853,2.10723,1.032566,1.711503,2.125111,1.37304,2.160326,1.338803,0.516433,1.592483,2.187379,1.368896,1.749523
chr1:43832814-43833073,-1.476841,-1.955262,-2.233512,-1.733963,-1.691363,-1.806963,-1.70372,-1.329738,-1.375699,-1.420138,-1.9897,-1.291876,-2.2179,-1.290188,-1.879697,-1.7844,-1.595565,-1.948562,-2.065075,-2.379075,-1.478959,-1.429353,-1.78195,-1.805978,-2.317292,-1.340825,-2.3241,-0.211243,-1.47965,-1.6247,-1.641538,-0.91017,-1.131389,-1.465036,-1.254217,-1.842975,-1.175992,-1.294655,-1.435088,-1.6214,-1.435227,-1.842537,-1.34905,-1.835609,-1.127865,-2.0543,-1.748212,-1.186209,-0.998212,-1.383491,-0.770388,-1.344271,-1.734134,-2.037263,-1.700163,-2.13348,-0.777599,-1.7981,-1.7163,-1.375179,-1.777362,-2.119125,-1.551044,-1.09715,-1.301381,-1.058938,-0.92465,-1.195325,-1.1442,-1.438783,-1.5743,-1.81921,-1.265661,-1.257275,-0.809137,-1.974647,-0.179825,-0.824287,-1.046875,-1.215875,-1.163634,-1.182925


(26313, 82)


### methyl_probe_beta

In [18]:
methylation['methyl_probe_beta'] = pd.read_csv(
    '../downloads/challenge_data/sanger_molecular_data/methyl/'
    'cpg_probe_level/methyl_probe_beta.csv/methyl_probe_beta.csv',
    sep=',', low_memory=False, index_col=0,
)

In [19]:
display(methylation['methyl_probe_beta'].head())
print(methylation['methyl_probe_beta'].shape)

Unnamed: 0,C32,HT-29,HCT-116,NCI-H23,MDA-MB-231,COLO-205,MCF7,T47D,BT-549,NCI-H226,A549,MDA-MB-468,SW48,NCI-H1437,DMS-114,NCI-H1299,NCI-H1975,Calu-3,RKO,MDA-MB-436,HCC38,BT-474,SW837,UACC-812,NCI-H1703,NCI-H3122,NCI-H2291,NCI-H520,NCI-H2228,NCI-H358,SW900,NCI-H522,NCI-H2170,NCI-H2085,Calu-6,LS-513,SW948,NCI-H747,CAL-148,MDA-MB-453,HCC1428,CAL-120,CAMA-1,MFM-223,CAL-51,BT-20,HCC1806,HCC70,HCC1954,HCC1187,EVSA-T,MDA-MB-157,MDA-MB-361,HCC1569,HCC1500,DU-4475,TCCSUP,UM-UC-3,HT-1197,647-V,BFTC-905,KU-19-19,VM-CUB-1,HT-1376,SW780,J82,T-24,RT4,VCaP,NCI-H1563,NCI-H838,KATOIII,HCC1419,HCC1395,NCI-H1793,NCI-SNU-16,MDA-MB-415,Hs-578-T,22RV1,HCC1143,M14,HCC1937
cg12045430,0.3619,0.30025,0.3097,0.41275,0.31632,0.2961,0.32286,0.37835,0.3571,0.25466,0.36949,0.36014,0.25659,0.41541,0.44993,0.34324,0.35446,0.39053,0.23956,0.35555,0.36678,0.36249,0.31129,0.3356,0.34464,0.36682,0.34407,0.40442,0.39034,0.34106,0.33092,0.33882,0.40635,0.36118,0.36266,0.33269,0.33132,0.33222,0.34216,0.28571,0.37663,0.28181,0.29789,0.26574,0.33981,0.27959,0.38501,0.4068,0.29324,0.33416,0.35516,0.40945,0.34941,0.33565,0.32864,0.33692,0.45092,0.24424,0.39391,0.36911,0.38882,0.36137,0.32383,0.42135,0.36075,0.33002,0.37126,0.32401,0.364,0.27153,0.34408,0.26916,0.31349,0.28572,0.35737,0.39315,0.29263,0.25621,0.23261,0.35525,0.2135,0.26062
cg20826792,0.53019,0.39629,0.36708,0.52835,0.44587,0.39041,0.40988,0.48578,0.53678,0.44862,0.47444,0.45491,0.30454,0.54046,0.54311,0.47184,0.4638,0.45701,0.28982,0.43645,0.48102,0.4302,0.39711,0.4189,0.41755,0.46824,0.42397,0.45286,0.47676,0.43956,0.41222,0.51962,0.52312,0.51806,0.43195,0.43554,0.43502,0.43104,0.44182,0.35563,0.47423,0.49623,0.42171,0.41013,0.45129,0.3643,0.48947,0.58321,0.42111,0.57365,0.51481,0.47262,0.39771,0.39127,0.44478,0.40641,0.55227,0.34211,0.4735,0.5014,0.52202,0.46203,0.44549,0.55805,0.44538,0.47017,0.48556,0.40619,0.44866,0.39182,0.48124,0.38166,0.41301,0.44975,0.47873,0.48752,0.58806,0.48003,0.56539,0.48106,0.50813,0.41143
cg00381604,0.2277,0.13865,0.15098,0.37902,0.1729,0.20423,0.1547,0.25212,0.1575,0.17555,0.22518,0.18916,0.15265,0.1939,0.19474,0.2728,0.25719,0.18175,0.15754,0.21146,0.24966,0.22725,0.14651,0.28996,0.20188,0.27771,0.26796,0.24636,0.2741,0.24481,0.20811,0.27632,0.37934,0.32705,0.19859,0.19961,0.21789,0.19738,0.18464,0.14329,0.21572,0.25279,0.17828,0.16964,0.15394,0.14742,0.15899,0.25867,0.15477,0.21476,0.21214,0.27065,0.17765,0.21838,0.17415,0.23854,0.27289,0.1486,0.30426,0.2884,0.24575,0.21149,0.17216,0.23653,0.17837,0.16301,0.24288,0.21985,0.22404,0.1287,0.19954,0.18187,0.22488,0.1471,0.18942,0.20156,0.23075,0.20064,0.23355,0.1933,0.21986,0.15431
cg24335620,0.75182,0.7911,0.665,0.85341,0.7683,0.70835,0.71911,0.8244,0.8634,0.77844,0.85229,0.80647,0.63584,0.72099,0.81288,0.8275,0.77728,0.73742,0.7348,0.70539,0.8138,0.78486,0.72476,0.80072,0.73302,0.75426,0.7664,0.80711,0.77883,0.76108,0.7732,0.82731,0.84668,0.78075,0.7562,0.73406,0.78741,0.78292,0.7518,0.71073,0.73029,0.79962,0.70901,0.69881,0.82665,0.65692,0.7761,0.8413,0.75171,0.83659,0.81011,0.85924,0.67445,0.81258,0.8483,0.77202,0.69854,0.7748,0.81581,0.77597,0.83382,0.85013,0.74719,0.69637,0.81414,0.81699,0.7092,0.72306,0.8583,0.68502,0.80968,0.74174,0.75852,0.76561,0.79724,0.79022,0.69623,0.75202,0.79788,0.76873,0.78205,0.80271
cg16162899,0.81629,0.87404,0.83182,0.79349,0.88133,0.78165,0.78982,0.80409,0.84282,0.81804,0.78062,0.81959,0.7724,0.85921,0.86773,0.85589,0.81287,0.78284,0.79379,0.79051,0.88388,0.79235,0.56569,0.88955,0.84487,0.74863,0.86626,0.81962,0.89661,0.89244,0.77978,0.86603,0.9115,0.86191,0.72684,0.8308,0.87826,0.90903,0.84539,0.86137,0.7749,0.82521,0.5389,0.63629,0.86275,0.7633,0.80327,0.8752,0.78132,0.8905,0.86986,0.70954,0.79717,0.87774,0.70502,0.73621,0.61779,0.84831,0.8617,0.8929,0.86451,0.86565,0.90389,0.79947,0.85642,0.79329,0.8461,0.88194,0.5808,0.81542,0.86342,0.88286,0.79939,0.81119,0.87582,0.91749,0.63729,0.69799,0.78711,0.87672,0.79842,0.86291


(287450, 82)


### methyl_probe_m

In [20]:
methylation['methyl_probe_m'] = pd.read_csv(
    '../downloads/challenge_data/sanger_molecular_data/methyl/'
    'cpg_probe_level/methyl_probe_m.csv/methyl_probe_m.csv',
    sep=',', low_memory=False, index_col=0,
)

In [21]:
display(methylation['methyl_probe_m'].head())
print(methylation['methyl_probe_m'].shape)

Unnamed: 0,C32,HT-29,HCT-116,NCI-H23,MDA-MB-231,COLO-205,MCF7,T47D,BT-549,NCI-H226,A549,MDA-MB-468,SW48,NCI-H1437,DMS-114,NCI-H1299,NCI-H1975,Calu-3,RKO,MDA-MB-436,HCC38,BT-474,SW837,UACC-812,NCI-H1703,NCI-H3122,NCI-H2291,NCI-H520,NCI-H2228,NCI-H358,SW900,NCI-H522,NCI-H2170,NCI-H2085,Calu-6,LS-513,SW948,NCI-H747,CAL-148,MDA-MB-453,HCC1428,CAL-120,CAMA-1,MFM-223,CAL-51,BT-20,HCC1806,HCC70,HCC1954,HCC1187,EVSA-T,MDA-MB-157,MDA-MB-361,HCC1569,HCC1500,DU-4475,TCCSUP,UM-UC-3,HT-1197,647-V,BFTC-905,KU-19-19,VM-CUB-1,HT-1376,SW780,J82,T-24,RT4,VCaP,NCI-H1563,NCI-H838,KATOIII,HCC1419,HCC1395,NCI-H1793,NCI-SNU-16,MDA-MB-415,Hs-578-T,22RV1,HCC1143,M14,HCC1937
cg12045430,-0.81819,-1.2207,-1.1564,-0.50873,-1.112,-1.2493,-1.0685,-0.71638,-0.84827,-1.5493,-0.77097,-0.82917,-1.5347,-0.49286,-0.28991,-0.93614,-0.8649,-0.64211,-1.6664,-0.85804,-0.78778,-0.81449,-1.1456,-0.98531,-0.92718,-0.78755,-0.93083,-0.55844,-0.64328,-0.95011,-1.0157,-0.96456,-0.54687,-0.82267,-0.81343,-1.0042,-1.0131,-1.0072,-0.94308,-1.3219,-0.72696,-1.3496,-1.237,-1.4663,-0.95817,-1.3655,-0.67568,-0.5442,-1.2691,-0.99466,-0.86045,-0.52839,-0.89684,-0.98499,-1.0305,-0.97675,-0.28415,-1.6296,-0.62168,-0.77334,-0.65247,-0.8215,-1.0621,-0.45766,-0.82541,-1.0216,-0.76003,-1.0609,-0.80508,-1.4238,-0.93075,-1.4411,-1.1308,-1.3219,-0.84655,-0.62624,-1.2734,-1.5376,-1.722,-0.85994,-1.8812,-1.5044
cg20826792,0.17446,-0.60727,-0.78594,0.16376,-0.31363,-0.64287,-0.52582,-0.082055,0.21262,-0.29757,-0.14762,-0.26093,-1.1913,0.234,0.24941,-0.1627,-0.20928,-0.2487,-1.2931,-0.36873,-0.10958,-0.40542,-0.60238,-0.47219,-0.4802,-0.18352,-0.44217,-0.27286,-0.13421,-0.35051,-0.51184,0.11326,0.13351,0.10429,-0.39518,-0.37407,-0.37709,-0.40048,-0.33725,-0.8575,-0.14884,-0.021735,-0.45556,-0.5243,-0.28197,-0.80322,-0.060765,0.48468,-0.45911,0.42812,0.085495,-0.15816,-0.59873,-0.63765,-0.31999,-0.5465,0.30276,-0.94335,-0.15305,0.008097,0.12717,-0.21955,-0.3158,0.33652,-0.31646,-0.17232,-0.083329,-0.54785,-0.29734,-0.63432,-0.10833,-0.69608,-0.50714,-0.29095,-0.12281,-0.072061,0.51355,-0.11528,0.37955,-0.10935,0.046928,-0.51656
cg00381604,-1.7621,-2.6351,-2.4915,-0.71225,-2.2581,-1.9622,-2.45,-1.5687,-2.4194,-2.2315,-1.7828,-2.0998,-2.4727,-2.0557,-2.0479,-1.4145,-1.5302,-2.1706,-2.4189,-1.8988,-1.5876,-1.7657,-2.5424,-1.292,-1.9831,-1.379,-1.4499,-1.6131,-1.4051,-1.6251,-1.928,-1.389,-0.71033,-1.041,-2.0128,-2.0036,-1.8438,-2.0237,-2.1427,-2.5798,-1.8622,-1.5636,-2.2045,-2.2913,-2.4583,-2.5319,-2.4032,-1.519,-2.4493,-1.8704,-1.893,-1.4302,-2.2107,-1.8396,-2.2456,-1.6745,-1.4139,-2.5184,-1.1932,-1.303,-1.6178,-1.8985,-2.2656,-1.6905,-2.2036,-2.3603,-1.6403,-1.8272,-1.7922,-2.7592,-2.0042,-2.1694,-1.7853,-2.5356,-2.0973,-1.986,-1.7372,-1.9943,-1.7144,-2.0612,-1.8272,-2.4543
cg24335620,1.599,1.9211,0.98922,2.5414,1.7295,1.2803,1.3562,2.231,2.6601,1.8129,2.5286,2.0591,0.80411,1.3697,2.1191,2.2622,1.8032,1.4897,1.4703,1.2596,2.1278,1.8672,1.3968,2.0065,1.4571,1.618,1.7141,2.065,1.8162,1.6716,1.7694,2.2602,2.4652,1.8323,1.6331,1.4648,1.8891,1.8506,1.5989,1.2969,1.4371,1.9966,1.2848,1.2142,2.2536,0.93714,1.7934,2.4064,1.5981,2.356,2.093,2.6099,1.0508,2.1162,2.4834,1.7597,1.2124,1.7826,2.147,1.7923,2.327,2.504,1.5634,1.1975,2.1311,2.1584,1.2862,1.3845,2.5986,1.1209,2.0889,1.5221,1.6513,1.7077,1.9752,1.9134,1.1966,1.6005,1.9809,1.7329,1.8433,2.0246
cg16162899,2.1517,2.7948,2.3062,1.942,2.8927,1.8399,1.9099,2.0371,2.4228,2.1686,1.8312,2.1836,1.7629,2.6095,2.7138,2.5703,2.119,1.8499,1.9446,1.9159,2.9282,1.932,0.38131,3.0097,2.4452,1.5744,2.6953,2.1839,3.1163,3.0527,1.8241,2.6925,3.3645,2.642,1.4119,2.2958,2.8509,3.3208,2.4509,2.6353,1.7834,2.2391,0.22496,0.80687,2.6521,1.6892,2.0297,2.81,1.8371,3.0237,2.7407,1.2885,1.9746,2.8439,1.257,1.4807,0.69275,2.4834,2.6393,3.0596,2.6737,2.6877,3.2334,1.9952,2.5764,1.9402,2.4589,2.9012,0.47042,2.1433,2.6603,2.914,1.9945,2.1031,2.8182,3.475,0.81316,1.2086,1.8864,2.8301,1.9858,2.6541


(287450, 82)


### probe_info

In [22]:
probe_info = pd.read_csv(
    '../downloads/challenge_data/sanger_molecular_data/methyl/'
    'cpg_probe_level/probe_info.csv/probe_info.csv',
    sep=',', low_memory=False,
)

In [23]:
display(probe_info.head())
print(probe_info.shape)

Unnamed: 0,Name,AddressA_ID,AlleleA_ProbeSeq,AddressB_ID,AlleleB_ProbeSeq,Infinium_Design_Type,Next_Base,Color_Channel,Forward_Sequence,Genome_Build,CHR,MAPINFO,SourceSeq,Chromosome_36,Coordinate_36,Strand,Probe_SNPs,Probe_SNPs_10,Random_Loci,Methyl27_Loci,UCSC_RefGene_Name,UCSC_RefGene_Accession,UCSC_RefGene_Group,UCSC_CpG_Islands_Name,Relation_to_UCSC_CpG_Island,Phantom,DMR,Enhancer,HMM_Island,Regulatory_Feature_Name,Regulatory_Feature_Group,DHS
0,cg00000165,12637463,CAAAATCTATTAATACAATAACTTTTAATAAAACAACTAAAACACA...,,,II,,,CTAAGTGCAGTCAGGATCTGTTAGTACAGTGGCTTTTGATGGAACA...,37,1,91194674,AGGATCTGTTAGTACAGTGGCTTTTGATGGAACAGCTGAGGCACAC...,1,90967262,R,,,,,,,,chr1:91190489-91192804,S_Shore,,CDMR,True,1:90967262-90967361,,,
1,cg00000363,16661505,RTCTTAACTTAACTTAATTTTCTCCTTAATCTAAAAAACTTTCCCT...,,,II,,,CTGCCCAATCGGTCCCTTCCTTCACTCCTCCCCATTCTTAACAAGA...,37,1,230560793,TCTTGACTTGGCTTAGTTTTCTCCTTAATCTGAGAAACTTTCCCTG...,1,228627416,F,,,,,,,,chr1:230561103-230562702,N_Shore,,,,1:228627033-228629325,,,
2,cg00000957,65648367,ATACTACTAACCCATACCCAACAAAACAAAAAACCCCAAAACATCA...,36743439.0,ATACTACTAACCCATACCCGACAAAACAAAAAACCCCAAAACGTCA...,I,C,Grn,TCCTGCAGGCGCACAGACCTCATCCGCTCCAGCTTACGCCTGCGGG...,37,1,5937253,ATGCTACTGACCCATGCCCGGCAGGGCAAGGGGCCCCAGGACGTCA...,1,5859840,F,rs3747991,rs77973802,,,NPHP4,NM_015102,Body,chr1:5937157-5937392,Island,,,,1:5859745-5859915,1:5937082-5937731,Unclassified_Cell_type_specific,
3,cg00001349,11722421,CAAAACAACACAAACCAAAATCTTCCAATCTCAAACTATTTATTCC...,53758324.0,CAAAACGACACGAACCGAAATCTTCCAATCTCAAACTATTTATTCC...,I,C,Grn,CCACCTCACCCGCAAGGCGGCACGAGCCGGAATCTTCCAGTCTCAG...,37,1,166958439,CAAGGCGGCACGAGCCGGAATCTTCCAGTCTCAGGCTGTTTGTTCC...,1,165225063,R,,,,,MAEL,NM_032858,TSS200,chr1:166958220-166958683,Island,,,,1:165224845-165225561,,,
4,cg00001446,37673467,AATAAAAATCATTAACAAAAAAAACCCCATCAAAAAATAACCCTAA...,,,II,,,TAGGTCAGGAGAATGGAGGTCATTAGCAAGGGGGACCCCATCAGAG...,37,1,43831041,ATGGAGGTCATTAGCAAGGGGGACCCCATCAGAGGGTAGCCCTGGA...,1,43603628,R,,,,,ELOVL1,NM_022821,Body,chr1:43832814-43833073,N_Shore,,,,,,,


(287450, 32)


## Gene sets

In [24]:
methylation_cell_lines = set(methylation['methyl_ilse_beta'].columns)
print(len(methylation_cell_lines))

82


### cancer_gene_census

In [25]:
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/cosmic')
cancer_gene_census = pd.read_sql_table('cancer_gene_census', engine)

In [26]:
cancer_gene_census.head(2)

Unnamed: 0,gene_symbol,name,entrez_geneid,genome_location,chr_band,somatic,germline,tumour_types_somatic,tumour_types_germline,cancer_syndrome,tissue_type,molecular_genetics,mutation_types,translocation_partner,other_germline_mut,other_syndrome,synonyms
0,ABI1,abl-interactor 1,10006,10:26748570-26860863,10p11.2,yes,,AML,,,L,Dom,T,KMT2A,,,"ABI1,E3B1,ABI-1,SSH3BP1,10006"
1,ABL1,v-abl Abelson murine leukemia viral oncogene h...,25,9:130835447-130885683,9q34.1,yes,,"CML, ALL, T-ALL",,,L,Dom,"T, Mis","BCR, ETV6, NUP214",,,"ABL1,p150,ABL,c-ABL,JTK7,bcr/abl,v-abl,P00519,..."


### drug_to_hgnc_target

In [27]:
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream_2015')
drug_to_hgnc_target = pd.read_sql_table('drug_to_hgnc_target', engine)
drug_to_hgnc_target.head(2)

Unnamed: 0,drug,hgnc_name
0,ADAM17,ADAM17
1,AKT,AKT1


## Subsets

In [28]:
probe_sets = dict()

#### probe_to_gene

In [29]:
probe_to_gene = pd.DataFrame([
        (p, g) for (p, genes) in probe_info[['Name', 'UCSC_RefGene_Name']].dropna().values 
        for g in genes.split(';')
    ], columns=['probe', 'gene'])


In [30]:
probe_to_gene.head()

Unnamed: 0,probe,gene
0,cg00000957,NPHP4
1,cg00001349,MAEL
2,cg00001446,ELOVL1
3,cg00001583,NR5A2
4,cg00001583,NR5A2


In [31]:
probe_info.shape

(287450, 32)

#### cancer_gene_sensus

In [32]:
cancer_gene_census.head(2)

Unnamed: 0,gene_symbol,name,entrez_geneid,genome_location,chr_band,somatic,germline,tumour_types_somatic,tumour_types_germline,cancer_syndrome,tissue_type,molecular_genetics,mutation_types,translocation_partner,other_germline_mut,other_syndrome,synonyms
0,ABI1,abl-interactor 1,10006,10:26748570-26860863,10p11.2,yes,,AML,,,L,Dom,T,KMT2A,,,"ABI1,E3B1,ABI-1,SSH3BP1,10006"
1,ABL1,v-abl Abelson murine leukemia viral oncogene h...,25,9:130835447-130885683,9q34.1,yes,,"CML, ALL, T-ALL",,,L,Dom,"T, Mis","BCR, ETV6, NUP214",,,"ABL1,p150,ABL,c-ABL,JTK7,bcr/abl,v-abl,P00519,..."


In [33]:
probe_sets['cancer_gene_census'] = set(
    cancer_gene_census.merge(probe_to_gene, left_on='gene_symbol', right_on='gene')['probe']
)
print(len(probe_sets['cancer_gene_census']))

9918


#### drug_to_hgnc_target

In [34]:
drug_to_hgnc_target.head()

Unnamed: 0,drug,hgnc_name
0,ADAM17,ADAM17
1,AKT,AKT1
2,AKT,AKT2
3,AKT,AKT3
4,AKT_1,AKT1


In [35]:
probe_sets['drug_to_hgnc_target'] = set(
    drug_to_hgnc_target.merge(probe_to_gene, left_on='hgnc_name', right_on='gene')['probe']
)
print(len(probe_sets['drug_to_hgnc_target']))

3991


In [36]:
print("Number of missing targets:", len(set(drug_to_hgnc_target['hgnc_name']) - set(probe_to_gene['gene'])))

Number of missing targets: 26


#### Enhancer

In [37]:
Counter(probe_info['Enhancer'].dropna())

Counter({True: 28799})

In [38]:
probe_sets['enhancer'] = probe_info[probe_info['Enhancer'] == True]['Name']
print(len(probe_sets['enhancer']))

28799


#### DMR

In [39]:
Counter(probe_info['DMR'].dropna())

Counter({'CDMR': 5169, 'DMR': 15394, 'RDMR': 10385})

In [40]:
probe_sets['CDMR'] = probe_info[probe_info['DMR'] == 'CDMR']['Name']
print(len(probe_sets['CDMR']))

probe_sets['DMR'] = probe_info[probe_info['DMR'] == 'DMR']['Name']
print(len(probe_sets['DMR']))

probe_sets['RDMR'] = probe_info[probe_info['DMR'] == 'RDMR']['Name']
print(len(probe_sets['RDMR']))

5169
15394
10385


#### Probe_SNPs_10

In [41]:
Counter(probe_info['Probe_SNPs_10'].dropna())

Counter({'rs3103221': 1,
         'rs28571589': 1,
         'rs11545698': 1,
         'rs17855991': 1,
         'rs58626562': 1,
         'rs77234486': 1,
         'rs77453698': 1,
         'rs76620711': 1,
         'rs62119032': 1,
         'rs7277954': 1,
         'rs35287188': 1,
         'rs12032520': 1,
         'rs2736846': 1,
         'rs78276637': 1,
         'rs79250533': 1,
         'rs13390084': 1,
         'rs74935574': 1,
         'rs75875499': 1,
         'rs78702718': 1,
         'rs77209957': 1,
         'rs71512848': 1,
         'rs77708976': 1,
         'rs79014038': 1,
         'rs422946': 1,
         'rs11553096': 1,
         'rs744446': 1,
         'rs34147588': 1,
         'rs77171912': 1,
         'rs17881696': 1,
         'rs78254674': 1,
         'rs73339286': 1,
         'rs78106474': 1,
         'rs77119996': 1,
         'rs888820': 1,
         'rs74641028': 1,
         'rs2677741': 1,
         'rs4284953': 1,
         'rs77001450': 1,
         'rs3832920': 1

In [42]:
probe_sets['has_snp'] = probe_info[probe_info['Probe_SNPs_10'].notnull()]['Name']
print(len(probe_sets['has_snp']))

14694


#### DHS

In [43]:
Counter(probe_info['DHS'].dropna())

Counter({True: 41581})

In [44]:
Counter(probe_info['Relation_to_UCSC_CpG_Island'].dropna())

Counter({'Island': 139783,
         'N_Shelf': 22790,
         'N_Shore': 58598,
         'S_Shelf': 20451,
         'S_Shore': 45828})

# Correlations

In [45]:
methylation.keys()

dict_keys(['methyl_probe_beta', 'methyl_ilse_beta', 'methyl_ilse_m', 'methyl_probe_m'])

In [46]:
results_all = {}
results_best = {}

for feature in methylation.keys():
    print('feature: {}'.format(feature)); sys.stdout.flush()
    if '_ilse_' in feature:
        p_sets = [None]
    else:
        p_sets = list(probe_sets.keys()) + [None]
    for probe_set in p_sets:
        print('probe_set: {}'.format(probe_set)); sys.stdout.flush()
        df = methylation[feature].T
        if probe_set is not None:
            probe_set_list = list(set(probe_sets[probe_set]) & set(df.columns))
            df = df[probe_set_list]
        index = df.index.copy()
        index.names = ['c_1']
        columns=df.index.copy()
        columns.names = ['c_2']
        # Get a correlation score using all available metrics
        data = df.values
        results = []
        for metric in fn.metrics:
            PW = sp.spatial.distance.squareform(sp.spatial.distance.pdist(data, metric=metric))
            PW = pd.DataFrame(PW, index=index, columns=columns).unstack().reset_index()
            PW['synergy_score_x'] = PW['c_1'].map(css)
            PW['synergy_score_y'] = PW['c_2'].map(css)
            PW['synergy_score_diff'] = (PW['synergy_score_y'] - PW['synergy_score_x'])
            # Pearson
            P_r, P_p = sp.stats.pearsonr(PW['synergy_score_diff'].abs(), PW[0])
            # Spearman
            S = sp.stats.spearmanr(PW['synergy_score_diff'].abs(), PW[0])
            S_r, S_p = S.correlation, S.pvalue
            results.append((metric, P_r, S_r, PW))
        results.sort(key=lambda x: np.nansum([abs(x[1]), abs(x[2])]), reverse=True)
        # Save best result
        metric, P_r, S_r, PW = results[0]
        results_best[(feature, probe_set)] = {
            'metric': metric,
            'P_r': P_r,
            'S_r': S_r,
            'df': PW,
        }
        # Stats for all metrics
        df = pd.DataFrame([r[:3] for r in results], columns=['metric', 'pearson_r', 'spearman_r'])
        results_all[(feature, probe_set)] = df

feature: methyl_probe_beta
probe_set: enhancer
probe_set: RDMR
probe_set: cancer_gene_census
probe_set: DMR
probe_set: CDMR
probe_set: drug_to_hgnc_target
probe_set: has_snp
probe_set: None
feature: methyl_ilse_beta
probe_set: None
feature: methyl_ilse_m
probe_set: None
feature: methyl_probe_m
probe_set: enhancer
probe_set: RDMR
probe_set: cancer_gene_census
probe_set: DMR
probe_set: CDMR
probe_set: drug_to_hgnc_target
probe_set: has_snp
probe_set: None


In [47]:
results_best_df = pd.DataFrame(results_best).T.reset_index()
results_best_df = pd.DataFrame(results_best).T.reset_index().rename(
    columns={'index': 'feature'}
)
results_best_df['probe_set'] = results_best_df['feature'].apply(lambda x: x[1])
results_best_df['feature'] = results_best_df['feature'].apply(lambda x: x[0])
results_best_df.head()

Unnamed: 0,feature,P_r,S_r,df,metric,probe_set
0,methyl_ilse_m,0.0798599,0.105409,c_2 c_1 0 synerg...,hamming,
1,methyl_probe_beta,0.0938926,0.0860326,c_2 c_1 0 synerg...,chebyshev,cancer_gene_census
2,methyl_probe_beta,0.0867386,0.0720676,c_2 c_1 0 synerg...,chebyshev,enhancer
3,methyl_probe_m,0.112632,0.0636468,c_2 c_1 0 syne...,seuclidean,drug_to_hgnc_target
4,methyl_probe_m,0.136295,0.0750324,c_2 c_1 0 synergy...,chebyshev,CDMR


In [48]:
results_best_df['abs'] = results_best_df[['P_r', 'S_r']].apply(lambda x: np.mean(np.abs(x)), axis=1)
results_best_df.sort_values('abs', ascending=False, inplace=True)
display(results_best_df)

Unnamed: 0,feature,P_r,S_r,df,metric,probe_set,abs
12,methyl_probe_m,0.177524,0.1532,c_2 c_1 0 synergy...,chebyshev,has_snp,0.165362
16,methyl_probe_beta,0.114378,0.145104,c_2 c_1 0 synerg...,chebyshev,has_snp,0.129741
11,methyl_probe_m,0.127562,0.126189,c_2 c_1 0 synergy...,chebyshev,DMR,0.126875
8,methyl_probe_beta,0.0989615,0.124982,c_2 c_1 0 synerg...,chebyshev,DMR,0.111972
4,methyl_probe_m,0.136295,0.0750324,c_2 c_1 0 synergy...,chebyshev,CDMR,0.105664
13,methyl_probe_m,0.0995927,0.0929562,c_2 c_1 0 synergy...,chebyshev,enhancer,0.096274
14,methyl_probe_m,0.101848,0.0862922,c_2 c_1 0 synergy...,chebyshev,cancer_gene_census,0.09407
0,methyl_ilse_m,0.0798599,0.105409,c_2 c_1 0 synerg...,hamming,,0.092634
6,methyl_probe_m,0.103238,0.0793523,c_2 c_1 0 synergy...,chebyshev,,0.091295
1,methyl_probe_beta,0.0938926,0.0860326,c_2 c_1 0 synerg...,chebyshev,cancer_gene_census,0.089963


In [49]:
results_all[('methyl_probe_m', 'enhancer')]

Unnamed: 0,metric,pearson_r,spearman_r
0,chebyshev,0.099593,0.092956
1,hamming,0.079858,0.023832
2,jaccard,0.079858,0.023832
3,seuclidean,0.069363,0.030014
4,euclidean,0.065774,0.020866
5,minkowski,0.065774,0.020866
6,cityblock,0.059866,0.019033
7,sqeuclidean,0.043907,0.020866
8,braycurtis,0.045578,0.018212
9,cosine,0.042289,0.016987


# Save

In [50]:
import csv2sql
db = csv2sql.DataFrameToMySQL(
    'mysql://strokach:@192.168.6.19:3306/az_dream_2015_features', 
    'methyl', 
    '192.168.6.8', 
    echo=False
)

In [51]:
import csv2sql.g2d

## gc

In [52]:
methylation.keys()

dict_keys(['methyl_probe_beta', 'methyl_ilse_beta', 'methyl_ilse_m', 'methyl_probe_m'])

In [53]:
methylation['methyl_probe_m'].head()

Unnamed: 0,C32,HT-29,HCT-116,NCI-H23,MDA-MB-231,COLO-205,MCF7,T47D,BT-549,NCI-H226,A549,MDA-MB-468,SW48,NCI-H1437,DMS-114,NCI-H1299,NCI-H1975,Calu-3,RKO,MDA-MB-436,HCC38,BT-474,SW837,UACC-812,NCI-H1703,NCI-H3122,NCI-H2291,NCI-H520,NCI-H2228,NCI-H358,SW900,NCI-H522,NCI-H2170,NCI-H2085,Calu-6,LS-513,SW948,NCI-H747,CAL-148,MDA-MB-453,HCC1428,CAL-120,CAMA-1,MFM-223,CAL-51,BT-20,HCC1806,HCC70,HCC1954,HCC1187,EVSA-T,MDA-MB-157,MDA-MB-361,HCC1569,HCC1500,DU-4475,TCCSUP,UM-UC-3,HT-1197,647-V,BFTC-905,KU-19-19,VM-CUB-1,HT-1376,SW780,J82,T-24,RT4,VCaP,NCI-H1563,NCI-H838,KATOIII,HCC1419,HCC1395,NCI-H1793,NCI-SNU-16,MDA-MB-415,Hs-578-T,22RV1,HCC1143,M14,HCC1937
cg12045430,-0.81819,-1.2207,-1.1564,-0.50873,-1.112,-1.2493,-1.0685,-0.71638,-0.84827,-1.5493,-0.77097,-0.82917,-1.5347,-0.49286,-0.28991,-0.93614,-0.8649,-0.64211,-1.6664,-0.85804,-0.78778,-0.81449,-1.1456,-0.98531,-0.92718,-0.78755,-0.93083,-0.55844,-0.64328,-0.95011,-1.0157,-0.96456,-0.54687,-0.82267,-0.81343,-1.0042,-1.0131,-1.0072,-0.94308,-1.3219,-0.72696,-1.3496,-1.237,-1.4663,-0.95817,-1.3655,-0.67568,-0.5442,-1.2691,-0.99466,-0.86045,-0.52839,-0.89684,-0.98499,-1.0305,-0.97675,-0.28415,-1.6296,-0.62168,-0.77334,-0.65247,-0.8215,-1.0621,-0.45766,-0.82541,-1.0216,-0.76003,-1.0609,-0.80508,-1.4238,-0.93075,-1.4411,-1.1308,-1.3219,-0.84655,-0.62624,-1.2734,-1.5376,-1.722,-0.85994,-1.8812,-1.5044
cg20826792,0.17446,-0.60727,-0.78594,0.16376,-0.31363,-0.64287,-0.52582,-0.082055,0.21262,-0.29757,-0.14762,-0.26093,-1.1913,0.234,0.24941,-0.1627,-0.20928,-0.2487,-1.2931,-0.36873,-0.10958,-0.40542,-0.60238,-0.47219,-0.4802,-0.18352,-0.44217,-0.27286,-0.13421,-0.35051,-0.51184,0.11326,0.13351,0.10429,-0.39518,-0.37407,-0.37709,-0.40048,-0.33725,-0.8575,-0.14884,-0.021735,-0.45556,-0.5243,-0.28197,-0.80322,-0.060765,0.48468,-0.45911,0.42812,0.085495,-0.15816,-0.59873,-0.63765,-0.31999,-0.5465,0.30276,-0.94335,-0.15305,0.008097,0.12717,-0.21955,-0.3158,0.33652,-0.31646,-0.17232,-0.083329,-0.54785,-0.29734,-0.63432,-0.10833,-0.69608,-0.50714,-0.29095,-0.12281,-0.072061,0.51355,-0.11528,0.37955,-0.10935,0.046928,-0.51656
cg00381604,-1.7621,-2.6351,-2.4915,-0.71225,-2.2581,-1.9622,-2.45,-1.5687,-2.4194,-2.2315,-1.7828,-2.0998,-2.4727,-2.0557,-2.0479,-1.4145,-1.5302,-2.1706,-2.4189,-1.8988,-1.5876,-1.7657,-2.5424,-1.292,-1.9831,-1.379,-1.4499,-1.6131,-1.4051,-1.6251,-1.928,-1.389,-0.71033,-1.041,-2.0128,-2.0036,-1.8438,-2.0237,-2.1427,-2.5798,-1.8622,-1.5636,-2.2045,-2.2913,-2.4583,-2.5319,-2.4032,-1.519,-2.4493,-1.8704,-1.893,-1.4302,-2.2107,-1.8396,-2.2456,-1.6745,-1.4139,-2.5184,-1.1932,-1.303,-1.6178,-1.8985,-2.2656,-1.6905,-2.2036,-2.3603,-1.6403,-1.8272,-1.7922,-2.7592,-2.0042,-2.1694,-1.7853,-2.5356,-2.0973,-1.986,-1.7372,-1.9943,-1.7144,-2.0612,-1.8272,-2.4543
cg24335620,1.599,1.9211,0.98922,2.5414,1.7295,1.2803,1.3562,2.231,2.6601,1.8129,2.5286,2.0591,0.80411,1.3697,2.1191,2.2622,1.8032,1.4897,1.4703,1.2596,2.1278,1.8672,1.3968,2.0065,1.4571,1.618,1.7141,2.065,1.8162,1.6716,1.7694,2.2602,2.4652,1.8323,1.6331,1.4648,1.8891,1.8506,1.5989,1.2969,1.4371,1.9966,1.2848,1.2142,2.2536,0.93714,1.7934,2.4064,1.5981,2.356,2.093,2.6099,1.0508,2.1162,2.4834,1.7597,1.2124,1.7826,2.147,1.7923,2.327,2.504,1.5634,1.1975,2.1311,2.1584,1.2862,1.3845,2.5986,1.1209,2.0889,1.5221,1.6513,1.7077,1.9752,1.9134,1.1966,1.6005,1.9809,1.7329,1.8433,2.0246
cg16162899,2.1517,2.7948,2.3062,1.942,2.8927,1.8399,1.9099,2.0371,2.4228,2.1686,1.8312,2.1836,1.7629,2.6095,2.7138,2.5703,2.119,1.8499,1.9446,1.9159,2.9282,1.932,0.38131,3.0097,2.4452,1.5744,2.6953,2.1839,3.1163,3.0527,1.8241,2.6925,3.3645,2.642,1.4119,2.2958,2.8509,3.3208,2.4509,2.6353,1.7834,2.2391,0.22496,0.80687,2.6521,1.6892,2.0297,2.81,1.8371,3.0237,2.7407,1.2885,1.9746,2.8439,1.257,1.4807,0.69275,2.4834,2.6393,3.0596,2.6737,2.6877,3.2334,1.9952,2.5764,1.9402,2.4589,2.9012,0.47042,2.1433,2.6603,2.914,1.9945,2.1031,2.8182,3.475,0.81316,1.2086,1.8864,2.8301,1.9858,2.6541


In [54]:
methyl_gene_m_gbgc = (
    methylation['methyl_probe_m']
    .merge(probe_to_gene.set_index('probe'), left_index=True, right_index=True)
    .set_index('gene')
    .unstack()
    .reset_index()
    .rename(columns={'level_0': 'c', 'gene': 'g', 0: 'methyl_probe_m_gbgc'})
    .groupby(['g', 'c'])
    .agg(['max', 'min', 'mean', 'std'])
    .reset_index()
)
methyl_gene_m_gbgc.columns = ['_'.join(c).strip('_') for c in methyl_gene_m_gbgc.columns]

In [55]:
display(methyl_gene_m_gbgc.head())
print(methyl_gene_m_gbgc.shape)

Unnamed: 0,g,c,methyl_probe_m_gbgc_max,methyl_probe_m_gbgc_min,methyl_probe_m_gbgc_mean,methyl_probe_m_gbgc_std
0,A1BG,22RV1,3.3757,-3.7236,2.072653,1.962828
1,A1BG,647-V,2.2957,-0.20359,0.931899,0.683624
2,A1BG,A549,3.2095,-1.3556,1.824773,1.18244
3,A1BG,BFTC-905,3.1022,-3.4555,1.724413,1.903672
4,A1BG,BT-20,2.54,-2.406,1.531114,1.287592


(1260258, 6)


In [56]:
methyl_gene_beta_gbgc = (
    methylation['methyl_probe_beta']
    .merge(probe_to_gene.set_index('probe'), left_index=True, right_index=True)
    .set_index('gene')
    .unstack()
    .reset_index()
    .rename(columns={'level_0': 'c', 'gene': 'g', 0: 'methyl_probe_beta_gbgc'})
    .groupby(['g', 'c'])
    .agg(['max', 'min', 'mean', 'std'])
    .reset_index()
)
methyl_gene_beta_gbgc.columns = ['_'.join(c).strip('_') for c in methyl_gene_beta_gbgc.columns]

In [57]:
display(methyl_gene_beta_gbgc.head())
print(methyl_gene_beta_gbgc.shape)

Unnamed: 0,g,c,methyl_probe_beta_gbgc_max,methyl_probe_beta_gbgc_min,methyl_probe_beta_gbgc_mean,methyl_probe_beta_gbgc_std
0,A1BG,22RV1,0.91212,0.070372,0.777246,0.249041
1,A1BG,647-V,0.8308,0.46478,0.648897,0.102923
2,A1BG,A549,0.90244,0.28098,0.759206,0.168752
3,A1BG,BFTC-905,0.89569,0.083544,0.740844,0.25011
4,A1BG,BT-20,0.85329,0.15873,0.727775,0.18254


(1260258, 6)


In [58]:
# Combine
methyl_gene_gbgc = (
    methyl_gene_m_gbgc
    .merge(methyl_gene_beta_gbgc, on=['g', 'c'], how='outer')
)

In [59]:
methyl_gene_gbgc.head()

Unnamed: 0,g,c,methyl_probe_m_gbgc_max,methyl_probe_m_gbgc_min,methyl_probe_m_gbgc_mean,methyl_probe_m_gbgc_std,methyl_probe_beta_gbgc_max,methyl_probe_beta_gbgc_min,methyl_probe_beta_gbgc_mean,methyl_probe_beta_gbgc_std
0,A1BG,22RV1,3.3757,-3.7236,2.072653,1.962828,0.91212,0.070372,0.777246,0.249041
1,A1BG,647-V,2.2957,-0.20359,0.931899,0.683624,0.8308,0.46478,0.648897,0.102923
2,A1BG,A549,3.2095,-1.3556,1.824773,1.18244,0.90244,0.28098,0.759206,0.168752
3,A1BG,BFTC-905,3.1022,-3.4555,1.724413,1.903672,0.89569,0.083544,0.740844,0.25011
4,A1BG,BT-20,2.54,-2.406,1.531114,1.287592,0.85329,0.15873,0.727775,0.18254


In [60]:
db.import_table(
    methyl_gene_gbgc, 
    'methyl_gbgc', [
        [('g', 'c'), True],
        [('c', 'g'), False],
    ],
)

DEBUG:csv2sql.core:Uncompressing file...
INFO:csv2sql.core:bzip2 -dkf '/home/kimlab1/database_data/biodata/recipes/az_dream_2015/notebooks/methyl/methyl_gbgc.tsv.bz2'
DEBUG:csv2sql.core:Running on host: '192.168.6.8'
DEBUG:csv2sql.core:Initializing SSH client: '192.168.6.8'
DEBUG:csv2sql.core:Command ran successfully!
DEBUG:csv2sql.core:output: 
DEBUG:csv2sql.core:Loading data into MySQL table: 'methyl_gbgc'...
DEBUG:csv2sql.core:Running locally
DEBUG:csv2sql.core:Command ran successfully!
DEBUG:csv2sql.core:output: 
DEBUG:csv2sql.core:Removing uncompressed file '/home/kimlab1/database_data/biodata/recipes/az_dream_2015/notebooks/methyl/methyl_gbgc.tsv'...
INFO:csv2sql.core:rm -f '/home/kimlab1/database_data/biodata/recipes/az_dream_2015/notebooks/methyl/methyl_gbgc.tsv'
DEBUG:csv2sql.core:Running locally
DEBUG:csv2sql.core:Command ran successfully!
DEBUG:csv2sql.core:output: 


In [None]:
csv2sql.g2d.g2d('methyl_gbgc')

DROP TABLE IF EXISTS az_dream_2015_features.methyl_gbdc;

CREATE TABLE az_dream_2015_features.methyl_gbdc AS
SELECT
d2t.drug d,
t.c c,
max(t.methyl_probe_m_gbgc_max) methyl_probe_m_gbgc_max_max,
min(t.methyl_probe_m_gbgc_min) methyl_probe_m_gbgc_min_min,
avg(t.methyl_probe_m_gbgc_mean) methyl_probe_m_gbgc_mean_mean,
avg(t.methyl_probe_m_gbgc_std) methyl_probe_m_gbgc_std_mean,
max(t.methyl_probe_beta_gbgc_max) methyl_probe_beta_gbgc_max_max,
min(t.methyl_probe_beta_gbgc_min) methyl_probe_beta_gbgc_min_min,
avg(t.methyl_probe_beta_gbgc_mean) methyl_probe_beta_gbgc_mean_mean,
avg(t.methyl_probe_beta_gbgc_std) methyl_probe_beta_gbgc_std_mean
FROM az_dream_2015.drug_to_hgnc_target d2t
LEFT JOIN az_dream_2015_features.methyl_gbgc t ON (t.g = d2t.hgnc_name)
GROUP BY d2t.drug , t.c;

ALTER TABLE az_dream_2015_features.methyl_gbdc
MODIFY d VARCHAR(255);

CREATE INDEX a ON az_dream_2015_features.methyl_gbdc (d);

DROP INDEX a ON az_dream_2015_features.methyl_gbdc;

ALTER TABLE az_dream_2015_feat

## g

In [None]:
methyl_gene_m_gbg = (
    methylation['methyl_probe_m']
    .merge(probe_to_gene.set_index('probe'), left_index=True, right_index=True)
    .set_index('gene')
    .unstack()
    .reset_index()
    .rename(columns={'level_0': 'c', 'gene': 'g', 0: 'methyl_probe_m_gbg'})
    .groupby(['g'])
    .agg(['max', 'min', 'mean', 'std'])
    .reset_index()
)
methyl_gene_m_gbg.columns = ['_'.join(c).strip('_') for c in methyl_gene_m_gbg.columns]

In [None]:
methyl_gene_m_gbg.head()

In [None]:
methyl_gene_beta_gbg = (
    methylation['methyl_probe_beta']
    .merge(probe_to_gene.set_index('probe'), left_index=True, right_index=True)
    .set_index('gene')
    .unstack()
    .reset_index()
    .rename(columns={'level_0': 'c', 'gene': 'g', 0: 'methyl_probe_beta_gbg'})
    .groupby(['g'])
    .agg(['max', 'min', 'mean', 'std'])
    .reset_index()
)
methyl_gene_beta_gbg.columns = ['_'.join(c).strip('_') for c in methyl_gene_beta_gbg.columns]

In [None]:
methyl_gene_beta_gbg.head()

In [None]:
# Combine
methyl_gene_gbg = (
    methyl_gene_m_gbg
    .merge(methyl_gene_beta_gbg, on=['g'], how='outer')
)

In [None]:
methyl_gene_gbg.head()

In [None]:
!ls /home/kimlab1/database_data/biodata/recipes/az_dream_2015/notebooks/methyl/methyl_gbg.tsv

In [None]:
db.import_table(
    methyl_gene_gbg, 
    'methyl_gbg', [
        [('g'), True],
    ],
)

In [None]:
csv2sql.g2d.g2d('methyl_gbg')

## c

In [None]:
methylation.keys()

In [None]:
probe_sets.keys()

In [None]:
 methylation['methyl_probe_m'].head()

In [None]:
methyl_gbc = methylation['methyl_probe_m'].T[[]]

for key in methylation.keys():
    methyl_gbc[key + '_max'] = methylation[key].max()
    methyl_gbc[key + '_min'] = methylation[key].min()
    methyl_gbc[key + '_mean'] = methylation[key].mean()
    methyl_gbc[key + '_std'] = methylation[key].std()
    if '_probe' not in key:
        continue
    for probe_set_name, probe_set in probe_sets.items():
        methyl_gbc[key + '_' + probe_set_name + '_max'] = methylation[key].loc[probe_set, :].max()
        methyl_gbc[key + '_' + probe_set_name + '_min'] = methylation[key].loc[probe_set, :].min()
        methyl_gbc[key + '_' + probe_set_name + '_mean'] = methylation[key].loc[probe_set, :].mean()
        methyl_gbc[key + '_' + probe_set_name + '_std'] = methylation[key].loc[probe_set, :].std()

methyl_gbc = (
    methyl_gbc
    .reset_index()
    .rename(columns={'index': 'c'})
)
methyl_gbc.columns = [c.lower() for c in methyl_gbc.columns]

In [None]:
methyl_gbc.head()

In [None]:
db.import_table(
    methyl_gbc, 
    'methyl_gbc', [
        [('c'), True],
    ],
)

## cc

In [None]:
results_best_df.head()

In [None]:
dfs = []
for idx, row in results_best_df.iterrows():
    column_name = (
        '_'.join([row['feature'], row['metric'], row['probe_set'] if row['probe_set'] else ''])
        .lower()
    )
    print(column_name)
    df = row['df'].rename(columns={0: column_name})[['c_1', 'c_2', column_name]]
    dfs.append(df)
#     if subset not in ['', 'cgs', 'target']:
#         continue
#     print(subset)
#     for idx, row in results_best_df[results_best_df['subset'] == subset].iterrows():
#         group = row['group']
#         column_name = group + (('_of_' + subset) if subset else '')
#         df = row['df'].reindex_axis(['c_1', 'c_2', 0], axis=1).rename(columns={0: column_name})
#         dfs.append(df)

In [None]:
dfs[0].head()

In [None]:
final_df = dfs[0]
for df in dfs[1:]:
    final_df = final_df.merge(df, on=['c_1', 'c_2'], how='left')

In [None]:
final_df.head()

In [None]:
final_df.shape

In [None]:
db.import_table(
    final_df, 
    'methyl_gbcc', [
        [('c_1', 'c_2'), True],
    ],
)