In [1]:
from sklearn.preprocessing import LabelEncoder
from correlation import compute_mic, compute_ks
from minepy import MINE
from scipy.stats import ks_2samp

import pandas as pd

In [2]:
# Names of the FISH columns in the clinical table. Later cells use this list to
# separate FISH columns from the remaining clinical columns.
fish = [
    # locus-style names
    '13q14', '13q34', '17p13', '1q21', '11p15', '15q15', '19q13',
    '20q13', '21q22', '3q21', '5q31', '7q22', '9q33',
    # 't_'-prefixed names (presumably translocations -- confirm)
    't_11_14_ccnd1', 't_12_14_ccnd2', 't_14_16_maf', 't_14_20_mafb',
    't_4_14_whsc1', 't_6_14_ccnd3', 't_8_14_mafa', 't_8_14_myc',
]

In [3]:
# Days-to-disease-progression table, indexed by the 'ID' column.
# Rows containing any missing value are discarded right after loading.
dtdp_path = 'data/mmrf/clinical_outcome/time_to_endpoint/days_to_disease_progression.tsv'
dtdp_loaded = pd.read_csv(dtdp_path, sep='\t', index_col='ID')
dtdp = dtdp_loaded.dropna()

dtdp.head()

Unnamed: 0_level_0,days_to_disease_progression
ID,Unnamed: 1_level_1
MMRF2754,7
MMRF2151,12
MMRF1450,15
MMRF1634,15
MMRF1778,18


# Clinical

In [4]:
# Two-sample KS test of every non-FISH clinical column against the response
# stored in the first column of the clinical table.
clinical = pd.read_csv('data/clinical.tsv', sep='\t', index_col='ID')

# Kept as a one-column DataFrame (not a Series) so it can be joined below.
resp = clinical.iloc[:, [0]]

# NOTE(review): the scan starts at column 2, so 'response_days_to_first_response'
# is scored as a feature here (it is the first row of the saved output) --
# confirm this is intended.
feature_names, p_values = [], []

for col in clinical.columns[2:]:
    # FISH columns are handled in a separate cell.
    if col in fish:
        continue

    values = clinical[col].dropna(how='any')

    # Object-typed columns are label-encoded to integer codes; everything
    # else is used as-is.
    if values.dtype == 'object':
        encoded = LabelEncoder().fit_transform(values)
        feature = pd.DataFrame({col: encoded}, index=values.index)
    else:
        feature = pd.DataFrame(values)

    # Inner join keeps only IDs present in both the feature and the response.
    paired = feature.join(resp, how='inner')
    x = paired.iloc[:, 0]  # feature values
    y = paired.iloc[:, 1]  # response values

    # Compare the feature distribution of the y == 0 group with that of the
    # y == 1 group; element [1] of the result is the p-value.
    p_value = ks_2samp(x[y == 0], x[y == 1])[1]

    feature_names.append(col)
    p_values.append(p_value)

kss = pd.DataFrame({'feature': feature_names, 'metric': p_values})

kss.to_csv('output/correlation_clinical_x_brft.csv', sep=',', index=False)

kss.head()

Unnamed: 0,feature,metric
0,response_days_to_first_response,0.267081
1,cmmc,0.045612
2,ecog_ps,0.971618
3,cell_markers,0.850335
4,percent_aneuploid,0.947623


In [5]:
# MIC score of every non-FISH clinical column (from column 3 onwards) against
# days to disease progression.
clinical = pd.read_csv('data/clinical.tsv', sep='\t', index_col='ID')

resp = dtdp

mine = MINE()

scored = []  # (feature name, MIC) pairs, in column order

for col in clinical.columns[3:]:
    # FISH columns are handled in a separate cell.
    if col in fish:
        continue

    values = clinical[col].dropna(how='any')

    # Object-typed columns are label-encoded to integer codes first.
    if values.dtype == 'object':
        encoded = LabelEncoder().fit_transform(values)
        feature = pd.DataFrame({col: encoded}, index=values.index)
    else:
        feature = pd.DataFrame(values)

    # Inner join keeps only IDs that have both a feature value and an endpoint.
    paired = feature.join(resp, how='inner')
    x = paired.iloc[:, 0]  # feature values
    y = paired.iloc[:, 1]  # endpoint values

    # compute_score must run before mic() is read for this pair.
    mine.compute_score(x, y)
    scored.append((col, mine.mic()))

mics = pd.DataFrame(scored, columns=['feature', 'metric'])

mics.to_csv('output/correlation_clinical_x_m2dp.csv', sep=',', index=False)

mics.head()

Unnamed: 0,feature,metric
0,cmmc,0.167826
1,ecog_ps,0.160572
2,cell_markers,0.162312
3,percent_aneuploid,0.178647
4,percent_plama_cells_bone_marrow,0.161906


# FISH

In [6]:
# Two-sample KS test between the response (first column of the clinical table)
# and every FISH column.
clinical = pd.read_csv('data/clinical.tsv', sep='\t', index_col='ID')

# One-column response frame. Rows with a missing response are removed up
# front: in this cell the response values themselves are the samples handed
# to ks_2samp, so any NaN left here would end up inside the test inputs.
resp = clinical.iloc[:, [0]].dropna(how='any')

kss = {'feature': [], 'metric': []}

for c in clinical.columns[2:]:

    # Only FISH columns are scored in this cell.
    if c not in fish:
        continue

    tmp = clinical[c].dropna(how='any')

    # Object-typed columns are label-encoded to integer codes; the Series is
    # rebuilt with the original index and name so it can still be joined.
    if tmp.dtype == 'object':
        tmp = pd.Series(LabelEncoder().fit_transform(tmp), index=tmp.index, name=c)

    # Inner join keeps only IDs with both a response and a FISH value.
    tmp = resp.join(tmp, how='inner')

    # NOTE(review): because the response is joined first, x is the response
    # and y is the FISH column, i.e. the response distribution is compared
    # between FISH groups 0 and 1. Confirm this orientation is intended.
    x = tmp.iloc[:, 0]
    y = tmp.iloc[:, 1]

    # Element [1] of the ks_2samp result is the p-value.
    ks_result = ks_2samp(x[y == 0], x[y == 1])[1]

    kss['feature'].append(c)

    kss['metric'].append(ks_result)

kss = pd.DataFrame(kss)

kss.to_csv('output/correlation_fish_x_brft.csv', sep=',', index=False)

kss.head()

Unnamed: 0,feature,metric
0,13q14,0.787558
1,13q34,0.990075
2,17p13,0.977247
3,1q21,1.0
4,11p15,0.425851


In [7]:
# MIC score of every FISH column against days to disease progression.
clinical = pd.read_csv('data/clinical.tsv', sep='\t', index_col='ID')

mine = MINE()

resp = dtdp

mics = {'feature': [], 'metric': []}

for c in clinical.columns[2:]:

    # Only FISH columns are scored in this cell.
    if c not in fish:
        continue

    tmp = clinical[c].dropna(how='any')

    if tmp.dtype == 'object':
        # Object-typed columns are label-encoded to integer codes.
        tmp = pd.DataFrame({c: LabelEncoder().fit_transform(tmp)}, index=tmp.index)

    else:
        # A pandas Series has no .join(); without this conversion a
        # non-object FISH column would raise AttributeError on the join below.
        tmp = tmp.to_frame()

    # Inner join keeps only IDs that have both a FISH value and an endpoint.
    tmp = tmp.join(resp, how='inner')

    # x: FISH column values, y: endpoint values.
    x, y = tmp.iloc[:, 0], tmp.iloc[:, 1]

    # compute_score must run before mic() is read for this pair.
    mine.compute_score(x, y)

    mics['feature'].append(c)

    mics['metric'].append(mine.mic())

mics = pd.DataFrame(mics)

mics.to_csv('output/correlation_fish_x_m2dp.csv', sep=',', index=False)

mics.head()

Unnamed: 0,feature,metric
0,13q14,0.200009
1,13q34,0.167785
2,17p13,0.136255
3,1q21,0.172007
4,11p15,0.191768


# GENE EXPRESSION

In [8]:
# Gene count table, indexed by the 'ID' column so it can be joined to the
# clinical and endpoint tables.
genecount_path = 'data/gene_count.tsv'
genecount = pd.read_csv(genecount_path, index_col='ID', sep='\t')

### Days to First Response

In [9]:
# KS scores of every gene-count column against days to first response.
# Rows with a missing response are dropped before joining the gene counts.
df = clinical[['response_days_to_first_response']].dropna(how='any').join(genecount, how='inner')

print(df.shape)

kss = compute_ks(df, ['response_days_to_first_response'])

# Only one dependent variable is scored, so its column is dropped and the
# remaining two columns are renamed to the common feature/metric layout.
del kss['DEP_VAR']

kss.columns = ['feature', 'metric']

# Written to its own file: 'output/correlation_geneexp_x_brft.csv' is also the
# target of the best-response cell, which would overwrite these results.
kss.to_csv('output/correlation_geneexp_x_dtfr.csv', sep=',', index=False)

kss.head()

(687, 27778)


Unnamed: 0,feature,metric
0,ENSG00000000003,0.927611
1,ENSG00000000005,0.989423
2,ENSG00000000419,0.099205
3,ENSG00000000457,0.141905
4,ENSG00000000460,0.017151


### Best Response First Line Therapy

In [10]:
# KS scores of every gene-count column against best response to first-line
# therapy.
response_col = 'response_best_response_first_line'

# Keep only rows with a recorded response, then attach the gene counts by ID.
response_known = clinical[[response_col]].dropna(how='any')
df = response_known.join(genecount, how='inner')

# NOTE(review): assumes compute_ks returns a DataFrame with a 'DEP_VAR' column
# plus two further columns -- confirm against the helper.
kss = compute_ks(df, [response_col]).drop(columns='DEP_VAR')
kss.columns = ['feature', 'metric']

kss.to_csv('output/correlation_geneexp_x_brft.csv', sep=',', index=False)

kss.head()

Unnamed: 0,feature,metric
0,ENSG00000000003,0.661604
1,ENSG00000000005,0.999558
2,ENSG00000000419,0.978138
3,ENSG00000000457,0.824283
4,ENSG00000000460,0.761516


In [11]:
# MIC scores of every gene-count column against days to disease progression.
endpoint_col = dtdp.columns[0]

# Inner join keeps only IDs present in both the endpoint and gene-count tables.
df = dtdp.join(genecount, how='inner')

# NOTE(review): assumes compute_mic returns a DataFrame with a 'DEP_VAR' column
# plus two further columns -- confirm against the helper.
mics = compute_mic(df, [endpoint_col]).drop(columns='DEP_VAR')
mics.columns = ['feature', 'metric']

mics.to_csv('output/correlation_geneexp_x_m2dp.csv', sep=',', index=False)

mics.head()

Unnamed: 0,feature,metric
0,ENSG00000000003,0.157618
1,ENSG00000000005,0.15179
2,ENSG00000000419,0.197359
3,ENSG00000000457,0.158206
4,ENSG00000000460,0.17666
