In [1]:
import sys
import os
from os.path import dirname, realpath, join
# Project root is taken to be two directory levels above the current working
# directory, so the result depends on where the notebook is launched from.
base_dir = dirname(dirname(os.getcwd()))
import itertools
import pandas as pd
from os.path import join
# Bare expression: it is not the last statement of the cell, so nothing is
# displayed for it.
base_dir

# Put the project root first on sys.path; presumably this is what makes the
# project-local modules imported below resolvable -- confirm against the repo
# layout.
sys.path.insert(0, base_dir)
from config_path import PROSTATE_DATA_PATH, PLOTS_PATH, GENE_PATH
from data.data_access import Data


## Mutations

In [2]:
# Load the processed per-sample mutation table; the first CSV column becomes
# the row index (displayed further down as Tumor_Sample_Barcode).
mut_file = join(PROSTATE_DATA_PATH, 'processed/P1000_final_analysis_set_cross_important_only.csv')
mut_df = pd.read_csv(mut_file, index_col=0)

In [3]:
# Pull out the MAML3 column as integers and tabulate how many samples fall
# under each value.
maml3_mut = mut_df['MAML3'].astype(int)
maml3_mut.value_counts()

0    1003
1       8
Name: MAML3, dtype: int64

In [4]:
# Percentage of samples with MAML3 == 1: 8 samples over a hard-coded total
# of 1013.
# NOTE(review): the value_counts() output above sums to 1011 (1003 + 8), not
# 1013 -- presumably 1013 is the full cohort size; confirm the denominator.
100*8/1013.

0.7897334649555775

## Copy number 

In [5]:
# Name of the gene-list file that is passed through to the data loader.
selected_genes = 'tcga_prostate_expressed_genes_and_cancer_genes_and_memebr_of_reactome.csv'

# Loader configuration requesting copy-number ('cnv') features only.
data_params = dict(
    id='ALL',
    type='prostate_paper',
    params=dict(
        data_type=['cnv'],
        account_for_data_type=None,
        drop_AR=False,
        cnv_levels=5,
        mut_binary=False,
        balanced_data=False,
        combine_type='union',  # author's noted alternative: intersection
        use_coding_genes_only=True,
        selected_genes=selected_genes,
        selected_samples=None,
        training_split=0,
    ),
)

In [6]:
# Build the project's Data loader, expanding data_params into keyword
# arguments (id, type, params).
data_adapter = Data(**data_params)



In [7]:
# Fetch the full data set. Judging by how the values are used below, `x` is
# the feature matrix, `info` the row (sample) labels and `col` the column
# labels; `y` is presumably the response -- TODO confirm against Data.get_data.
x, y, info, col = data_adapter.get_data()

In [8]:
# Wrap the raw feature matrix in a labelled frame: `info` supplies the row
# index and `col` the column labels.
x_df = pd.DataFrame(data=x, index=info, columns=col)
x_df.head()

Unnamed: 0_level_0,RNF14,OR52B2,AGL,AGK,NCBP1,NCBP2,HSPA4,CELA2A,FTMT,AGA,...,PDCD6IP,BBS7,AP4M1,NPY5R,ACTL6A,GNGT1,AIP,WNT16,SELP,OR4A5
Unnamed: 0_level_1,cnv,cnv,cnv,cnv,cnv,cnv,cnv,cnv,cnv,cnv,...,cnv,cnv,cnv,cnv,cnv,cnv,cnv,cnv,cnv,cnv
00-029N9_LN,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01-087MM_BONE,-1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,-1.0,...,-1.0,-1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
01-095N1_LN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
01-120A1_LIVER,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-2.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
02-083E1_LN,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,-1.0,0.0,0.0


In [9]:
x_df[('MAML3', 'cnv')].value_counts()

 0.0    785
-1.0    104
 1.0     89
 2.0     25
-2.0     10
Name: (MAML3, cnv), dtype: int64

In [10]:
# Fetch the train/test partition from the same loader. Only the training
# pieces (x_train, info_train, columns) are used in the cells that follow.
x_train, x_test, y_train, y_test, info_train, info_test, columns = data_adapter.get_train_test()

In [11]:
# Labelled view of the training features: `info_train` as the row index,
# `columns` as the column labels.
x_train_df = pd.DataFrame(data=x_train, index=info_train, columns=columns)

In [12]:
x_train_df[('MAML3', 'cnv')].value_counts()

 0.0    708
-1.0     95
 1.0     77
 2.0     22
-2.0      9
Name: (MAML3, cnv), dtype: int64

In [13]:

# Ratio of training samples at MAML3 CNV level +2 to those at level -2.
# Computed from x_train_df instead of retyping the counts from the
# value_counts() output, so it cannot drift if the data changes.
# float() keeps this a true division even under Python 2 integer semantics.
maml3_train_cnv = x_train_df[('MAML3', 'cnv')]
float((maml3_train_cnv == 2).sum()) / float((maml3_train_cnv == -2).sum())

2.4444444444444446

In [14]:
# Ratio of samples at MAML3 CNV level +2 to those at level -2 across the
# full data set. Computed from x_df instead of retyping the counts from the
# value_counts() output, so it cannot drift if the data changes.
# float() keeps this a true division even under Python 2 integer semantics.
maml3_all_cnv = x_df[('MAML3', 'cnv')]
float((maml3_all_cnv == 2).sum()) / float((maml3_all_cnv == -2).sum())

2.5

## Correlation with CNV burden

In [15]:
## get cnv burden
# NOTE: `base_dir` is rebound here; from the first cell it held the project
# root, from now on it points at the raw-data folder.
base_dir = join(PROSTATE_DATA_PATH, 'raw_data')
filename = '41588_2018_78_MOESM5_ESM.xlsx'
# Skip the first two rows of the sheet and use the second column as the row
# index (displayed as Patient.ID in the output below).
# NOTE: the name `data` is reused further down for the joined analysis frame.
data= pd.read_excel(join(base_dir, filename), skiprows=2, index_col=1)
# Per-sample CNV burden column.
cnv= data['Fraction of genome altered']
cnv.head()

Patient.ID
AAPC-STID0000011640-Tumor-SM-2XU1H    0.010487
AAPC-STID0000021561-Tumor-SM-3RVWB    0.135831
AAPC-STID0000011949-Tumor-SM-2XU1I    0.190097
AAPC-STID0000021610-Tumor-SM-2XU13    0.054238
AAPC-STID0000021537-Tumor-SM-3RVW7    0.054551
Name: Fraction of genome altered, dtype: float64

In [16]:
maml3_cnv = x_df[('MAML3', 'cnv')]

In [17]:
# Cast the MAML3 CNV levels to int (they are stored as floats in x_df) and
# tabulate them again.
maml3_cnv =maml3_cnv.astype(int)
maml3_cnv.value_counts()

 0    785
-1    104
 1     89
 2     25
-2     10
Name: (MAML3, cnv), dtype: int64

In [18]:

# Collapse the five CNV levels into a binary flag: 1 for samples at level +2
# or -2, 0 for levels -1, 0 and +1.
# The flag is derived straight from x_df rather than from the current value
# of maml3_cnv so that the cell is idempotent: applying the former
# {-1: 0, 1: 0, 2: 1, -2: 1} mapping a second time to its own 0/1 output
# would silently turn every 1 back into 0.
maml3_cnv = (x_df[('MAML3', 'cnv')].abs() == 2).astype(int)

In [19]:
maml3_cnv.sum()

35

In [20]:
maml3_cnv.head()

00-029N9_LN       1
01-087MM_BONE     0
01-095N1_LN       0
01-120A1_LIVER    1
02-083E1_LN       1
Name: (MAML3, cnv), dtype: int64

In [21]:
maml3_mut.head()

Tumor_Sample_Barcode
00-029N9_LN       0
01-087MM_BONE     0
01-095N1_LN       0
01-120A1_LIVER    0
02-083E1_LN       0
Name: MAML3, dtype: int64

In [22]:
maml3_cnv.sum()

35

In [23]:

# A sample has a MAML3 event when it carries a mutation and/or a level +/-2
# copy-number change.
# The mutation and CNV series do not cover identical sample sets (1011 vs.
# 1013 rows in the tabulations above). A plain `+` aligns on the index and
# yields NaN wherever one series lacks the sample, and NaN > 0 is False, so a
# real CNV event would be hidden for a sample without mutation data.
# fill_value=0 lets the value that is present decide.
maml3_event = maml3_mut.add(maml3_cnv, fill_value=0)
maml3_event = maml3_event > 0.
maml3_event_df = maml3_event.to_frame()
maml3_event_df.columns = ['MAML3']

In [24]:
# Line up CNV burden with the MAML3 event flag per sample. The inner join
# keeps only samples whose index label appears in both frames.
cnv_df = cnv.to_frame()
# NOTE: this rebinds `data`, which until now held the full supplementary
# table read from the Excel file.
data = cnv_df.join(maml3_event_df, how='inner')

In [25]:
data.head()

Unnamed: 0,Fraction of genome altered,MAML3
AAPC-STID0000011640-Tumor-SM-2XU1H,0.010487,False
AAPC-STID0000021561-Tumor-SM-3RVWB,0.135831,False
AAPC-STID0000011949-Tumor-SM-2XU1I,0.190097,False
AAPC-STID0000021610-Tumor-SM-2XU13,0.054238,False
AAPC-STID0000021537-Tumor-SM-3RVW7,0.054551,False


In [26]:
data.shape

(989, 2)

In [27]:
from scipy import stats

# Point-biserial correlation between the binary MAML3 event flag and the
# fraction of genome altered.
# Dedicated names are used on purpose: `x` and `y` already hold the arrays
# returned by data_adapter.get_data(), and x_df is built from `x`, so
# overwriting them here would break re-running the earlier cells.
maml3_event_flags = data['MAML3'].values
fraction_genome_altered = data['Fraction of genome altered'].values

stats.pointbiserialr(maml3_event_flags, fraction_genome_altered)


PointbiserialrResult(correlation=-0.005881896435257775, pvalue=0.8534300466028149)

In [39]:
# Single-column frame with the MAML3 CNV levels taken straight from x_df
# (the untransformed five-level values, not the binary flag).
maml3_cnv_del_amps = x_df[('MAML3', 'cnv')].to_frame()
# Drop the outer column level so the remaining column label is just 'cnv'.
maml3_cnv_del_amps.columns= maml3_cnv_del_amps.columns.droplevel(0)
maml3_cnv_del_amps.head()
# maml3_cnv_del_amps.replace()

Unnamed: 0,cnv
00-029N9_LN,2.0
01-087MM_BONE,0.0
01-095N1_LN,0.0
01-120A1_LIVER,-2.0
02-083E1_LN,2.0


In [40]:
# Attach the MAML3 CNV level to each sample's CNV burden; the inner join keeps
# only samples whose index label appears in both frames.
data_del_amps = cnv_df.join(maml3_cnv_del_amps, how='inner')

In [41]:
data_del_amps.head()

Unnamed: 0,Fraction of genome altered,cnv
AAPC-STID0000011640-Tumor-SM-2XU1H,0.010487,0.0
AAPC-STID0000021561-Tumor-SM-3RVWB,0.135831,0.0
AAPC-STID0000011949-Tumor-SM-2XU1I,0.190097,0.0
AAPC-STID0000021610-Tumor-SM-2XU13,0.054238,0.0
AAPC-STID0000021537-Tumor-SM-3RVW7,0.054551,1.0


In [42]:
# Boolean masks over the joined frame: CNV level above +1 and below -1,
# followed by the number of samples selected by each mask.
ind_amp = data_del_amps['cnv'].gt(1)
ind_del = data_del_amps['cnv'].lt(-1)
ind_amp.sum(), ind_del.sum()

(25, 10)

In [49]:
# CNV burden of the samples picked out by each mask.
cnv_amp = data_del_amps['Fraction of genome altered'][ind_amp]
cnv_del = data_del_amps['Fraction of genome altered'][ind_del]

In [50]:
from scipy.stats import ttest_ind

# Two-sample t-test comparing CNV burden between the samples selected by
# ind_amp and those selected by ind_del.
# NOTE(review): called with scipy's defaults, while the two groups differ in
# size (25 vs. 10 per the mask counts above) -- confirm that the default
# equal-variance assumption is intended rather than equal_var=False (Welch).
t_stat, p = ttest_ind(cnv_amp.values, cnv_del.values)
t_stat, p

(2.244326728646481, 0.03163677866754105)