In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import os

# Load & preprocess mutation data

In [2]:
# Load the per-sample mutation table; the first CSV column becomes the row index
df_brca = pd.read_csv("data/brca.mut.csv", index_col=0)

In [3]:
# Preview the first rows of the mutation table
df_brca.head()

Unnamed: 0_level_0,A1CF,A2M,A2ML1,A4GALT,AAAS,AACS,AADAC,AADACL2,AADACL4,AAGAB,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-A1-A0SB-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SD-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SE-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SF-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SG-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Values are only 0 or 1, so an 8-bit integer is enough (the default is 64-bit)
df = df_brca.astype(np.int8)

# Prepare categories

We want to know whether a sample belongs to a tumor or normal tissue.

TCGA documentation (https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/) mentions that this information is encoded in the sample barcode.

For instance, in sample `TCGA-05-4244-01A-01R-1107-07` the fourth identifier is `01A`, which means 'Tumor' (`01`), whereas sample `TCGA-91-6829-11A-01R-1858-07` has the fourth identifier `11A`, which means 'Normal' (`11`).



In [5]:
def to_tumor_normal(barcode):
    """Tell whether a TCGA barcode belongs to a tumor sample.

    The barcode is split on '-' and the fourth field (e.g. ``01A`` or ``11``)
    is inspected: the result is True when that field starts with '0'
    (``01`` = tumor) and False otherwise (e.g. ``11`` = normal).

    Raises IndexError if the barcode has fewer than four fields or the
    fourth field is empty.
    """
    sample_field = barcode.split('-')[3]
    first_digit = sample_field[0]
    return first_digit == '0'

# Flag every sample of the mutation table: True = tumor, False = not tumor
tn = list(map(to_tumor_normal, df.index.values))
y = np.array(tn)

# (tumor count, non-tumor count); the second number is 0 when all samples are tumor
y.sum(), (1 - y).sum()

(982, 0)

### Load clinical table

In [6]:
# Load the clinical annotations (tab-separated file)
df_clinical = pd.read_csv("data/brca_tcga_clinical_data.tsv", sep="\t", low_memory=False)

In [7]:
# Give the three columns we use short names, drop everything else, and index
# the table by sample ID (drop=False keeps `sample_id` as a regular column too)
cli = (
    df_clinical
    .rename(columns={
        'Sample ID': 'sample_id',
        'Overall Survival (Months)': 'os',
        'Overall Survival Status': 'os_status',
    })
    .loc[:, ['sample_id', 'os', 'os_status']]
    .set_index('sample_id', drop=False)
)
cli

Unnamed: 0_level_0,sample_id,os,os_status
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA-3C-AAAU-01,TCGA-3C-AAAU-01,132.95,LIVING
TCGA-3C-AALI-01,TCGA-3C-AALI-01,131.57,LIVING
TCGA-3C-AALJ-01,TCGA-3C-AALJ-01,48.42,LIVING
TCGA-3C-AALK-01,TCGA-3C-AALK-01,47.57,LIVING
TCGA-4H-AAAK-01,TCGA-4H-AAAK-01,11.43,LIVING
...,...,...,...
TCGA-WT-AB44-01,TCGA-WT-AB44-01,29.01,LIVING
TCGA-XX-A899-01,TCGA-XX-A899-01,15.34,LIVING
TCGA-XX-A89A-01,TCGA-XX-A89A-01,16.03,LIVING
TCGA-Z7-A8R5-01,TCGA-Z7-A8R5-01,107.98,LIVING


In [8]:
# Mark samples where survival time or survival status is missing
to_remove = cli[['os', 'os_status']].isna().any(axis=1)
int(to_remove.sum())

6

In [9]:
# Keep only the samples that have both survival time and survival status
cli = cli.loc[~to_remove].copy()

In [10]:
# Number of samples per survival status
cli['os_status'].value_counts()

LIVING      948
DECEASED    154
Name: os_status, dtype: int64

In [11]:
# Keep only samples whose status is 'DECEASED'; all other samples are dropped
# from the analysis at this point (they are removed, not modeled as censored).
# NOTE(review): presumably `os` for 'LIVING' patients is follow-up time rather
# than time to death, which is why they are excluded - confirm. Discarding them
# may bias the survival estimates.
cli = cli.loc[cli.os_status == 'DECEASED'].copy()

In [12]:
# After the status filter only the sample ID and survival time are needed
cli = cli.loc[:, ['sample_id', 'os']].copy()
cli

Unnamed: 0_level_0,sample_id,os
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
TCGA-A1-A0SK-01,TCGA-A1-A0SK-01,31.77
TCGA-A2-A04P-01,TCGA-A2-A04P-01,18.00
TCGA-A2-A04V-01,TCGA-A2-A04V-01,63.07
TCGA-A2-A0CM-01,TCGA-A2-A0CM-01,24.77
TCGA-A2-A0CO-01,TCGA-A2-A0CO-01,114.72
...,...,...
TCGA-LL-A73Z-01,TCGA-LL-A73Z-01,7.46
TCGA-OL-A5D6-01,TCGA-OL-A5D6-01,36.27
TCGA-OL-A66K-01,TCGA-OL-A66K-01,41.89
TCGA-PE-A5DC-01,TCGA-PE-A5DC-01,46.98


### Intersect clinical and mutation data

In [13]:
# Intersect sample IDs: keep the clinical samples that also have mutation data,
# in the order they appear in the clinical table
cli_samples = list(cli.index.values)
df_samples = list(df.index.values)

# Test membership against a set: looking each ID up in the list made this
# step quadratic in the number of samples
df_sample_set = set(df_samples)
keep = [s for s in cli_samples if s in df_sample_set]
len(keep)

140

In [14]:
# Mutation rows for the retained samples, in the order given by `keep`
df_x = df.loc[keep].copy()
df_x.head()

Unnamed: 0_level_0,A1CF,A2M,A2ML1,A4GALT,AAAS,AACS,AADAC,AADACL2,AADACL4,AAGAB,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-A1-A0SK-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A04P-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A04V-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A0CM-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A0CO-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Survival time ('os') for the retained samples
s_y = cli.loc[keep, 'os'].copy()

# Put the target and the mutation features side by side: `os` becomes the first
# column, followed by one column per gene. Note that `df` is rebound here and
# from now on refers to this combined table.
df = pd.concat([s_y, df_x], axis=1, sort=True)
df

Unnamed: 0,os,A1CF,A2M,A2ML1,A4GALT,AAAS,AACS,AADAC,AADACL2,AADACL4,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
TCGA-A1-A0SK-01,31.77,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A04P-01,18.00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A04V-01,63.07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A0CM-01,24.77,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A0CO-01,114.72,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-GM-A2DA-01,216.59,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-HN-A2OB-01,62.42,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-OL-A5D6-01,36.27,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-OL-A66K-01,41.89,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Are there samples without any mutation?
# `os` is excluded from the row sum: it holds survival time, not a mutation
# flag, and adding it would make the sum non-zero for any sample with os > 0,
# hiding samples that have no mutations at all.
# errors='ignore' keeps the cell usable if `df` has no `os` column.
count_mut_per_sample = df.drop(columns='os', errors='ignore').sum(axis=1)
(count_mut_per_sample == 0).sum()

0

In [17]:
# Are there genes with zero or a low number of mutations?
# Column sums: for a gene column this is the number of mutated samples.
# The `os` column is summed as well, so it also gets an entry in `keep`
# and is included in the count displayed below.
count_mut_per_gene = df.sum(axis=0)
keep = count_mut_per_gene.gt(3)
keep.sum()

156

In [18]:
# Names of the columns that passed the filter, then restrict the table to them
keep_names = keep[keep].index
df = df.loc[:, keep_names].copy()
df.head()

Unnamed: 0,os,ABCA13,ABCA8,ABCC8,ADRBK1,AKAP9,ALMS1,ANAPC1,ANK3,APC,...,UNC13C,USO1,USP34,USP9X,XIRP2,ZC3H13,ZCCHC6,ZNF208,ZNF469,ZZEF1
TCGA-A1-A0SK-01,31.77,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
TCGA-A2-A04P-01,18.0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A04V-01,63.07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A0CM-01,24.77,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A0CO-01,114.72,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Sanity check: smallest column sum left after filtering (the filter kept sums > 3)
df.sum(axis=0).min()

4.0

# Create dataset for model training

In [20]:
# Feature matrix: every column except the first one (`os`, the regression target)
x = df.iloc[:,1:].to_numpy()
x

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [21]:
# Regression target: the first column (`os`, overall survival in months).
# This rebinds `y`, which earlier held the tumor/normal flags.
y = df.iloc[:,0].to_numpy()
y

array([ 31.77,  18.  ,  63.07,  24.77, 114.72,  77.14,   5.19,  61.89,
        27.1 ,  44.84,   8.38,  23.75,  33.9 ,  29.96,  47.27,  35.91,
        20.17,  29.01,  30.98,  37.02,   9.99, 111.99,  13.99,   5.72,
        86.6 ,  26.02,  14.65,   6.47,  81.57,  95.63,  58.9 ,  26.02,
         5.65,  55.81,  83.8 ,  17.21,  41.79,  94.15,  93.76,  20.24,
        54.17,  10.58,  77.56,  32.56,  24.61,  11.89, 129.47,  50.66,
        65.47, 146.39, 112.29,  18.76, 127.23,  84.53, 128.98, 212.09,
       102.69, 140.18,  97.4 ,  79.4 ,  18.33,  20.99, 113.7 , 129.6 ,
       244.91,  28.25,  68.89,  58.51,  37.84,  18.82,  43.5 ,  75.43,
        46.35,  32.56,  20.11,  90.77,  26.64,  72.5 ,  37.71,  30.26,
        55.58,  37.52,  66.  ,   7.36,  51.35,  51.12,  51.12,  69.88,
        91.92, 113.73, 113.73,  82.79,  42.25,  11.99,  55.65,  49.54,
        17.67,  25.79,  31.5 ,  89.09,  97.4 ,  25.07, 114.06,  33.15,
        74.67, 122.73,  33.97,  63.3 ,  54.96,  45.6 ,  72.01,  53.94,
      

Make sure the dimensions of `x` and `y` match

In [22]:
# Gene names, in the same order as the columns of `x` (first column `os` skipped)
genes = df.columns[1:].tolist()
x.shape, y.shape, len(genes)

((140, 155), (140,), 155)

# Create regressors and analyze feature importance

In [23]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import GridSearchCV

def fit_cv(model, n_estimators_grid=(1, 3, 5, 10, 20, 30, 40, 50, 60, 80, 90, 100), cv=5):
    """Find the best number of estimators for the model by grid search.

    Runs a cross-validated ``GridSearchCV`` over ``n_estimators`` using the
    notebook-level feature matrix ``x`` and target ``y``.

    Parameters
    ----------
    model : estimator
        Estimator that accepts an ``n_estimators`` parameter.
    n_estimators_grid : iterable of int, optional
        Candidate values to try. Defaults to the grid originally hard-coded
        in this function.
    cv : optional
        Cross-validation setting forwarded to ``GridSearchCV`` (default 5).

    Returns
    -------
    The ``n_estimators`` value selected by the search. The value is also printed.
    """
    param_grid = [{'n_estimators': list(n_estimators_grid)}]
    gs = GridSearchCV(model, param_grid, cv=cv)
    gs.fit(x, y)
    n = gs.best_params_['n_estimators']
    print(f"Best 'n_estimators'= {n}")
    return n

def importance(model, top_n=10):
    """Fit the model and print its most important genes.

    The model is fitted in place on the notebook-level ``x`` and ``y``; its
    ``feature_importances_`` are then labelled with ``genes`` and printed in
    descending order.

    Parameters
    ----------
    model : estimator
        Estimator exposing ``feature_importances_`` after ``fit``.
    top_n : int, optional
        How many of the highest-ranked genes to print (default 10).

    Returns
    -------
    None. The ranking is printed only.
    """
    model.fit(x, y)
    ranked = pd.Series(model.feature_importances_, index=genes).sort_values(ascending=False)
    print('Top genes:')
    print(ranked.head(top_n))

In [24]:
# Settings shared by the search model and the final model
rf_kwargs = dict(n_jobs=-1, random_state=42)

# Cross-validated choice of `n_estimators`
n = fit_cv(RandomForestRegressor(**rf_kwargs))

# Build the model with the selected size; `importance` fits it and prints the top genes
rf = RandomForestRegressor(n_estimators=n, **rf_kwargs)
importance(rf)

Best 'n_estimators'= 100
Top genes:
TP53       0.055408
C5orf42    0.053313
SI         0.046321
OR5AK2     0.039489
MAP2K4     0.034253
OLFML2A    0.030662
PIK3CA     0.029774
MST1P9     0.025226
CAD        0.024349
MYH7       0.017562
dtype: float64


In [25]:
# Settings shared by the search model and the final model
gb_kwargs = dict(random_state=42)

# Cross-validated choice of `n_estimators`
n = fit_cv(GradientBoostingRegressor(**gb_kwargs))

# Build the model with the selected size; `importance` fits it and prints the top genes
gb = GradientBoostingRegressor(n_estimators=n, **gb_kwargs)
importance(gb)

Best 'n_estimators'= 1
Top genes:
C5orf42    0.351708
SI         0.235054
OLFML2A    0.196267
SPTA1      0.184286
MSH6       0.019945
HECW2      0.012178
DOCK3      0.000563
FASN       0.000000
FLG        0.000000
FCGBP      0.000000
dtype: float64


In [26]:
# Settings shared by the search model and the final model
et_kwargs = dict(n_jobs=-1, random_state=42)

# Cross-validated choice of `n_estimators`
n = fit_cv(ExtraTreesRegressor(**et_kwargs))

# Build the model with the selected size; `importance` fits it and prints the top genes
et = ExtraTreesRegressor(n_estimators=n, **et_kwargs)
importance(et)

Best 'n_estimators'= 3
Top genes:
TTN        0.079858
C5orf42    0.070960
TP53       0.058892
SI         0.047424
OLFML2A    0.040110
OR5AK2     0.038964
MAP2K4     0.038065
SPTA1      0.037181
MDN1       0.035238
PIK3CA     0.027472
dtype: float64
