In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import os

# Genomic data

In [2]:
# Load the per-gene mutation table; the first CSV column becomes the row index.
df_brca = pd.read_csv(
    "data/tcga_brca_mutations_by_gene.csv",
    index_col=0,
)

In [3]:
# Preview the first five rows of the mutation table.
df_brca.head(n=5)

Unnamed: 0_level_0,A1CF,A2M,A2ML1,A4GALT,AAAS,AACS,AADAC,AADACL2,AADACL4,AAGAB,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-A1-A0SB-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SD-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SE-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SF-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SG-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# The mutation table is expected to hold only 0/1 flags, so 8-bit integers
# are sufficient and use far less memory than the default 64-bit integers.
df = df_brca.astype(np.int8)

# Clinical data

In [5]:
# Load the tab-separated clinical table; low_memory=False makes pandas read the
# file in one pass instead of chunk by chunk when inferring column dtypes.
df_clinical = pd.read_csv(
    "data/tcga_brca_clinical_data.tsv",
    sep="\t",
    low_memory=False,
)

In [6]:
# Preview the first five rows of the clinical table.
df_clinical.head(n=5)

Unnamed: 0,Study ID,Patient ID,Sample ID,Diagnosis Age,American Joint Committee on Cancer Metastasis Stage Code,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code,Neoplasm Disease Stage American Joint Committee on Cancer Code,American Joint Committee on Cancer Publication Version Type,American Joint Committee on Cancer Tumor Stage Code,Brachytherapy first reference point administered total dose,...,Staging System.1,Surgery for positive margins,Surgery for positive margins other,Surgical procedure first,Time between clamping and freezing,Time between excision and freezing,Tissue Source Site,Person Neoplasm Status,Vial number,Patient's Vital Status
0,brca_tcga,TCGA-3C-AAAU,TCGA-3C-AAAU-01,55.0,MX,NX,Stage X,6th,TX,,...,,,,Modified Radical Mastectomy,,,3C,WITH TUMOR,A,Alive
1,brca_tcga,TCGA-3C-AALI,TCGA-3C-AALI-01,50.0,M0,N1a,Stage IIB,6th,T2,,...,,,,Lumpectomy,,,3C,TUMOR FREE,A,Alive
2,brca_tcga,TCGA-3C-AALJ,TCGA-3C-AALJ-01,62.0,M0,N1a,Stage IIB,7th,T2,,...,,,,Modified Radical Mastectomy,,,3C,TUMOR FREE,A,Alive
3,brca_tcga,TCGA-3C-AALK,TCGA-3C-AALK-01,52.0,M0,N0 (i+),Stage IA,7th,T1c,,...,,,,Simple Mastectomy,,,3C,TUMOR FREE,A,Alive
4,brca_tcga,TCGA-4H-AAAK,TCGA-4H-AAAK-01,50.0,M0,N2a,Stage IIIA,7th,T2,,...,,,,Modified Radical Mastectomy,,,4H,TUMOR FREE,A,Alive


In [7]:
# Short, attribute-friendly names for the three clinical columns used below.
column_aliases = {
    'Sample ID': 'sample_id',
    'Overall Survival (Months)': 'os',
    'Overall Survival Status': 'os_status',
}

# Keep only those columns and index the table by sample ID.
# Passing the column name to set_index moves the column into the index in one
# step, so no in-place mutation (set_index + drop with inplace=True) is needed
# and the cell yields the same result every time it is run.
cli = (
    df_clinical
    .rename(columns=column_aliases)
    .loc[:, ['sample_id', 'os', 'os_status']]
    .set_index('sample_id')
)

In [8]:
# Flag samples that lack the survival time or the survival status,
# then show how many samples are affected.
to_remove = cli[['os', 'os_status']].isna().any(axis=1)
to_remove.sum()

6

In [9]:
# Drop the samples whose survival time or survival status is missing.
cli = cli.dropna(subset=['os', 'os_status'])

In [10]:
# Tally the samples by overall-survival status.
cli['os_status'].value_counts()

LIVING      948
DECEASED    154
Name: os_status, dtype: int64

In [11]:
# Keep only the samples whose status is DECEASED.
# NOTE(review): samples with any other status (LIVING in the counts above) are
# discarded here rather than modelled as censored observations — confirm that
# this is the intended treatment.
is_deceased = cli['os_status'] == 'DECEASED'
cli = cli[is_deceased].copy()

In [12]:
# Only the survival time is needed from here on; keep it as a one-column frame
# (still indexed by sample ID) and display it.
cli = cli.loc[:, ['os']].copy()
cli

Unnamed: 0_level_0,os
sample_id,Unnamed: 1_level_1
TCGA-A1-A0SK-01,31.77
TCGA-A2-A04P-01,18.00
TCGA-A2-A04V-01,63.07
TCGA-A2-A0CM-01,24.77
TCGA-A2-A0CO-01,114.72
...,...
TCGA-LL-A73Z-01,7.46
TCGA-OL-A5D6-01,36.27
TCGA-OL-A66K-01,41.89
TCGA-PE-A5DC-01,46.98


# Intersect clinical and genomic data

In [13]:
# Align the clinical and mutation tables on their index (sample ID), keeping
# only samples that appear in both.
# Any clinical column already present in `df` (left over from an earlier run of
# this cell) is dropped first, so re-running the cell does not fail with an
# "overlapping columns" error. join() already returns a new frame, so no extra
# copy is needed.
df = cli.join(df.drop(columns=cli.columns, errors='ignore'), how='inner')

In [14]:
# Preview the first five rows of the joined table.
df.head(n=5)

Unnamed: 0,os,A1CF,A2M,A2ML1,A4GALT,AAAS,AACS,AADAC,AADACL2,AADACL4,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
TCGA-A1-A0SK-01,31.77,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A04P-01,18.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A04V-01,63.07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A0CM-01,24.77,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A0CO-01,114.72,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Cleanup dataset

In [15]:
# Are there samples without any mutations?
# Sum the gene columns only: 'os' holds the survival time, and including it
# would add that value to every row total and hide mutation-free samples.
count_mut_per_sample = df.drop(columns='os').sum(axis=1)
(count_mut_per_sample == 0).sum()

0

In [16]:
# Are there genes with zero or a low number of mutations?
# Column totals over the whole frame; note that the 'os' entry is a sum of
# survival times, not a mutation count.
count_mut_per_gene = df.sum(axis=0)

# Flag genes with more than 3 mutations (i.e. at least 4).
keep = count_mut_per_gene > 3

# Retain the survival column explicitly instead of relying on its total
# happening to exceed the mutation threshold.
keep['os'] = True

# Number of retained columns (the flagged genes plus 'os').
keep.sum()

156

In [17]:
# Restrict the frame to the columns flagged in `keep`, i.e. those whose column
# total is greater than 3 (the filter is `> 3`, so at least 4 mutations).
keep_names = keep[keep].index
df = df.loc[:, keep_names].copy()
df.head()

Unnamed: 0,os,ABCA13,ABCA8,ABCC8,ADRBK1,AKAP9,ALMS1,ANAPC1,ANK3,APC,...,UNC13C,USO1,USP34,USP9X,XIRP2,ZC3H13,ZCCHC6,ZNF208,ZNF469,ZZEF1
TCGA-A1-A0SK-01,31.77,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
TCGA-A2-A04P-01,18.0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A04V-01,63.07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A0CM-01,24.77,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-A2-A0CO-01,114.72,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Sanity check: the smallest mutation count among the retained genes.
# 'os' is excluded because its column total is a sum of survival times, not a
# mutation count.
df.drop(columns='os').sum(axis=0).min()

4.0

# Create dataset for model training

In [19]:
# Feature matrix: every column after the first (the gene mutation flags).
x = df.iloc[:, 1:].to_numpy()
# Target vector: the first column (the survival time 'os').
y = df.iloc[:, 0].to_numpy()

In [20]:
# Gene names, in the same order as the feature columns (first column skipped).
genes = df.columns[1:].tolist()

In [21]:
# Make sure the dimensions of x, y and the gene list agree: one row of x per
# target value and one column of x per gene name. Fail loudly if they do not,
# then display the shapes for reference.
assert x.shape == (len(y), len(genes)), "x, y and genes have inconsistent dimensions"
x.shape, y.shape, len(genes)

((140, 155), (140,), 155)

# Create an ML model

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [23]:
# Tune the number of trees with a 5-fold cross-validated grid search.
model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Candidate forest sizes to evaluate.
param_grid = [
    {'n_estimators': [1, 3, 5, 10, 20, 30, 40, 50, 60, 80, 90, 100]},
]
gs = GridSearchCV(estimator=model, param_grid=param_grid, cv=5).fit(x, y)

# Best-scoring number of estimators found by the search.
n = gs.best_params_['n_estimators']
n

100

# Analyze feature importance

In [24]:
# Refit a forest with the selected number of trees on the full dataset.
model = RandomForestRegressor(n_estimators=n, random_state=42, n_jobs=-1).fit(x, y)

# Rank the genes by the fitted model's feature importance, highest first,
# and show the ten most important ones.
importances = pd.Series(data=model.feature_importances_, index=genes)
top = importances.sort_values(ascending=False)
top.head(10)

TP53       0.055408
C5orf42    0.053313
SI         0.046321
OR5AK2     0.039489
MAP2K4     0.034253
OLFML2A    0.030662
PIK3CA     0.029774
MST1P9     0.025226
CAD        0.024349
MYH7       0.017562
dtype: float64