# TCGA: Pre-processing DNA mutations

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sb
import sklearn as sk
import statsmodels as sm
import sys
import os

from pathlib import Path

In [3]:
# Some path utilities
def ls(path):
    return [p for p in path.iterdir()]


def find_files(path, pattern=None):
    """ Recursively find all files """
    all_files = list()
    if path.is_dir():
        all_files.extend([f for p in ls(path) for f in find_files(p)])
    else:
        all_files.append(path)
    if pattern is not None:
        all_files = [f for f in all_files if f.match(pattern)]
    return all_files

# OK, let's do some duck-typing
Path.ls = ls
Path.find_files = find_files

# Which variants are expected to be 'high impact'?
high_impact_variants = set(['Frame_Shift_Del', 'Frame_Shift_Ins', 'In_Frame_Del'
                   , 'In_Frame_Ins', 'Indel', 'Intron', 'Missense'
                   , 'Missense_Mutation', 'Nonsense_Mutation', 'Nonstop_Mutation'
                   , 'Read-through', 'Splice_Site', 'Splice_Site_Del'
                   , 'Splice_Site_Ins', 'Splice_Site_SNP', 'Translation_Start_Site'])


def path_to_sample(path):
    """ Return the sample name from MAF file path """
    return path.stem.split('.')[0]

# A simple test case
assert 'TCGA-IH-A3EA-01' == path_to_sample(Path('data/tcga/gdac.broadinstitute.org_SKCM.Mutation_Packager_Calls.Level_3.2016012800.0.0/TCGA-IH-A3EA-01.maf.txt'))


def path_to_cancer_type(path):
    """ Return the cancre type MAF file path 
    For example, path='data/tcga/gdac.broadinstitute.org_SKCM.Mutation_Packager_Calls.Level_3.2016012800.0.0/TCGA-IH-A3EA-01.maf.txt'
    we want to return 'SKCM'
    """
    return path.parent.stem.split('.')[2].split('_')[1]

# A simple test case
assert 'SKCM' == path_to_cancer_type(Path('data/tcga/gdac.broadinstitute.org_SKCM.Mutation_Packager_Calls.Level_3.2016012800.0.0/TCGA-IH-A3EA-01.maf.txt'))


def mutated_genes(maf_file):
    """ Parse the MAF file and return a dictionary of genes with
    the count of 'high impact' mutations per gene """
    # print(f"Mutated genes: '{maf_file}'")
    try:
        df = pd.read_csv(maf_file, sep='\t', low_memory=False)
    except UnicodeDecodeError:
        print(f"ERROR reading file '{maf_file}'")
        return None
    keep_rows = [vc in high_impact_variants for vc in df.Variant_Classification]
    count_by_gene = dict()
    for gene in df[keep_rows].Hugo_Symbol:
        count_by_gene[gene] = count_by_gene.get(gene, 0) + 1
    return count_by_gene


def process_maf_dir(path):
    """ Find all MAF files in 'path' and get a dictionary of
    mutated genes per cancer_type and sample """
    by_cancer_type = dict()
    for p in path.find_files('*maf.txt'):
        cancer_type = path_to_cancer_type(p)
        if cancer_type not in by_cancer_type:
            print(f"Adding cancer type '{cancer_type}'")
            by_cancer_type[cancer_type] = dict()
        sample = path_to_sample(p)
        by_cancer_type[cancer_type][sample] = mutated_genes(p)
    return by_cancer_type


def get_all_genes(by_sample):
    """ Return a (sorted) list of genes in all samples """
    all_genes = set([g for gene_dict in by_sample.values() if gene_dict is not None for g in gene_dict.keys()])
    all_genes = list(all_genes)
    all_genes.sort()
    return all_genes


def variants_df(by_sample):
    """ Create a dataframe containing number of variants per sample """
    df = pd.DataFrame(by_sample, index=get_all_genes(by_sample), columns=sorted(list(by_sample.keys())))
    return df.fillna(0)

# Load and pre-process DNA mutations
Load all mutations, filter them and convert them in a single dataframe per cancer type

In [None]:
path = Path('data/tcga')
sample_by_cancer_type = process_maf_dir(path)

### Convert all 

In [4]:
df_by_cancer_type = dict()
for cancer_type in sample_by_cancer_type.keys():
    print(f"Cancer type: {cancer_type}")
    by_sample = sample_by_cancer_type[cancer_type]
    df_by_cancer_type[cancer_type] = variants_df(sample_by_cancer_type[cancer_type])
    file = f"{path}/{cancer_type}.mutations_by_gene.csv"
    print(f"Saving to file '{file}'")
    df_by_cancer_type[cancer_type].to_csv(file)

Adding cancer type 'PAAD'
Adding cancer type 'ACC'
Adding cancer type 'LUAD'
Adding cancer type 'READ'
Adding cancer type 'LUSC'
Adding cancer type 'ESCA'
Adding cancer type 'GBMLGG'
Adding cancer type 'STAD'
Adding cancer type 'OV'
Adding cancer type 'DLBC'
Adding cancer type 'LGG'
Adding cancer type 'UCEC'
Adding cancer type 'PRAD'
Adding cancer type 'CESC'
Adding cancer type 'COADREAD'
Adding cancer type 'UVM'
Adding cancer type 'GBM'
Adding cancer type 'THYM'
Adding cancer type 'PCPG'
Adding cancer type 'STES'
Adding cancer type 'KIRC'
Adding cancer type 'UCS'
Adding cancer type 'SKCM'
ERROR reading file 'data/tcga/gdac.broadinstitute.org_SKCM.Mutation_Packager_Calls.Level_3.2016012800.0.0/TCGA-IH-A3EA-01.maf.txt'
Adding cancer type 'BRCA'
Adding cancer type 'TGCT'
Adding cancer type 'CHOL'
Adding cancer type 'LIHC'
Adding cancer type 'HNSC'
Adding cancer type 'LAML'
Adding cancer type 'KIPAN'
Adding cancer type 'COAD'
Adding cancer type 'KICH'
Adding cancer type 'SARC'
Adding canc