# Supplementary tables

In [None]:
import os
import json

# All tables produced by this notebook are written under ./tables.
os.makedirs('tables', exist_ok=True)

# `tumors` is the JSON-decoded content of list_ttypes.json; later cells use it
# to select which tumor types are reported.
with open('list_ttypes.json', 'r') as ttypes_file:
    tumors = json.load(ttypes_file)

## Table 1

In [None]:
import pandas as pd
from scripts.cohorts import TTYPES

# Per-sample tracksheets: headerless TSV files read as (sample, file, nmuts).
tcga_samplesheet = '../mutations/tcga/samples/sample_tracksheet.tsv.gz'
icgc_samplesheet = '../mutations/icgc/samples/sample_tracksheet.tsv.gz'
tracksheet_columns = ['sample', 'file', 'nmuts']

# Every TCGA sample is labelled with the fixed project name '505'.
df_tcga = pd.read_csv(tcga_samplesheet, sep='\t', names=tracksheet_columns)
df_tcga['project'] = '505'

# For ICGC the project is parsed from the file name: the text after the first
# '-' and before the following '.'.
df_icgc = pd.read_csv(icgc_samplesheet, sep='\t', names=tracksheet_columns)
df_icgc['project'] = df_icgc['file'].apply(
    lambda fname: fname.split('-')[1].split('.')[0]
)

# The tumor type is the file name up to its first '.'; keep only the tumor
# types listed in `tumors`.
df = pd.concat([df_tcga, df_icgc])
df['ttype'] = df['file'].apply(lambda fname: fname.split('.')[0])
df = df[df['ttype'].isin(tumors)]

# One row per tumor type: project of its first sample, display name from
# TTYPES, number of samples and total number of mutations.
with open('tables/t1.tsv', 'wt') as outfile:
    outfile.write('Cohort\tProject\tTumor Name\tSamples\tMutations\n')
    for ttype, data in df.groupby(by='ttype'):
        fields = (
            ttype,
            data['project'].iloc[0],
            TTYPES[ttype],
            len(data),
            data['nmuts'].sum(),
        )
        outfile.write('\t'.join(str(field) for field in fields) + '\n')

## Table 2

### Cohorts zoomout

In [None]:
from scripts.cohorts import generate_table

def _collect_obsexp(project, analysis):
    """Map cohort name -> path of its existing ``obsexp.tsv`` for one project.

    Args:
        project: project folder under ``../mutations`` ('tcga' or 'icgc').
        analysis: results sub-folder inside the project's ``cohorts`` folder
            (e.g. 'increase_zoomout').

    Returns:
        dict whose keys are the entries of ``../mutations/<project>/cohorts``
        with '.tsv.gz' stripped out, and whose values are the matching
        ``<analysis>/<name>/obsexp.tsv`` paths. Entries without a results
        file on disk are left out.
    """
    found = {}
    for entry in os.listdir('../mutations/{}/cohorts'.format(project)):
        name = entry.replace('.tsv.gz', '')
        path = '../mutations/{}/cohorts/{}/{}/obsexp.tsv'.format(project, analysis, name)
        if os.path.exists(path):
            found[name] = path
    return found


tcga = _collect_obsexp('tcga', 'increase_zoomout')
icgc = _collect_obsexp('icgc', 'increase_zoomout')
# When a cohort name exists in both projects, the ICGC path wins.
cohorts = {**tcga, **icgc}

# generate_table is the project helper imported from scripts.cohorts; it is
# given the name -> path mapping together with the selected tumor types.
df = generate_table(cohorts, tumors)
df.to_csv('tables/t2_cohorts_zoomout.tsv', sep='\t', index=False, header=True)

### Cohorts zoomin

In [None]:
from scripts.cohorts import generate_table

def _collect_obsexp(project, analysis):
    """Map cohort name -> path of its existing ``obsexp.tsv`` for one project.

    Args:
        project: project folder under ``../mutations`` ('tcga' or 'icgc').
        analysis: results sub-folder inside the project's ``cohorts`` folder
            (e.g. 'increase_zoomin').

    Returns:
        dict whose keys are the entries of ``../mutations/<project>/cohorts``
        with '.tsv.gz' stripped out, and whose values are the matching
        ``<analysis>/<name>/obsexp.tsv`` paths. Entries without a results
        file on disk are left out.
    """
    found = {}
    for entry in os.listdir('../mutations/{}/cohorts'.format(project)):
        name = entry.replace('.tsv.gz', '')
        path = '../mutations/{}/cohorts/{}/{}/obsexp.tsv'.format(project, analysis, name)
        if os.path.exists(path):
            found[name] = path
    return found


tcga = _collect_obsexp('tcga', 'increase_zoomin')
icgc = _collect_obsexp('icgc', 'increase_zoomin')
# When a cohort name exists in both projects, the ICGC path wins.
cohorts = {**tcga, **icgc}

# generate_table is the project helper imported from scripts.cohorts; it is
# given the name -> path mapping together with the selected tumor types.
df = generate_table(cohorts, tumors)
df.to_csv('tables/t2_cohorts_zoomin.tsv', sep='\t', index=False, header=True)

### 505 signatures

In [None]:
from scripts.signatures import generate_table

# Each entry of the folder is treated as a signature name whose results are
# expected at <folder>/<signature>/obsexp.tsv.
folder = '../signatures/tcga_joined/increase'
expected_results = {
    signature: os.path.join(folder, signature, 'obsexp.tsv')
    for signature in os.listdir(folder)
}

# Keep only the signatures whose results file is actually present.
tcga = {
    signature: obsexp
    for signature, obsexp in expected_results.items()
    if os.path.exists(obsexp)
}

# generate_table is the project helper imported from scripts.signatures; the
# table it returns is tagged with the fixed project label before being saved.
df = generate_table(tcga)
df['Project'] = '505'
df.to_csv('tables/t2_signatures505.tsv', sep='\t', index=False, header=True)

### PanCanAtlas Signatures

In [None]:
from scripts.signatures import generate_table

# Each entry of the folder is treated as a signature name whose results are
# expected at <folder>/<signature>/obsexp.tsv.
folder = '../signatures/pancanatlas_joined/increase'
expected_results = {
    signature: os.path.join(folder, signature, 'obsexp.tsv')
    for signature in os.listdir(folder)
}

# Keep only the signatures whose results file is actually present.
tcga = {
    signature: obsexp
    for signature, obsexp in expected_results.items()
    if os.path.exists(obsexp)
}

# generate_table is the project helper imported from scripts.signatures; the
# table it returns is tagged with the fixed project label before being saved.
df = generate_table(tcga)
df['Project'] = 'PANCANATLAS'
df.to_csv('tables/t2_signaturesPanCanAtlas.tsv', sep='\t', index=False, header=True)

### Samples

In [None]:
import pandas as pd
from scripts.cohorts import TTYPES
from scripts.samples import generate_table

# Per-sample results live at <folder>/<tumor type>/<sample>/obsexp.tsv.
folders = ['../mutations/tcga/samples/increase', '../mutations/icgc/samples/increase']

# Map sample name -> results file, restricted to the tumor types in `tumors`.
# A sample name seen more than once keeps the last path found.
samples = {}
for folder in folders:
    for ctype in os.listdir(folder):
        if ctype not in tumors:
            continue
        ctype_folder = os.path.join(folder, ctype)
        for sample in os.listdir(ctype_folder):
            obsexp = os.path.join(ctype_folder, sample, 'obsexp.tsv')
            if not os.path.exists(obsexp):
                continue
            samples[sample] = obsexp

# Stack the TCGA and ICGC tracksheets (headerless TSV: name, file, muts).
tracksheet_paths = [
    '../mutations/tcga/samples/sample_tracksheet.tsv.gz',
    '../mutations/icgc/samples/sample_tracksheet.tsv.gz',
]
muts_info = [
    pd.read_csv(path, sep='\t', names=['name', 'file', 'muts'])
    for path in tracksheet_paths
]
tracksheets = pd.concat(muts_info, ignore_index=True)

# The tumor name is the file name up to its first '.'.
tracksheets['tumor_name'] = tracksheets['file'].apply(lambda fname: fname.split('.')[0])

# generate_table is the project helper imported from scripts.samples.
df = generate_table(samples, tracksheets, TTYPES)
df.to_csv('tables/t2_samples.tsv', sep='\t', index=False, header=True)