Notebook used to download gene expression data from the GDC.

In [1]:
import io
import json
import os
import re
import shutil
import subprocess
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import requests

# Root folder that receives one sub-folder per primary site.
datapath = Path("..") / "data"

# Limit how many DataFrame rows are rendered in cell outputs.
pd.options.display.max_rows = 50

# Select primary site

Uncomment the desired primary site

In [48]:
# ---------------------------------------------------------------------------
# Site selection: keep exactly ONE `primary_site = ...` assignment active.
# The trailing counts are the replicate numbers noted for each site; the
# indented lines break a site down by TCGA project (disease type in brackets).
# ---------------------------------------------------------------------------

# primary_site = "Kidney"           # 126 replicates
#     TCGA-KIRC 72 replicates (Adenomas and Adenocarcinomas)
#     TCGA-KIRP 31 replicates (Adenomas and Adenocarcinomas)
#     TCGA-KICH 23 replicates (Adenomas and Adenocarcinomas)

# primary_site = "Breast"           # 112 replicates
#     TCGA-BRCA 111 replicates (Ductal and Lobular Neoplasms)

# primary_site = "Lung"             # 106 replicates
#     TCGA-LUAD 52 replicates (Adenomas and Adenocarcinomas)
#     TCGA-LUSC 49 replicates (Squamous Cell Neoplasms)

# primary_site = "Thyroid"          # TCGA-THCA 58 replicates (Adenomas and Adenocarcinomas)

# primary_site = "Prostate"         # 52 replicates
#     TCGA-PRAD 51 replicates (Adenomas and Adenocarcinomas)

# primary_site = "Colorectal"       # 50 replicates
#     TCGA-COAD 39 replicates (Adenomas and Adenocarcinomas)

primary_site = "Liver"              # 50 replicates
#     TCGA-LIHC 50 replicates (Adenomas and Adenocarcinomas)

# primary_site = "Stomach"          # 33 replicates
# primary_site = "Uterus"           # 23 replicates
# primary_site = "Bladder"          # 19 replicates
# primary_site = "Esophagus"        # 8 replicates
# primary_site = "Bile Duct"        # 8 replicates
# primary_site = "Pancreas"         # 4 replicates
# primary_site = "Adrenal Gland"    # 3
# primary_site = "Cervix"           # 3
# primary_site = "Soft Tissue"      # 2
# primary_site = "Thymus"           # 2
# primary_site = "Brain"            # 0
# primary_site = "Ovary"            # 0
# primary_site = "Bone Marrow"      # 0
# primary_site = "Skin"             # 0
# primary_site = "Nervous System"   # 0
# primary_site = "Bone"             # 0
# primary_site = "Eye"              # 0
# primary_site = "Lymph Nodes"      # 0
# primary_site = "Pleura"           # 0
# primary_site = "Testis"           # 0

# ---------------------------------------------------------------------------
# Run options
# ---------------------------------------------------------------------------
# Re-download and overwrite existing files if present
overwrite = False
# Filter low expression genes using edgeR
filterByExpr = True
# Delete downloaded files after merging
cleanup = True

In [49]:
# One output folder per primary site, named after the lower-cased site.
outfolder = Path(datapath, primary_site.lower())

# Create the folder with pathlib rather than `!mkdir -p $outfolder`: the shell
# splits unquoted paths on whitespace, so a site such as "Bile Duct" would
# create two wrong directories instead of one.
outfolder.mkdir(parents=True, exist_ok=True)

# The merged count matrix is written to <outfolder>/<site>.csv
outname = primary_site.lower() + ".csv"
outfile = Path(outfolder, outname)
print("Storing gene expression data in", outfile)

Storing gene expression data in ../data/liver/liver.csv


# Fetch IDs for matching N and T samples

We start by fetching patient metadata using the GDC API.

In [50]:
# Columns requested from the GDC `files` endpoint for every matching file.
fields = [
    "file_name",
    "cases.submitter_id",
    "cases.samples.sample_type",
    "cases.disease_type",
    "cases.project.project_id",
    "cases.demographic.gender",
    "cases.demographic.vital_status",
    "cases.demographic.days_to_birth",
    "cases.demographic.days_to_death",
    "cases.demographic.race",
    "cases.demographic.ethnicity",
    "cases.exposures.alcohol_intensity",
    "cases.exposures.alcohol_history",
    "cases.exposures.years_smoked"
    ]

# The API expects the field list as one comma-separated string.
fields = ",".join(fields)

files_endpt = "https://api.gdc.cancer.gov/files"

# Every entry becomes an "in" filter; all of them are nested under a single
# 'and' operator.
filter_values = {
    "cases.project.primary_site": [primary_site],
    "files.experimental_strategy": ["RNA-Seq"],
    "files.data_format": ["TSV"],
    "files.data_category": ["Transcriptome Profiling"],
}
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field, "value": value}}
        for field, value in filter_values.items()
    ],
}

# A POST is used, so the filter parameters can be passed directly as a Dict object.
params = {
    "filters": filters,
    "fields": fields,
    "format": "TSV",
    "size": "200000"
    }

# The parameters are passed to 'json' rather than 'params' in this case
response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)

# Fail here on an HTTP error instead of handing an error body to the TSV
# parser in the next cell, where it would surface as a confusing KeyError.
response.raise_for_status()

r = response.content.decode("utf-8")
print(len(r))

208035


In [51]:
# Parse the TSV response and keep only the STAR gene-count files.
data = io.StringIO(r)
df = pd.read_csv(data, sep="\t")
df = df[df["file_name"].str.endswith("augmented_star_gene_counts.tsv")]

# Keep one normal and one tumor file per case; further files of the same
# sample type for a case (technical replicates) are dropped.
df_N = (
    df[df["cases.0.samples.0.sample_type"] == "Solid Tissue Normal"]
    .drop_duplicates(subset="cases.0.submitter_id")
)
df_T = (
    df[df["cases.0.samples.0.sample_type"] == "Primary Tumor"]
    .drop_duplicates(subset="cases.0.submitter_id")
)

# Cases that have both a tumor and a normal file.
NT = set(df_T["cases.0.submitter_id"]).intersection(df_N["cases.0.submitter_id"])
print("Biological replicates:", len(NT), "\n")

df_N = df_N[df_N["cases.0.submitter_id"].isin(NT)]
df_T = df_T[df_T["cases.0.submitter_id"].isin(NT)]
df_NT = pd.concat([df_N, df_T])

# Sanity checks: two rows per case, and as many normal rows as tumor rows.
assert len(df_NT["cases.0.submitter_id"].unique()) == len(df_NT) // 2
assert (
    (df_NT["cases.0.samples.0.sample_type"] == "Solid Tissue Normal").sum()
    == (df_NT["cases.0.samples.0.sample_type"] == "Primary Tumor").sum()
)

display(df_NT["cases.0.disease_type"].value_counts())
print("")
display(df_NT["cases.0.project.project_id"].value_counts())

Biological replicates: 50 



Adenomas and Adenocarcinomas    100
Name: cases.0.disease_type, dtype: int64




TCGA-LIHC    100
Name: cases.0.project.project_id, dtype: int64

In [52]:
# Order rows by case, then by sample type, so that each case occupies two
# consecutive rows ("Primary Tumor" sorts before "Solid Tissue Normal").
df_NT = (
    df_NT
    .sort_values(by=["cases.0.submitter_id", "cases.0.samples.0.sample_type"])
    .reset_index(drop=True)
)

# File UUIDs in df_NT row order; these are the files requested for download.
ids = df_NT["id"].tolist()

# Shorten the column names to the part after the last dot.
df_NT.columns = [name.rsplit(".", 1)[-1] for name in df_NT.columns]
df_NT.head()

Unnamed: 0,days_to_birth,days_to_death,ethnicity,gender,race,vital_status,disease_type,alcohol_history,alcohol_intensity,years_smoked,project_id,sample_type,submitter_id,file_name,id
0,-26400.0,1135.0,not hispanic or latino,female,white,Dead,Adenomas and Adenocarcinomas,Not Reported,,,TCGA-LIHC,Primary Tumor,TCGA-BC-A10Q,e576f331-424e-43f0-af27-20e07d91c826.rna_seq.a...,df736f4c-b9d6-4805-bd7d-7bc6d92174d7
1,-26400.0,1135.0,not hispanic or latino,female,white,Dead,Adenomas and Adenocarcinomas,Not Reported,,,TCGA-LIHC,Solid Tissue Normal,TCGA-BC-A10Q,14fc153e-3cb3-4d3f-8259-b73a436f3d0b.rna_seq.a...,6e812349-35e7-44e7-ac7c-a4bc19d81fd0
2,-24274.0,308.0,not hispanic or latino,female,white,Dead,Adenomas and Adenocarcinomas,Not Reported,,,TCGA-LIHC,Primary Tumor,TCGA-BC-A10R,01a67ac4-91db-4a3d-9137-fe2240acc861.rna_seq.a...,d80985cc-be01-452c-931c-65fccbe783d9
3,-24274.0,308.0,not hispanic or latino,female,white,Dead,Adenomas and Adenocarcinomas,Not Reported,,,TCGA-LIHC,Solid Tissue Normal,TCGA-BC-A10R,c0b92413-c9f7-4a67-9ca7-cb065ac9c407.rna_seq.a...,d139256a-c9e8-417b-90af-c466a58d459d
4,-27944.0,837.0,not hispanic or latino,male,white,Dead,Adenomas and Adenocarcinomas,Not Reported,,,TCGA-LIHC,Primary Tumor,TCGA-BC-A10T,533c5b0b-6eda-46c5-9cf0-8c6decb728c9.rna_seq.a...,35c358f0-4afc-4bd3-a0f4-2741134d6f35


Downloading does not preserve the row order of `df_NT`, so the matching N and T samples have to be re-paired afterwards:

- Sort `df_NT` by its `id` column (not in place) and save the resulting index order in ``sorted_id_indices``
- After downloading the files, sort `MANIFEST.txt` by `id`
- Replace the manifest index with ``sorted_id_indices`` and sort by index
- The manifest rows are now in the same order as `df_NT`, so matching NT pairs are grouped together again

In [53]:
# View of df_NT ordered by file id; its index holds, for every id in sorted
# order, the row position that id has in df_NT.
df_NT_by_id = df_NT.sort_values(by="id")
sorted_id_indices = df_NT_by_id.index
df_NT_by_id.head()

Unnamed: 0,days_to_birth,days_to_death,ethnicity,gender,race,vital_status,disease_type,alcohol_history,alcohol_intensity,years_smoked,project_id,sample_type,submitter_id,file_name,id
85,-26985.0,,not hispanic or latino,male,white,Alive,Adenomas and Adenocarcinomas,Not Reported,,,TCGA-LIHC,Solid Tissue Normal,TCGA-EP-A3RK,28e28451-9c55-4f01-9ac1-fda75a6f828d.rna_seq.a...,0444bb3d-0844-4333-97b0-e71faca95abb
55,-26009.0,1005.0,not hispanic or latino,female,white,Dead,Adenomas and Adenocarcinomas,Not Reported,,,TCGA-LIHC,Solid Tissue Normal,TCGA-DD-A1EJ,c9432a6e-4d79-4de2-8ffa-e3b05129f78f.rna_seq.a...,0940bd97-2818-48f6-a0a5-f955152409b8
19,-22968.0,,not reported,female,white,Alive,Adenomas and Adenocarcinomas,Not Reported,,,TCGA-LIHC,Solid Tissue Normal,TCGA-BC-A216,1432142c-908d-46d7-9a2a-2597a0f00ae8.rna_seq.a...,095b68ff-e317-4667-b1d1-ff69254d146a
27,-15410.0,1149.0,not hispanic or latino,male,black or african american,Dead,Adenomas and Adenocarcinomas,Not Reported,,,TCGA-LIHC,Solid Tissue Normal,TCGA-DD-A114,8e595756-ee6b-4fd9-9844-6001ea97fa2e.rna_seq.a...,0c11ccb8-4405-4a2a-9180-b483fb845968
5,-27944.0,837.0,not hispanic or latino,male,white,Dead,Adenomas and Adenocarcinomas,Not Reported,,,TCGA-LIHC,Solid Tissue Normal,TCGA-BC-A10T,ebbdae1d-a199-4577-a73a-77e9c6a55a92.rna_seq.a...,0dbea4cc-32b6-42af-b4f0-fa0a67b05a7f


# Download the files

We proceed by downloading the gene expression data for our selected samples

In [54]:
# The download and merge steps are skipped when the merged file is already
# on disk (unless `overwrite` is set).
file_already_exists = outfile.is_file()
print("File already exists:", file_already_exists)

# Scratch folder for the downloaded archive and the unpacked count files.
# Created with pathlib rather than `!mkdir -p $tmp` because the shell splits
# unquoted paths on whitespace, which breaks sites such as "Bile Duct".
tmp = Path(outfolder, "tmp")
tmp.mkdir(parents=True, exist_ok=True)
print(f"Storing temporary files in {tmp}")

File already exists: True
Storing temporary files in ../data/liver/tmp


In [55]:
if overwrite or not file_already_exists:

    # Download the data: one archive containing all requested files
    data_endpt = "https://api.gdc.cancer.gov/data"
    params = {"ids": ids}

    response = requests.post(data_endpt,
                             json = params,
                             headers = {"Content-Type": "application/json"})

    # Stop on an HTTP error instead of writing an error body to disk and
    # failing later on a missing header or a corrupt archive.
    response.raise_for_status()

    response_head_cd = response.headers["Content-Disposition"]

    filename = Path(tmp, re.findall("filename=(.+)", response_head_cd)[0])

    with open(filename, "wb") as output_file:
        output_file.write(response.content)

    # Unpack the data. Passing the arguments as a list avoids shell word
    # splitting on paths that contain spaces, and check=True raises if tar
    # fails instead of silently continuing.
    subprocess.run(["tar", "-xzf", str(filename), "-C", str(tmp)], check=True)

# Sort manifest to get matching NT pairs like in df_NT.
# The manifest is in tmp right after a download; the cleanup step moves it to
# the output folder, which is where it is found on later runs.
try:
    man = pd.read_csv(Path(tmp, "MANIFEST.txt"),sep="\t")
except FileNotFoundError:
    man = pd.read_csv(Path(outfolder, "MANIFEST.txt"),sep="\t")

man = man.sort_values(by="id")
man.index = sorted_id_indices
man = man.sort_index()

# The re-indexing above only pairs the rows correctly if the manifest lists
# exactly the ids that were requested. Verify this, so that a stale manifest
# cannot silently mislabel samples.
assert man["id"].tolist() == ids, "MANIFEST.txt does not match the requested file ids"

man.head()

Unnamed: 0,id,filename,md5,size,state
0,df736f4c-b9d6-4805-bd7d-7bc6d92174d7,df736f4c-b9d6-4805-bd7d-7bc6d92174d7/e576f331-...,8e0c59d0d0efd855bdd59fdd4e84e2d0,4223797,validated
1,6e812349-35e7-44e7-ac7c-a4bc19d81fd0,6e812349-35e7-44e7-ac7c-a4bc19d81fd0/14fc153e-...,6eef53bb76ba2ab3fc3842fb29cfae88,4201688,validated
2,d80985cc-be01-452c-931c-65fccbe783d9,d80985cc-be01-452c-931c-65fccbe783d9/01a67ac4-...,65cd2791cb00e320933c2511b66dd7cf,4220262,validated
3,d139256a-c9e8-417b-90af-c466a58d459d,d139256a-c9e8-417b-90af-c466a58d459d/c0b92413-...,8e93ddc7538d8b7aaa829d040c7e4d25,4209435,validated
4,35c358f0-4afc-4bd3-a0f4-2741134d6f35,35c358f0-4afc-4bd3-a0f4-2741134d6f35/533c5b0b-...,69bc0158cc1444d14c2007cf219b12b2,4227337,validated


# Formatting and cleanup

In [56]:
if overwrite or not file_already_exists:

    # Collect one count column per file and build the frame once at the end.
    # Inserting columns one by one into a DataFrame fragments it and triggers
    # pandas' PerformanceWarning once there are many columns.
    counts = {}

    for i, filename in enumerate(man["filename"]):

        f = pd.read_csv(Path(tmp, filename), sep="\t", header=[1], index_col=0)

        # `man` follows the df_NT order: even rows are tumor files, odd rows
        # the matching normal files, so rows 2k and 2k+1 form pair k+1.
        sample_type = "T" if i % 2 == 0 else "N"
        sample_number = i // 2 + 1

        counts[f"{sample_type}{sample_number}"] = f["unstranded"]

        if i % 20 == 0:
            print(i, "files processed")

    # Columns keep their insertion order (T1, N1, T2, N2, ...).
    # NOTE(review): assumes every file has the same row index; if they ever
    # differ, this yields the union of rows — confirm that is acceptable.
    df_tmp = pd.DataFrame(counts)

0 files processed
20 files processed
40 files processed
60 files processed
80 files processed


In [133]:
# Summary rows at the top of each count file that are not genes.
rows_to_remove = ["N_unmapped","N_multimapping","N_noFeature","N_ambiguous"]

# Path of the metadata CSV that sits next to the count matrix. It is defined
# here because the "load existing file" branch below reads it; elsewhere in
# the notebook it is only assigned when new data are saved, which left the
# name undefined on a fresh kernel whenever the output file already existed.
meta_outfile = outfile.with_name(outfile.stem + "_meta.csv")

if overwrite or not file_already_exists:
    df_out = df_tmp.drop(rows_to_remove, axis=0)
else:
    print("Loading existing file:", outfile)
    df_out = pd.read_csv(outfile, index_col = 0)
    df_meta = pd.read_csv(meta_outfile, index_col=0)

# Final column layout: all normals first, then all tumors (N1..Nn, T1..Tn).
replicates = len(df_out.columns)//2
cols = [f"N{i}" for i in range(1,replicates+1)] + [f"T{i}" for i in range(1,replicates+1)]

Loading existing file: ../data/liver/liver.csv


In [58]:
# Labels in df_NT row order: T1, N1, T2, N2, ...
NT_cols = [f'T{i//2 + 1}' if i%2==0 else f'N{(i+1)//2}' for i in range(len(man))] # sorted like df_NT

# Work on a copy: without it, `sample_metadata` is just another name for
# df_NT, so the edits below would change df_NT's columns and index and break
# earlier cells (e.g. the one computing `sorted_id_indices`) on a re-run.
sample_metadata = df_NT.copy()
sample_metadata["case"] = [c[0] for c in NT_cols]
sample_metadata.index = NT_cols

# NOTE: sort_index orders the string labels lexicographically (N1, N10, N11,
# ..., N2, ...), so assigning `cols` afterwards renumbers the pairs. The count
# matrix is relabelled the same way (sort_index(axis=1), then `cols`), so this
# must stay as is for metadata and counts to describe the same samples.
sample_metadata = sample_metadata.sort_index()
sample_metadata.index = cols

# The k-th normal and the k-th tumor row must belong to the same case.
assert np.all(sample_metadata[:replicates]["submitter_id"].values == sample_metadata[replicates:]["submitter_id"].values)
assert np.all(sample_metadata[:replicates]["disease_type"].values == sample_metadata[replicates:]["disease_type"].values)

# Transpose so that samples are columns, matching the count matrix layout.
sample_metadata = sample_metadata.T
print(sample_metadata.shape)
sample_metadata.head()

(16, 100)


Unnamed: 0,N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,...,T41,T42,T43,T44,T45,T46,T47,T48,T49,T50
days_to_birth,-26400.0,-22968.0,-25489.0,-27756.0,-20188.0,-15410.0,-24934.0,-28406.0,-14613.0,-24646.0,...,,-28101.0,,-27530.0,-18562.0,-19473.0,-19140.0,-28049.0,-22913.0,-18951.0
days_to_death,1135.0,,,,,1149.0,1622.0,,223.0,,...,581.0,,247.0,194.0,91.0,,770.0,711.0,34.0,2116.0
ethnicity,not hispanic or latino,not reported,not reported,not hispanic or latino,not hispanic or latino,not hispanic or latino,not hispanic or latino,not hispanic or latino,not hispanic or latino,not hispanic or latino,...,not hispanic or latino,not hispanic or latino,not hispanic or latino,not hispanic or latino,not hispanic or latino,not hispanic or latino,not hispanic or latino,not hispanic or latino,not hispanic or latino,not hispanic or latino
gender,female,female,male,female,female,male,male,female,male,male,...,male,female,female,male,male,male,female,male,female,female
race,white,white,white,black or african american,white,black or african american,asian,white,asian,black or african american,...,white,white,white,white,asian,asian,white,white,white,black or african american


In [99]:
# Only row labels that still contain a "." need the relabelling below.
if "." in df_out.index[0]:
    # Put the columns in sorted label order, then rename them to `cols`.
    df_out = df_out.sort_index(axis=1)
    # Keep the part of each row label before the first "."
    df_out.index = [label.split(".")[0] for label in df_out.index]
    df_out.index.name = ""
    df_out.columns = cols

# Shortened labels can collide; keep the first row for each label.
dupes = df_out.index.duplicated(keep="first")
print(f"{dupes.sum()} duplicate indices dropped")
df_out = df_out.loc[~dupes]

print(df_out.shape)
df_out.head()

44 duplicate indices dropped
(60616, 100)


Unnamed: 0,N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,...,T41,T42,T43,T44,T45,T46,T47,T48,T49,T50
,,,,,,,,,,,,,,,,,,,,,
ENSG00000000003,4303.0,4847.0,4199.0,3059.0,3621.0,5268.0,5224.0,3008.0,4803.0,5692.0,...,8836.0,16693.0,7225.0,5569.0,4008.0,3752.0,5699.0,3954.0,5665.0,2321.0
ENSG00000000005,1.0,3.0,4.0,0.0,0.0,7.0,4.0,1.0,16.0,3.0,...,0.0,0.0,0.0,1.0,1.0,4.0,22.0,12.0,10.0,2.0
ENSG00000000419,802.0,964.0,1063.0,703.0,657.0,1399.0,1093.0,777.0,795.0,1043.0,...,1454.0,1419.0,866.0,2055.0,2124.0,909.0,1295.0,2241.0,1920.0,542.0
ENSG00000000457,411.0,263.0,485.0,256.0,396.0,577.0,479.0,279.0,353.0,600.0,...,777.0,543.0,673.0,620.0,2940.0,605.0,585.0,859.0,642.0,184.0
ENSG00000000460,69.0,113.0,100.0,78.0,131.0,95.0,77.0,82.0,303.0,285.0,...,367.0,132.0,319.0,219.0,1387.0,108.0,236.0,704.0,359.0,42.0


In [100]:
if overwrite or not file_already_exists:

    # Metadata goes next to the count matrix as <site>_meta.csv.
    # Built with the public pathlib API: the private `Path._str` attribute is
    # only populated after str() has been called on the path and is not part
    # of the documented interface.
    meta_outfile = outfile.with_name(outfile.stem + "_meta.csv")
    sample_metadata.to_csv(meta_outfile)
    print(f"Metadata saved in {meta_outfile}")

    # Save full count data (no genes filtered)
    outfile_full = Path(outfolder, f"{primary_site.lower()}.csv")
    df_out.to_csv(outfile_full)
    print(f"Full data saved in {outfile_full}")

Metadata saved in ../data/liver/liver_meta.csv
Full data saved in ../data/liver/liver.csv


In [61]:
if cleanup:
    print("Cleaning up tmp folder")

    # Keep the manifest: it is moved one level up, where the download cell
    # looks for it on later runs. It is only present in tmp after a fresh
    # download, so moving it is skipped otherwise.
    manifest = Path(tmp, "MANIFEST.txt")
    if manifest.is_file():
        manifest.replace(Path(tmp).parent / "MANIFEST.txt")

    # Remove the scratch folder with everything that was unpacked into it.
    # pathlib/shutil are used instead of `!mv` / `!rm -r` so that paths
    # containing spaces are handled correctly.
    if Path(tmp).exists():
        shutil.rmtree(tmp)

Cleaning up tmp folder


# Filter by expression

Use edgeR's `filterByExpr()` function to remove low-count genes.

In [121]:
# Path of the filtered count matrix, <site>_filtered.csv next to the full one.
# It is derived here so this cell also runs on a fresh kernel; it was
# previously displayed before any cell had assigned it.
outfile_filtered = outfile.with_name(outfile.stem + "_filtered.csv")
outfile_filtered

PosixPath('../data/liver/liver_filtered.csv')

In [119]:
import importlib

# Make the project's helper scripts importable (added only once, so re-running
# this cell does not keep growing sys.path).
modpath = Path("../scripts")
scripts_dir = os.path.abspath(modpath)
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

# Reload R_wrappers only if it was imported before, so that edits to the
# module are picked up. Reloading unconditionally raised a KeyError on a fresh
# kernel, where the module is not yet in sys.modules.
if "R_wrappers" in sys.modules:
    importlib.reload(sys.modules["R_wrappers"])

from R_wrappers import filterByExpr_wrapper

# NOTE(review): the `filterByExpr` flag from the configuration cell is not
# consulted here; filtering always runs. Confirm whether that is intended.

# <site>_filtered.csv next to the full count matrix. Built with the public
# pathlib API rather than the private `Path._str` attribute.
outfile_filtered = outfile.with_name(outfile.stem + "_filtered.csv")
filterByExpr_wrapper(inpath=str(outfile), outpath=outfile_filtered, design="none")

In [162]:
# Load the filtered matrix back in to check its size and first rows.
df_filt = pd.read_csv(
    outfile_filtered,
    index_col=0,
)
print(df_filt.shape)
df_filt.head(n=5)

(15263, 100)


Unnamed: 0,N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,...,T41,T42,T43,T44,T45,T46,T47,T48,T49,T50
ENSG00000000003,4303,4847,4199,3059,3621,5268,5224,3008,4803,5692,...,8836,16693,7225,5569,4008,3752,5699,3954,5665,2321
ENSG00000000419,802,964,1063,703,657,1399,1093,777,795,1043,...,1454,1419,866,2055,2124,909,1295,2241,1920,542
ENSG00000000457,411,263,485,256,396,577,479,279,353,600,...,777,543,673,620,2940,605,585,859,642,184
ENSG00000000460,69,113,100,78,131,95,77,82,303,285,...,367,132,319,219,1387,108,236,704,359,42
ENSG00000000938,293,231,1476,362,128,630,285,195,209,596,...,214,188,366,96,204,290,327,193,68,333


# Split by project_id and disease_type

In [151]:
# Filtered counts (genes x samples) and sample metadata (fields x samples).
df = pd.read_csv(outfile_filtered, index_col=0, header=0)
df_meta = pd.read_csv(meta_outfile, index_col=0, header=0)

# Report, per project, the number of samples and their disease types.
project_row = df_meta.loc["project_id"]
for project_id in set(project_row):
    # Column labels of the samples that belong to this project
    mask = project_row.index[project_row == project_id]
    df_p = df_meta.loc[:, mask]
    print(f"{project_id}: {len(df_p.columns)} samples")
    print(df_p.loc["disease_type"].value_counts(),"\n")

TCGA-LIHC: 100 samples
Adenomas and Adenocarcinomas    100
Name: disease_type, dtype: int64 



In [152]:
# Project-local helper.
# NOTE(review): presumably importable because ../scripts was appended to
# sys.path in an earlier cell — confirm.
from misc import add_metadata_to_multiindex

# Combine the count matrix with the sample metadata. The following cell reads
# the column levels "project_id", "disease_type" and "Sample" from the result.
df_merged = add_metadata_to_multiindex(df,df_meta)

In [153]:
project_ids = set(df_merged.columns.get_level_values(level="project_id"))

# Disease type to extract for each known project. None marks a project that
# is deliberately skipped.
disease_type_by_project = {
    **dict.fromkeys(
        ["TCGA-KIRC", "TCGA-KIRP", "TCGA-KICH", "TCGA-LUAD",
         "TCGA-THCA", "TCGA-PRAD", "TCGA-COAD", "TCGA-LIHC"],
        "Adenomas and Adenocarcinomas",
    ),
    "TCGA-BRCA": "Ductal and Lobular Neoplasms",
    "TCGA-LUSC": "Squamous Cell Neoplasms",
    "TCGA-READ": None,
}

# Iterate in sorted order so the run (and the value `outfile_project` holds
# after the loop) does not depend on set iteration order.
for project_id in sorted(project_ids):

    if project_id not in disease_type_by_project:
        raise Exception(f"Specify disease_type for {project_id}")

    disease_type = disease_type_by_project[project_id]
    if disease_type is None:
        continue

    project_id_abbrev = project_id[5:] # drop TCGA- prefix
    project_outfolder = Path(outfolder, project_id_abbrev)
    # pathlib instead of os.system("mkdir -p ..."): no shell is involved, so
    # folder names containing spaces are created correctly.
    project_outfolder.mkdir(parents=True, exist_ok=True)

    # Narrow the merged frame down to this project and disease type ...
    df_xs = df_merged.xs(project_id, level="project_id", axis=1)
    df_xs = df_xs.xs(disease_type, level="disease_type", axis=1)

    # ... and use the remaining sample labels to slice the plain count matrix.
    project_sample_columns = df_xs.columns.get_level_values(level="Sample")
    df_project = df[project_sample_columns]

    outfile_project = Path(project_outfolder, project_id_abbrev+".csv")
    df_project.to_csv(outfile_project)
    print(f"Data saved in {outfile_project}")

Data saved in ../data/liver/LIHC/LIHC.csv


In [157]:
# Read the per-project file written last by the loop above as a quick check.
df_project = pd.read_csv(
    outfile_project,
    index_col=0,
)
print(df_project.shape)
df_project.head(n=5)

(15263, 100)


Unnamed: 0,N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,...,T41,T42,T43,T44,T45,T46,T47,T48,T49,T50
ENSG00000000003,4303,4847,4199,3059,3621,5268,5224,3008,4803,5692,...,8836,16693,7225,5569,4008,3752,5699,3954,5665,2321
ENSG00000000419,802,964,1063,703,657,1399,1093,777,795,1043,...,1454,1419,866,2055,2124,909,1295,2241,1920,542
ENSG00000000457,411,263,485,256,396,577,479,279,353,600,...,777,543,673,620,2940,605,585,859,642,184
ENSG00000000460,69,113,100,78,131,95,77,82,303,285,...,367,132,319,219,1387,108,236,704,359,42
ENSG00000000938,293,231,1476,362,128,630,285,195,209,596,...,214,188,366,96,204,290,327,193,68,333
