# PyDESeq2 pipeline

This notebook gives a minimal example of how to perform differential expression analysis (DEA) using PyDESeq2.

It allows you to run the PyDESeq2 pipeline on one of the following TCGA datasets:
- TCGA-BRCA
- TCGA-COAD
- TCGA-LUAD
- TCGA-LUSC
- TCGA-PAAD
- TCGA-PRAD
- TCGA-READ
- TCGA-SKCM.

Running this pipeline takes a few minutes (~5-10 min) depending on your setup and on the chosen dataset.

In [1]:
import os
import pickle as pkl

import numpy as np
import pandas as pd

from pydeseq2.DeseqDataSet import DeseqDataSet
from pydeseq2.DeseqStats import DeseqStats
from pydeseq2.utils import load_data

In [2]:
# Set to True to pickle the fitted objects produced by this notebook to disk.
SAVE: bool = False

## Data loading

See the `datasets` readme for the required data organization. 

In [3]:
# Identifier of the TCGA cohort to analyse.
CANCER: str = "TCGA-COAD"

In [4]:
# Per-cohort output folder; it is created up front, which is a no-op when
# the folder already exists.
OUTPUT_PATH = f"../output_files/{CANCER}"
os.makedirs(name=OUTPUT_PATH, exist_ok=True)

In [5]:
# Load the raw count table of the selected cohort.
counts_df = load_data(modality="raw_counts", cancer_type=CANCER, debug=False)

In [6]:
# Load the clinical table of the same cohort.
clinical_df = load_data(modality="clinical", cancer_type=CANCER, debug=False)

In [7]:
# Preview the loaded count table: one row per sample, one column per gene.
counts_df

gene,5_8S_rRNA,5S_rRNA,7SK,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2ML1-AS1,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11AP1,ZYG11B,ZYX,ZYXP1,ZZEF1,ZZZ3
TCGA-D5-6530-01A-11R-1723-07,0,0,39,0,14,436,6349,14,0,0,...,247,839,2045,3,0,1128,9066,0,5605,1504
TCGA-G4-6320-01A-11R-1723-07,0,1,1314,3,8,325,2125,10,0,0,...,98,491,1451,2,0,796,8377,0,2134,1416
TCGA-AD-6888-01A-11R-1928-07,0,0,533,1,6,1518,2237,11,7,0,...,169,900,2135,0,0,1285,22128,0,2743,2054
TCGA-CK-6747-01A-11R-1839-07,0,0,49,3,15,2405,22747,33,5,0,...,171,808,1865,2,0,1535,10416,0,5070,2466
TCGA-AA-3975-01A-01R-1022-07,0,0,1913,1,7,634,5194,10,1,0,...,62,355,1003,1,0,540,5714,0,1428,724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-AA-3713-01A-21R-1723-07,0,0,75,5,35,1015,16679,7,4,0,...,66,803,2145,4,0,1483,9567,0,9745,2731
TCGA-5M-AAT4-01A-11R-A41B-07,0,0,189,1,10,820,5888,5,37,0,...,234,1171,1896,2,0,1099,10500,0,2748,1582
TCGA-AA-A00L-01A-01R-A002-07,0,0,24,1,0,622,2929,4,3,1,...,103,597,883,1,0,622,2990,0,1523,783
TCGA-A6-5657-01A-01R-A32Z-07,0,1,103,13,41,1305,41149,32,2,0,...,477,2685,1948,12,0,2174,32821,0,3366,2909


Remove samples for which `high_grade` is NaN.

In [8]:
# Boolean mask flagging the samples whose `high_grade` value is present,
# followed by the number of samples that pass.
samples_to_keep = clinical_df["high_grade"].notna()
samples_to_keep.sum()

494

In [9]:
# Apply the same row mask to both tables so they keep describing the same samples.
counts_df = counts_df.loc[samples_to_keep, :]
clinical_df = clinical_df.loc[samples_to_keep, :]

Filter out genes that have fewer than 10 counts in total.

In [10]:
# Keep the columns (genes) whose counts, summed over all remaining samples,
# reach at least 10; then report how many genes are left.
genes_to_keep = counts_df.columns[counts_df.sum(axis=0).ge(10)]
len(genes_to_keep)

50401

In [11]:
# Restrict the count table to the retained gene columns.
counts_df = counts_df.loc[:, genes_to_keep]

# I - Simple differential expression analysis

## 1 - Read counts modeling with the `DeseqDataSet` class

In [12]:
# Start by creating a DeseqDataSet from the filtered count table and the
# matching clinical table.
# NOTE(review): n_cpus=8 is hard-coded (the same value is passed to DeseqStats
# further down); presumably it sets the number of parallel workers — confirm
# and adjust to the machine running the notebook.
dds = DeseqDataSet(counts_df, clinical_df, n_cpus=8)

In [13]:
# Then, run DESeq2 on it.
# Per the log printed by this cell, the call fits size factors, dispersions,
# the dispersion trend curve, MAP dispersions and LFCs, and then refits the
# genes flagged as outliers. This is the slowest step of the notebook.
dds.deseq2()

Fitting size factors...
... done in 0.85 seconds.

Fitting dispersions...
... done in 28.02 seconds.

Fitting dispersion trend curve...
... done in 18.76 seconds.

Fitting MAP dispersions...
... done in 25.92 seconds.

Fitting LFCs...
... done in 12.68 seconds.

Refitting 9479 outliers.

Fitting dispersions...
... done in 4.89 seconds.

Fitting MAP dispersions...
... done in 3.97 seconds.

Fitting LFCs...
... done in 2.99 seconds.



In [14]:
# Optionally persist the fitted dataset object as a pickle in the output folder.
if SAVE:
    dds_pickle_path = os.path.join(OUTPUT_PATH, "py_dds.pkl")
    with open(dds_pickle_path, "wb") as handle:
        pkl.dump(dds, handle)

## 2 - Statistical analysis with the `DeseqStats` class

### Wald test

In [15]:
# Wrap the fitted dataset in a DeseqStats object for statistical testing.
# NOTE(review): n_cpus=8 is hard-coded, mirroring the DeseqDataSet call above.
stat_res = DeseqStats(dds, n_cpus=8)

In [16]:
# Per the log printed by this cell, this runs the Wald tests; the cell output
# is the per-gene results table (baseMean, log2FoldChange, lfcSE, stat,
# pvalue, padj).
stat_res.summary()

Running Wald tests...
... done in 19.87 seconds.



Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5_8S_rRNA,0.210920,0.599716,0.771179,0.777661,4.367686e-01,
5S_rRNA,0.816280,0.901406,0.329549,2.735273,6.232862e-03,4.535908e-02
7SK,1872.124702,4.063256,0.262331,15.489056,4.112820e-54,1.389681e-49
A1BG,3.171243,-0.018032,0.119103,-0.151395,8.796640e-01,9.439759e-01
A1BG-AS1,15.289902,-0.055029,0.098678,-0.557665,5.770730e-01,7.621659e-01
...,...,...,...,...,...,...
ZYG11AP1,0.075674,0.139540,2.391782,0.058342,9.534765e-01,
ZYG11B,973.020734,0.060948,0.062942,0.968327,3.328810e-01,5.564327e-01
ZYX,8122.521671,-0.039176,0.064795,-0.604609,5.454386e-01,7.388091e-01
ZZEF1,2618.836571,0.218351,0.079944,2.731306,6.308381e-03,4.576081e-02


In [17]:
# Optionally persist the statistics object (state after the Wald tests) as a pickle.
if SAVE:
    results_pickle_path = os.path.join(OUTPUT_PATH, "py_results.pkl")
    with open(results_pickle_path, "wb") as handle:
        pkl.dump(stat_res, handle)

### LFC shrinkage

In [18]:
# Per the log printed by this cell, this fits MAP LFCs. In the displayed table
# the log2FoldChange and lfcSE columns change compared with the Wald-test
# summary, while stat, pvalue and padj keep the same values.
stat_res.lfc_shrink()

Fitting MAP LFCs...
... done in 12.40 seconds.



Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5_8S_rRNA,0.210920,0.038630,0.129204,0.777661,4.367686e-01,
5S_rRNA,0.816280,0.704521,0.433331,2.735273,6.232862e-03,4.535908e-02
7SK,1872.124702,2.235193,0.242119,15.489056,4.112820e-54,1.389681e-49
A1BG,3.171243,-0.008822,0.084974,-0.151395,8.796640e-01,9.439759e-01
A1BG-AS1,15.289902,-0.033281,0.078245,-0.557665,5.770730e-01,7.621659e-01
...,...,...,...,...,...,...
ZYG11AP1,0.075674,0.000376,0.120715,0.058342,9.534765e-01,
ZYG11B,973.020734,0.047118,0.057112,0.968327,3.328810e-01,5.564327e-01
ZYX,8122.521671,0.021630,0.056939,-0.604609,5.454386e-01,7.388091e-01
ZZEF1,2618.836571,0.236818,0.081254,2.731306,6.308381e-03,4.576081e-02


In [19]:
# Optionally persist the statistics object again, now in its post-shrinkage state.
if SAVE:
    shrunk_pickle_path = os.path.join(OUTPUT_PATH, "py_shrunk_results.pkl")
    with open(shrunk_pickle_path, "wb") as handle:
        pkl.dump(stat_res, handle)

In [49]:
# Inspect the container type of `stat_res.LFCs`.
# NOTE(review): exploratory check with an out-of-order execution count —
# consider removing it or re-running the notebook top to bottom.
type(stat_res.LFCs)

pandas.core.frame.DataFrame

In [22]:
# Inspect the container type of `dds.genewise_dispersions`.
# NOTE(review): exploratory check with an out-of-order execution count.
type(dds.genewise_dispersions)

pandas.core.series.Series

In [26]:
# Inspect the container type of `dds.MAP_dispersions`.
# NOTE(review): exploratory check with an out-of-order execution count.
type(dds.MAP_dispersions)

pandas.core.series.Series

In [43]:
# Inspect the container type of `dds._LFC_converged`.
# NOTE(review): the leading underscore marks this attribute as private by
# Python convention, so it may not be part of the stable API — confirm before
# relying on it. Exploratory check with an out-of-order execution count.
type(dds._LFC_converged)

pandas.core.frame.DataFrame

In [51]:
# Display the results table stored on the statistics object. In the saved
# output it matches the table shown after LFC shrinkage.
stat_res.results_df

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5_8S_rRNA,0.210920,0.038630,0.129204,0.777661,4.367686e-01,
5S_rRNA,0.816280,0.704521,0.433331,2.735273,6.232862e-03,4.535908e-02
7SK,1872.124702,2.235193,0.242119,15.489056,4.112820e-54,1.389681e-49
A1BG,3.171243,-0.008822,0.084974,-0.151395,8.796640e-01,9.439759e-01
A1BG-AS1,15.289902,-0.033281,0.078245,-0.557665,5.770730e-01,7.621659e-01
...,...,...,...,...,...,...
ZYG11AP1,0.075674,0.000376,0.120715,0.058342,9.534765e-01,
ZYG11B,973.020734,0.047118,0.057112,0.968327,3.328810e-01,5.564327e-01
ZYX,8122.521671,0.021630,0.056939,-0.604609,5.454386e-01,7.388091e-01
ZZEF1,2618.836571,0.236818,0.081254,2.731306,6.308381e-03,4.576081e-02
