In [1]:
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
import pandas as pd
import numpy as np
from adjustText import adjust_text
import matplotlib.pylab as plt



In [2]:
data = pd.read_csv("data_adj/fixed_counts_matrix.tsv", sep="\t")
#here is where you filter which samples you want
data = data.filter(items=['gene_id',
                'LB_10.5_NA_1', 'LB_10.5_NA_2','LB_11.5_NA_1', 'LB_11.5_NA_2',
                'G_4_F_1', 'G_4_F_2','G_4_M_1', 'G_4_M_2',
                'SM_0_NA_1', 'SM_0_NA_2',
                'MB_1', 'MB_2','MB_3', 'MB_4','MB_5',
                'MT_1', 'MT_2', 'MT_3', 'MT_4', 'MT_5'])
data = data.set_index(data["gene_id"])
data = data.drop('gene_id', axis=1)

In [3]:
data.head()

Unnamed: 0_level_0,LB_10.5_NA_1,LB_10.5_NA_2,LB_11.5_NA_1,LB_11.5_NA_2,G_4_F_1,G_4_F_2,G_4_M_1,G_4_M_2,SM_0_NA_1,SM_0_NA_2,MB_1,MB_2,MB_3,MB_4,MB_5,MT_1,MT_2,MT_3,MT_4,MT_5
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
ENSMUSG00000099183.1,0,0,0,0,2,4,1,8,1,4,12,4,0,0,1,12,19,6,2,4
ENSMUSG00000065559.1,96,350,1561,1909,720323,517475,652657,101,298316,447669,70786,15451,27143,24389,27329,2584038,4111798,1074677,1300194,1256891
ENSMUSG00000065480.1,4,25,92,83,444,373,427,10,2556,4440,2433,306,334,359,1088,103180,104254,26475,26033,21244
ENSMUSG00000065405.3,18023,20008,13458,15665,26029,16861,16800,17258,17696,20188,223146,132440,125913,135138,120392,46398,64996,27729,28878,25335
ENSMUSG00000065567.1,125,57,111,71,213,240,186,1986,51,87,5289,1440,2074,1710,2018,535,812,517,529,416


In [4]:
metadata = pd.read_csv("data_adj/metadata.tsv", sep="\t",
                      usecols=["File.accession", "Biosample.term.name", "Age", "Sample"])

In [5]:
# Filter metadata DataFrame based on matching samples in merged_df
metadata_filtered = metadata[metadata['Sample'].isin(data.columns)].copy()

# Sort filtered_metadata based on the order of columns in merged_df
metadata_filtered['Sample'] = pd.Categorical(metadata_filtered['Sample'], categories=data.columns)
metadata_filtered.sort_values('Sample', inplace=True)

In [6]:
filtering = {'Biosample.term.name': ['gastrocnemius', 'skeletal muscle tissue'],#whatever you need from the biosample names
             'Age': ["PND_04","PND_0"]}
group = 'Age' # change to the group you're interested in making the comparison in

In [7]:
metadata_selected = metadata_filtered.copy(deep=True)

In [8]:
for col in filtering.keys():
    metadata_selected = metadata_selected[metadata_selected[col].isin(filtering[col])]

In [9]:
metadata_selected

Unnamed: 0,File.accession,Biosample.term.name,Age,Sample
2,ENCFF810XKL,gastrocnemius,PND_04,G_4_F_1
3,ENCFF641TJV,gastrocnemius,PND_04,G_4_F_2
4,ENCFF859KXI,gastrocnemius,PND_04,G_4_M_1
5,ENCFF286MEN,gastrocnemius,PND_04,G_4_M_2
47,ENCFF431VHL,skeletal muscle tissue,PND_0,SM_0_NA_1
49,ENCFF430EME,skeletal muscle tissue,PND_0,SM_0_NA_2


In [10]:
metadata_selected.index = metadata_selected['Sample']
data_selected = data.loc[:, metadata_selected['Sample']]

In [11]:
data_selected

Unnamed: 0_level_0,G_4_F_1,G_4_F_2,G_4_M_1,G_4_M_2,SM_0_NA_1,SM_0_NA_2
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSMUSG00000099183.1,2,4,1,8,1,4
ENSMUSG00000065559.1,720323,517475,652657,101,298316,447669
ENSMUSG00000065480.1,444,373,427,10,2556,4440
ENSMUSG00000065405.3,26029,16861,16800,17258,17696,20188
ENSMUSG00000065567.1,213,240,186,1986,51,87
...,...,...,...,...,...,...
ENSMUSG00000099172.1,0,0,0,0,0,0
ENSMUSG00000093219.1,0,0,0,0,0,1
ENSMUSG00000065602.1,10619,7689,8256,10667,5658,5973
ENSMUSG00000065536.2,16682,12544,14124,8378,12613,14819


In [12]:
metadata_selected

Unnamed: 0_level_0,File.accession,Biosample.term.name,Age,Sample
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
G_4_F_1,ENCFF810XKL,gastrocnemius,PND_04,G_4_F_1
G_4_F_2,ENCFF641TJV,gastrocnemius,PND_04,G_4_F_2
G_4_M_1,ENCFF859KXI,gastrocnemius,PND_04,G_4_M_1
G_4_M_2,ENCFF286MEN,gastrocnemius,PND_04,G_4_M_2
SM_0_NA_1,ENCFF431VHL,skeletal muscle tissue,PND_0,SM_0_NA_1
SM_0_NA_2,ENCFF430EME,skeletal muscle tissue,PND_0,SM_0_NA_2


In [13]:
dds = DeseqDataSet(
    counts=data_selected.T,
    clinical=metadata_selected,
    design_factors=group,
    refit_cooks=True)

dds.deseq2()
stat_res = DeseqStats(dds, 
                      contrast=[group] + filtering[group])
stat_res.summary()

Fitting size factors...
... done in 0.00 seconds.





Fitting dispersions...
... done in 7.37 seconds.

Fitting dispersion trend curve...
... done in 1.03 seconds.

Fitting MAP dispersions...
... done in 4.04 seconds.

Fitting LFCs...
... done in 1.64 seconds.

Refitting 0 outliers.

Running Wald tests...
... done in 1.02 seconds.

Log2 fold change & Wald test p-value: Age PND_04 vs PND_0


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSMUSG00000099183.1,3.512399,1.037371,1.340909,0.773632,0.439148,0.794191
ENSMUSG00000065559.1,403528.906250,0.543952,1.559188,0.348869,0.727188,0.903588
ENSMUSG00000065480.1,1160.309448,-3.258662,1.063341,-3.064550,0.002180,0.032443
ENSMUSG00000065405.3,18439.164062,0.308742,0.187133,1.649852,0.098973,0.444546
ENSMUSG00000065567.1,547.787781,3.801855,1.156371,3.287746,,
...,...,...,...,...,...,...
ENSMUSG00000099172.1,0.000000,,,,,
ENSMUSG00000093219.1,0.127214,-1.108311,4.537885,-0.244235,0.807049,
ENSMUSG00000065602.1,8139.511719,0.992996,0.293025,3.388770,0.000702,0.012687
ENSMUSG00000065536.2,12522.605469,0.180131,0.512245,0.351650,0.725101,0.903588


In [14]:
l2fc_cutoff = 2 # Log 2 fold change; usually 0.5 - 2
pval_cutoff = 0.05 # 0.01 is even better than 0.05

In [15]:
annot = pd.read_csv("data_adj/mirna_id_to_name.tsv", sep="\t")
annot.drop(columns=['Unnamed: 0'], inplace=True)

In [16]:
df = stat_res.results_df.copy(deep=True)

In [17]:
# Find the machine-specific lowest non-zero value
lowest_nonzero_value = df['padj'][df['padj'] > 0].min()

# Replace 0 with the lowest non-zero value
df['padj'] = np.where(df['padj'] == 0, lowest_nonzero_value, df['padj'])

df = df.dropna()
df = df.merge(annot, left_index=True, right_on='gene_id')
df

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene_id,gene_name
0,403528.906250,0.543952,1.559188,0.348869,0.727188,0.903588,ENSMUSG00000065559.1,Mir206
1,1160.309448,-3.258662,1.063341,-3.064550,0.002180,0.032443,ENSMUSG00000065480.1,Mir133b
818,18439.164062,0.308742,0.187133,1.649852,0.098973,0.444546,ENSMUSG00000065405.3,Mir30a
4,2.877694,-4.673396,1.772898,-2.636021,0.008388,0.084891,ENSMUSG00000092852.1,Mir5103
5,4.517928,0.572441,1.038588,0.551172,0.581516,0.884771,ENSMUSG00000098752.1,Mir6897
...,...,...,...,...,...,...,...,...
805,24862.458984,0.994421,0.422847,2.351728,0.018686,0.163023,ENSMUSG00000076011.1,Mir652
809,1.920392,-0.055842,1.506615,-0.037065,0.970433,0.988220,ENSMUSG00000093122.1,Mir1264
810,6.612656,1.366478,1.496950,0.912842,0.361326,0.742432,ENSMUSG00000080331.1,Mir1298
814,8139.511719,0.992996,0.293025,3.388770,0.000702,0.012687,ENSMUSG00000065602.1,Mirlet7f-2


In [18]:
# Calculate -log10(padj)
df['nlog10padj'] = -np.log10(df['padj'])

# Add labels to DE column based on our cutoffs above
df['DE'] = "No"
df.DE[np.logical_and(df.padj < pval_cutoff, df.log2FoldChange > l2fc_cutoff)] = "Up"
df.DE[np.logical_and(df.padj < pval_cutoff, df.log2FoldChange < -l2fc_cutoff)] = "Down"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.DE[np.logical_and(df.padj < pval_cutoff, df.log2FoldChange > l2fc_cutoff)] = "Up"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.DE[np.logical_and(df.padj < pval_cutoff, df.log2FoldChange < -l2fc_cutoff)] = "Down"


In [19]:
df

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene_id,gene_name,nlog10padj,DE
0,403528.906250,0.543952,1.559188,0.348869,0.727188,0.903588,ENSMUSG00000065559.1,Mir206,0.044029,No
1,1160.309448,-3.258662,1.063341,-3.064550,0.002180,0.032443,ENSMUSG00000065480.1,Mir133b,1.488877,Down
818,18439.164062,0.308742,0.187133,1.649852,0.098973,0.444546,ENSMUSG00000065405.3,Mir30a,0.352084,No
4,2.877694,-4.673396,1.772898,-2.636021,0.008388,0.084891,ENSMUSG00000092852.1,Mir5103,1.071137,No
5,4.517928,0.572441,1.038588,0.551172,0.581516,0.884771,ENSMUSG00000098752.1,Mir6897,0.053169,No
...,...,...,...,...,...,...,...,...,...,...
805,24862.458984,0.994421,0.422847,2.351728,0.018686,0.163023,ENSMUSG00000076011.1,Mir652,0.787751,No
809,1.920392,-0.055842,1.506615,-0.037065,0.970433,0.988220,ENSMUSG00000093122.1,Mir1264,0.005146,No
810,6.612656,1.366478,1.496950,0.912842,0.361326,0.742432,ENSMUSG00000080331.1,Mir1298,0.129343,No
814,8139.511719,0.992996,0.293025,3.388770,0.000702,0.012687,ENSMUSG00000065602.1,Mirlet7f-2,1.896628,No


In [None]:
import matplotlib.pyplot as plt
from adjustText import adjust_text

df['label'] = df.gene_name
df.label[df.DE == "No"] = ""

# Create the figure
fig, ax = plt.subplots()

# Set the figure size
fig.set_size_inches(10, 10)

# Plot whole df first (with small size dots)
ax.scatter(x=df['log2FoldChange'], y=df['nlog10padj'], s=1, label="Not significant")

# Highlight up- or down-regulated genes
down = df[df.DE == "Down"]
down.sort_values(["padj"], inplace=True)
up = df[df.DE == "Up"]
up.sort_values(["padj"], inplace=True)

# Overlay up- and down-regulated gene dfs with larger label and specific color
ax.scatter(x=down['log2FoldChange'], y=down['nlog10padj'], s=3, label="Down-regulated", color="blue")
ax.scatter(x=up['log2FoldChange'], y=up['nlog10padj'], s=3, label="Up-regulated", color="red")

# Display names of top 20 up- or down-regulated genes
n_genes = 20
texts = []
for i in range(min(n_genes, up.shape[0])):
    texts.append(ax.text(x=up.iloc[i, 1],
                         y=up.iloc[i, 8],
                         s=up.iloc[i, 7]))
for i in range(min(n_genes, down.shape[0])):
    texts.append(ax.text(x=down.iloc[i, 1],
                         y=down.iloc[i, 8],
                         s=down.iloc[i, 7]))
adjust_text(texts, arrowprops=dict(arrowstyle="-", color='black', lw=0.5))

# Draw lines indicating lfc and padj cutoffs
ax.set_xlabel("logFC")
ax.set_ylabel("-log10(adj. p-value)")
ax.axvline(l2fc_cutoff, color="grey", linestyle="--")
ax.axvline(-l2fc_cutoff, color="grey", linestyle="--")
ax.axhline(-np.log10(pval_cutoff), color="grey", linestyle="--")

# Draw legend
ax.legend()

# Add a title to the plot
#ax.set_title("C2C12 0hr vs 72hr")
ax.set_title("Skeletal Muscle PND0 vs PND4")

# Save the plot as a high-resolution PNG with specific width and height
#output_file = "plots/volcano_c2c12.png"
output_file = "plots/volcano_skeletal_pnd0_4.png"
plt.savefig(output_file, dpi=300)

# Show the plot
plt.show()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.label[df.DE == "No"] = ""
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  down.sort_values(["padj"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  up.sort_values(["padj"], inplace=True)
