In [2]:
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
import pandas as pd
import numpy as np
from adjustText import adjust_text
import matplotlib.pylab as plt



In [3]:
data = pd.read_csv("data_adj/fixed_counts_matrix.tsv", sep="\t")
#here is where we filter out what samples we want to use
data = data.filter(items=['gene_id',
                'LB_12.5_NA_1', 'LB_12.5_NA_2','LB_13.5_NA_1', 'LB_13.5_NA_2',
                'G_4_F_1', 'G_4_F_2','G_4_M_1', 'G_4_M_2',
                'SM_0_NA_1', 'SM_0_NA_2',
                'MB_1', 'MB_2','MB_3', 'MB_4','MB_5',
                'MT_1', 'MT_2', 'MT_3', 'MT_4', 'MT_5'])
data = data.set_index(data["gene_id"])
data = data.drop('gene_id', axis=1)

In [4]:
data.head()

Unnamed: 0_level_0,LB_12.5_NA_1,LB_12.5_NA_2,LB_13.5_NA_1,LB_13.5_NA_2,G_4_F_1,G_4_F_2,G_4_M_1,G_4_M_2,SM_0_NA_1,SM_0_NA_2,MB_1,MB_2,MB_3,MB_4,MB_5,MT_1,MT_2,MT_3,MT_4,MT_5
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
ENSMUSG00000099183.1,1,1,6,0,2,4,1,8,1,4,12,4,0,0,1,12,19,6,2,4
ENSMUSG00000065559.1,9937,9002,22175,23657,720323,517475,652657,101,298316,447669,70786,15451,27143,24389,27329,2584038,4111798,1074677,1300194,1256891
ENSMUSG00000065480.1,212,156,446,452,444,373,427,10,2556,4440,2433,306,334,359,1088,103180,104254,26475,26033,21244
ENSMUSG00000065405.3,27408,19015,28252,23210,26029,16861,16800,17258,17696,20188,223146,132440,125913,135138,120392,46398,64996,27729,28878,25335
ENSMUSG00000065567.1,77,90,65,71,213,240,186,1986,51,87,5289,1440,2074,1710,2018,535,812,517,529,416


In [5]:
metadata = pd.read_csv("data_adj/metadata.tsv", sep="\t",
                      usecols=["File.accession", "Biosample.term.name", "Age", "Sample"])

In [6]:
# Filter metadata DataFrame based on matching samples in merged_df
metadata_filtered = metadata[metadata['Sample'].isin(data.columns)].copy()

# Sort filtered_metadata based on the order of columns in merged_df
metadata_filtered['Sample'] = pd.Categorical(metadata_filtered['Sample'], categories=data.columns)
metadata_filtered.sort_values('Sample', inplace=True)

In [7]:
filtering = {'Biosample.term.name': ['C2C12', 'limb'],#whatever you need from the biosample names
             'Age': ["ED_12.5","ED_13.5","0"]}
group = 'Biosample.term.name' # change to the group you're interested in making the comparison in

In [8]:
metadata_selected = metadata_filtered.copy(deep=True)

In [9]:
for col in filtering.keys():
    metadata_selected = metadata_selected[metadata_selected[col].isin(filtering[col])]

In [10]:
metadata_selected

Unnamed: 0,File.accession,Biosample.term.name,Age,Sample
37,ENCFF498WOL,limb,ED_12.5,LB_12.5_NA_1
38,ENCFF755ZZE,limb,ED_12.5,LB_12.5_NA_2
39,ENCFF093XOF,limb,ED_13.5,LB_13.5_NA_1
40,ENCFF369MXS,limb,ED_13.5,LB_13.5_NA_2
0,ENCFF784UWQ,C2C12,0,MB_1
1,ENCFF094EFP,C2C12,0,MB_2
51,Ctrl_0hr_A_new,C2C12,0,MB_3
52,Ctrl_0hr_B_new,C2C12,0,MB_4
53,Ctrl_0hr_C_new,C2C12,0,MB_5


In [11]:
metadata_selected.index = metadata_selected['Sample']
data_selected = data.loc[:, metadata_selected['Sample']]

In [12]:
data_selected

Unnamed: 0_level_0,LB_12.5_NA_1,LB_12.5_NA_2,LB_13.5_NA_1,LB_13.5_NA_2,MB_1,MB_2,MB_3,MB_4,MB_5
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSMUSG00000099183.1,1,1,6,0,12,4,0,0,1
ENSMUSG00000065559.1,9937,9002,22175,23657,70786,15451,27143,24389,27329
ENSMUSG00000065480.1,212,156,446,452,2433,306,334,359,1088
ENSMUSG00000065405.3,27408,19015,28252,23210,223146,132440,125913,135138,120392
ENSMUSG00000065567.1,77,90,65,71,5289,1440,2074,1710,2018
...,...,...,...,...,...,...,...,...,...
ENSMUSG00000099172.1,0,0,0,0,0,0,0,0,0
ENSMUSG00000093219.1,0,1,1,1,13,3,1,1,0
ENSMUSG00000065602.1,1899,1448,5934,6537,57897,15115,4542,5196,4304
ENSMUSG00000065536.2,8231,6966,17079,20465,189764,38714,5384,5343,4217


In [13]:
metadata_selected

Unnamed: 0_level_0,File.accession,Biosample.term.name,Age,Sample
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LB_12.5_NA_1,ENCFF498WOL,limb,ED_12.5,LB_12.5_NA_1
LB_12.5_NA_2,ENCFF755ZZE,limb,ED_12.5,LB_12.5_NA_2
LB_13.5_NA_1,ENCFF093XOF,limb,ED_13.5,LB_13.5_NA_1
LB_13.5_NA_2,ENCFF369MXS,limb,ED_13.5,LB_13.5_NA_2
MB_1,ENCFF784UWQ,C2C12,0,MB_1
MB_2,ENCFF094EFP,C2C12,0,MB_2
MB_3,Ctrl_0hr_A_new,C2C12,0,MB_3
MB_4,Ctrl_0hr_B_new,C2C12,0,MB_4
MB_5,Ctrl_0hr_C_new,C2C12,0,MB_5


In [14]:
dds = DeseqDataSet(
    counts=data_selected.T,
    clinical=metadata_selected,
    design_factors=group,
    refit_cooks=True)

dds.deseq2()
stat_res = DeseqStats(dds, 
                      contrast=[group] + filtering[group])
stat_res.summary()

Fitting size factors...
... done in 0.00 seconds.





Fitting dispersions...
... done in 5.79 seconds.

Fitting dispersion trend curve...
... done in 1.02 seconds.

Fitting MAP dispersions...
... done in 4.70 seconds.

Fitting LFCs...
... done in 1.74 seconds.

Refitting 0 outliers.

Running Wald tests...
... done in 1.01 seconds.

Log2 fold change & Wald test p-value: Biosample.term.name C2C12 vs limb


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSMUSG00000099183.1,1.809183,-0.492838,1.253140,-0.393282,6.941110e-01,7.621611e-01
ENSMUSG00000065559.1,22854.064453,0.492107,0.453132,1.086014,2.774729e-01,3.632023e-01
ENSMUSG00000065480.1,512.920654,0.801842,0.555936,1.442329,1.492097e-01,2.129895e-01
ENSMUSG00000065405.3,84913.140625,2.165969,0.340205,6.366654,1.931963e-10,1.019181e-09
ENSMUSG00000065567.1,1141.655396,4.426417,0.396570,11.161759,6.273790e-29,8.783306e-28
...,...,...,...,...,...,...
ENSMUSG00000099172.1,0.000000,,,,,
ENSMUSG00000093219.1,1.281263,1.014564,1.226110,0.827466,4.079730e-01,5.000073e-01
ENSMUSG00000065602.1,6812.530273,0.957157,0.512544,1.867464,6.183683e-02,1.009355e-01
ENSMUSG00000065536.2,17156.472656,0.306778,0.667267,0.459753,6.456933e-01,7.219076e-01


In [15]:
l2fc_cutoff = 2 # Log 2 fold change; usually 0.5 - 2
pval_cutoff = 0.05 # 0.01 is even better than 0.05

In [16]:
annot = pd.read_csv("data_adj/mirna_id_to_name.tsv", sep="\t")
annot.drop(columns=['Unnamed: 0'], inplace=True)

In [17]:
annot.head()

Unnamed: 0,gene_id,gene_name
0,ENSMUSG00000065559.1,Mir206
1,ENSMUSG00000065480.1,Mir133b
2,ENSMUSG00000065567.1,Mir30c-2
3,ENSMUSG00000105982.1,Mir6896
4,ENSMUSG00000092852.1,Mir5103


In [18]:
df = stat_res.results_df.copy(deep=True)

In [19]:
# Find the machine-specific lowest non-zero value
lowest_nonzero_value = df['padj'][df['padj'] > 0].min()

# Replace 0 with the lowest non-zero value
df['padj'] = np.where(df['padj'] == 0, lowest_nonzero_value, df['padj'])

df = df.dropna()
df = df.merge(annot, left_index=True, right_on='gene_id')
df

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene_id,gene_name
0,22854.064453,0.492107,0.453132,1.086014,2.774729e-01,3.632023e-01,ENSMUSG00000065559.1,Mir206
1,512.920654,0.801842,0.555936,1.442329,1.492097e-01,2.129895e-01,ENSMUSG00000065480.1,Mir133b
818,84913.140625,2.165969,0.340205,6.366654,1.931963e-10,1.019181e-09,ENSMUSG00000065405.3,Mir30a
2,1141.655396,4.426417,0.396570,11.161759,6.273790e-29,8.783306e-28,ENSMUSG00000065567.1,Mir30c-2
3,3.741728,2.800741,1.058961,2.644801,8.173897e-03,1.685722e-02,ENSMUSG00000105982.1,Mir6896
...,...,...,...,...,...,...,...,...
810,45.555779,-8.215484,1.285252,-6.392120,1.636011e-10,8.693548e-10,ENSMUSG00000080331.1,Mir1298
811,4.724970,-6.061286,2.550924,-2.376114,1.749607e-02,3.326386e-02,ENSMUSG00000065450.1,Mir448
813,1.281263,1.014564,1.226110,0.827466,4.079730e-01,5.000073e-01,ENSMUSG00000093219.1,Mir3113
814,6812.530273,0.957157,0.512544,1.867464,6.183683e-02,1.009355e-01,ENSMUSG00000065602.1,Mirlet7f-2


In [20]:
# Calculate -log10(padj)
df['nlog10padj'] = -np.log10(df['padj'])

# Add labels to DE column based on our cutoffs above
df['DE'] = "No"
df.DE[np.logical_and(df.padj < pval_cutoff, df.log2FoldChange > l2fc_cutoff)] = "Up"
df.DE[np.logical_and(df.padj < pval_cutoff, df.log2FoldChange < -l2fc_cutoff)] = "Down"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.DE[np.logical_and(df.padj < pval_cutoff, df.log2FoldChange > l2fc_cutoff)] = "Up"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.DE[np.logical_and(df.padj < pval_cutoff, df.log2FoldChange < -l2fc_cutoff)] = "Down"


In [22]:
df.to_csv("data_adj/deg_quantity/MB_ED_mid.tsv", sep="\t", columns=["gene_name", "DE"])

In [None]:
import matplotlib.pyplot as plt
from adjustText import adjust_text

df['label'] = df.gene_name
df.label[df.DE == "No"] = ""

# Create the figure
fig, ax = plt.subplots()

# Set the figure size
fig.set_size_inches(10, 10)

# Plot whole df first (with small size dots)
ax.scatter(x=df['log2FoldChange'], y=df['nlog10padj'], s=1, label="Not significant")

# Highlight up- or down-regulated genes
down = df[df.DE == "Down"]
down.sort_values(["padj"], inplace=True)
up = df[df.DE == "Up"]
up.sort_values(["padj"], inplace=True)

# Overlay up- and down-regulated gene dfs with larger label and specific color
ax.scatter(x=down['log2FoldChange'], y=down['nlog10padj'], s=3, label="Down-regulated", color="blue")
ax.scatter(x=up['log2FoldChange'], y=up['nlog10padj'], s=3, label="Up-regulated", color="red")

# Display names of top 20 up- or down-regulated genes
n_genes = 20
texts = []
for i in range(min(n_genes, up.shape[0])):
    texts.append(ax.text(x=up.iloc[i, 1],
                         y=up.iloc[i, 8],
                         s=up.iloc[i, 7]))
for i in range(min(n_genes, down.shape[0])):
    texts.append(ax.text(x=down.iloc[i, 1],
                         y=down.iloc[i, 8],
                         s=down.iloc[i, 7]))
adjust_text(texts, arrowprops=dict(arrowstyle="-", color='black', lw=0.5))

# Draw lines indicating lfc and padj cutoffs
ax.set_xlabel("logFC")
ax.set_ylabel("-log10(adj. p-value)")
ax.axvline(l2fc_cutoff, color="grey", linestyle="--")
ax.axvline(-l2fc_cutoff, color="grey", linestyle="--")
ax.axhline(-np.log10(pval_cutoff), color="grey", linestyle="--")

# Draw legend
ax.legend()

# Add a title to the plot
ax.set_title("C2C12 0hr vs Embryonic")
#ax.set_title("Skeletal Muscle Embryonic vs PND0")

# Save the plot as a high-resolution PNG with specific width and height
output_file = "plots/volcano_MB.png"
#output_file = "plots/volcano_skeletal.png"
plt.savefig(output_file, dpi=300)

# Show the plot
plt.show()