## Volcano Plot
[Code derived from COSMOS project in UCI Mortazavi Lab](https://github.com/maroonApricot/cosmos-project/tree/main)

In [None]:
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from adjustText import adjust_text
import pandas as pd

In [None]:
#import your data
data = pd.read_csv("expression_matrix.txt", sep=" ", index_col=0)

metadata = pd.read_csv("metadata.csv")
filtering = {'Sample.Type': ['Cancerous', 'Normal']} #whatever you need, can have multiple filters
group = 'Sample.Type' # change to the group you're interested in making the comparison in
metadata.index = metadata['Sample']
metadata.drop("Sample", axis=1, inplace=True)
metadata

Unnamed: 0_level_0,Sample.Type
Sample,Unnamed: 1_level_1
Ca1,Cancerous
N1,Normal
Ca2534465,Cancerous
N2534465,Normal
Ca2537821,Cancerous
N2537821,Normal
Ca3,Cancerous
N3,Normal
N6,Normal
N7,Normal


In [76]:
dds = DeseqDataSet(
    counts=data.T,
    metadata=metadata,
    design_factors=group,
    refit_cooks=True)

dds.deseq2()

stat_res = DeseqStats(dds,contrast=[group] + filtering[group])
stat_res.summary()

Using None as control genes, passed at DeseqDataSet initialization


  dds = DeseqDataSet(
Fitting size factors...
... done in 0.02 seconds.

Fitting dispersions...
... done in 1.51 seconds.

Fitting dispersion trend curve...
... done in 0.52 seconds.

Fitting MAP dispersions...
... done in 1.42 seconds.

Fitting LFCs...
... done in 1.66 seconds.

Calculating cook's distance...
... done in 0.03 seconds.

Replacing 2417 outlier genes.

Fitting dispersions...
... done in 0.15 seconds.

Fitting MAP dispersions...
... done in 0.16 seconds.

Fitting LFCs...
... done in 0.17 seconds.

Running Wald tests...


Log2 fold change & Wald test p-value: Sample.Type Cancerous vs Normal
                       baseMean  log2FoldChange     lfcSE      stat  \
gene_id                                                               
ENSG00000000003.13  1366.021925       -1.651776  0.319333 -5.172577   
ENSG00000000005.5      8.129428       -0.618465  0.601904 -1.027514   
ENSG00000000419.11   411.322601       -0.128036  0.157807 -0.811346   
ENSG00000000457.12   582.986758       -0.227849  0.206794 -1.101816   
ENSG00000000460.15   219.642978        0.310271  0.184904  1.678015   
...                         ...             ...       ...       ...   
ENSGR0000275287.3      0.000000             NaN       NaN       NaN   
ENSGR0000276543.3      0.000000             NaN       NaN       NaN   
ENSGR0000277120.3      0.000000             NaN       NaN       NaN   
ENSGR0000280767.1      0.000000             NaN       NaN       NaN   
ENSGR0000281849.1      0.000000             NaN       NaN       NaN   

      

... done in 1.62 seconds.



In [None]:
df = stat_res.results_df.copy(deep=True)
# Find the machine-specific lowest non-zero value
lowest_nonzero_value = df['padj'][df['padj'] > 0].min()

# Replace 0 with the lowest non-zero value
df['padj'] = np.where(df['padj'] == 0, lowest_nonzero_value, df['padj'])

df.dropna(inplace=True)

mapping_dict = pd.read_csv("ensembl_to_gene_symbol.csv")
df = df.rename(index=mapping_dict)

In [None]:
# Calculate -log10(padj)
df['nlog10padj'] = -np.log10(df['padj'])

# Add labels to DE column based on our cutoffs above
pval_cutoff = 0.05
l2fc_cutoff = 2

df['DE'] = "No"
df.DE[np.logical_and(df.padj < pval_cutoff, df.log2FoldChange > l2fc_cutoff)] = "Up"
df.DE[np.logical_and(df.padj < pval_cutoff, df.log2FoldChange < -l2fc_cutoff)] = "Down"


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.DE[np.logical_and(df.padj < pval_cutoff, df.log2FoldChange > l2fc_cutoff)] = "Up"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.DE[np.logical_and(df.padj < pval_cutoff, df.log2FoldChange < -l2fc_cutoff)] = "Down"


## Graphing volcano plot

In [None]:
df['label'] = df.index
df.label[df.DE == "No"] = ""

# Create the figure
fig, ax = plt.subplots()

# Set the figure size
fig.set_size_inches(10, 10)

# Plot whole df first (with small size dots)
ax.scatter(x=df['log2FoldChange'], y=df['nlog10padj'], s=1, label="Not significant")

# Highlight up- or down-regulated genes
down = df[df.DE == "Down"]
down.sort_values(["padj"], inplace=True)
up = df[df.DE == "Up"]
up.sort_values(["padj"], inplace=True)

# Overlay up- and down-regulated gene dfs with larger label and specific color
ax.scatter(x=down['log2FoldChange'], y=down['nlog10padj'], s=0.5, label="Down-regulated", color="blue")
ax.scatter(x=up['log2FoldChange'], y=up['nlog10padj'], s=0.5, label="Up-regulated", color="red")

# Point out gene of interest
n_genes = 20
texts = []
#texts.append(ax.text(x=down.loc["ABCE1", "log2FoldChange"],y=down.loc["ABCE1", 'nlog10padj'],s=down.loc["ABCE1", "label"]))
for i in range(min(n_genes, up.shape[0])):
    texts.append(ax.text(x=up.iloc[i, 1],
                         y=up.iloc[i, 8],
                         s=up.iloc[i, 7]))
for i in range(min(n_genes, down.shape[0])):
    texts.append(ax.text(x=down.iloc[i, 1],
                         y=down.iloc[i, 8],
                         s=down.iloc[i, 7]))
adjust_text(texts, arrowprops=dict(arrowstyle="-", color='white', lw=0.5))


# Draw lines indicating lfc and padj cutoffs
ax.set_xlabel("logFC")
ax.set_ylabel("-log10(adj. p-value)")
ax.axvline(l2fc_cutoff, color="grey", linestyle="--")
ax.axvline(-l2fc_cutoff, color="grey", linestyle="--")
ax.axhline(-np.log10(pval_cutoff), color="grey", linestyle="--")

# Draw legend
ax.legend()

# Add a title to the plot
ax.set_title("Control vs. Cancerous (ICC) Liver")

# Save the plot as a high-resolution PNG with specific width and height
output_file = "volcano.png"
plt.savefig(output_file, dpi=300)

# Show the plot
plt.show()
