In [26]:
import numpy as np
import pandas as pd
import scanpy as sc

In [27]:
adata = sc.read("lyon_transcriptomics_scanpy_normalised.h5")
adata.obs["Metadata_treatments"] = adata.obs["Metadata_treatments"].astype(str)
adata.obs["Metadata_target"] = adata.obs["Metadata_target"].astype(str)
adata.obs["Metadata_Mode_of_Action"] = adata.obs["Metadata_Mode_of_Action"].astype(str)
adata.obs.loc[adata.obs["Metadata_treatments"] == "DMSO", "Metadata_target"] = "DMSO"
adata.obs.loc[adata.obs["Metadata_treatments"] == "DMSO", "Metadata_Mode_of_Action"] = "DMSO"

In [28]:
# rename_dict = {
#     "cell wal" : 'Cell wall',
#     "lipid other": "lipid",
#     "lipid SBI": "lipid",
# }
# adata.obs["Metadata_Mode_of_Action"].replace(rename_dict, inplace=True)
# adata.obs["Metadata_Mode_of_Action"] = adata.obs["Metadata_Mode_of_Action"].str.capitalize()

# adata = adata[adata.obs["Metadata_Mode_of_Action"].isin(adata.obs["Metadata_Mode_of_Action"].value_counts()[adata.obs["Metadata_Mode_of_Action"].value_counts() >= 10].index)]
adata.obs["Metadata_Mode_of_Action"].value_counts()

Metadata_Mode_of_Action
lipid SBI           245
respiration         178
lipid other         129
GPI                 102
cytoskeleton         94
amino acid           72
prot metabolism      58
proteins             57
DMSO                 56
DHODH                56
cell wal             44
HOG                  43
Cell wall            32
Multi                30
HDAC                 28
ATP proton pump      25
GPI                  25
IMPDH                25
Name: count, dtype: int64

In [29]:
import pickle

gene_mapping = pickle.load(open("gene_mapping_botrytis.pkl", "rb"))

In [30]:
pickle.dump(gene_mapping, open("gene_mapping_botrytis.pkl", "wb"))

In [31]:
adata.var["Gene Name"] = [gene_mapping[x] if x in gene_mapping else x for x in adata.var.index]

In [32]:
adata.var["Gene Name"] = adata.var["Gene Name"].astype(str)
# Check for duplicate gene names and add a suffix if necessary
gene_name_counts = adata.var["Gene Name"].value_counts()
duplicate_gene_names = gene_name_counts[gene_name_counts > 1].index

for gene_name in duplicate_gene_names:
    indices = adata.var[adata.var["Gene Name"] == gene_name].index
    for i, idx in enumerate(indices):
        adata.var.at[idx, "Gene Name"] = f"{gene_name}_{i + 1}"

In [33]:
adata.obs["Metadata_Mode_of_Action"].unique()

array(['DMSO', 'lipid other ', 'lipid SBI ', 'prot metabolism ',
       'respiration ', 'IMPDH', 'cell wal ', 'Cell wall ',
       'cytoskeleton ', 'DHODH', 'GPI', 'HDAC', 'HOG', 'amino acid ',
       'ATP proton pump ', 'GPI ', 'proteins ', 'Multi'], dtype=object)

In [34]:
def gene_expression_per_group(subsample_adata, group, reference, genes2display=5):
    sc.tl.rank_genes_groups(
        subsample_adata,
        group,
        reference=reference,
        # method="wilcoxon",
    )
    # sc.pl.rank_genes_groups_dotplot(
    #     subsample_adata,
    #     n_genes = 5,
    #     standard_scale='var',
    #     gene_symbols = "Gene Name"
    # )
    # sc.pl.rank_genes_groups_dotplot(
    #     adata,
    #     n_genes = 5,
    #     standard_scale='var',
    #     gene_symbols = "Summary functional categories"
    # )
    subsample_adata = subsample_adata[subsample_adata.obs[group] != reference]

    subsample_adata.obs[group] = subsample_adata.obs[group].astype("category")
    sc.tl.dendrogram(subsample_adata, groupby=group)

    sc.pl.rank_genes_groups_dotplot(
        subsample_adata,
        n_genes=genes2display,
        # standard_scale='var',
        vmin=-3,
        vmax=3,
        cmap="bwr",
        values_to_plot="logfoldchanges",
        gene_symbols="Gene Name",
    )
    # sc.pl.rank_genes_groups_heatmap(
    #     subsample_adata,
    #     n_genes=10,
    #     use_raw=False,
    #     swap_axes=True,
    #     show_gene_labels=False,
    #     vmin=-3,
    #     vmax=3,
    #     cmap="bwr",
    #     gene_symbols = "Gene Name"
    # )
    # sc.pl.rank_genes_groups_dotplot(
    #     adata,
    #     n_genes=5,  # Adjust to the number of genes you want to plot
    #     #cmap = "bwr",
    #     values_to_plot="logfoldchanges",
    #     gene_symbols = "Description summary"
    # )

    # sc.pl.rank_genes_groups(subsample_adata, n_genes=10, sharey=False, gene_symbols = "Gene Name")

    # for member in subsample_adata.obs[group].unique():
    #     if member == "DMSO":
    #         continue
    #     sc.pl.rank_genes_groups_violin(subsample_adata, groups=member, n_genes=5, gene_symbols = "Gene Name")

In [35]:
# gene_expression_per_group(adata, "Metadata_target", "DMSO", 2)

In [36]:
# for moa in adata.obs["Metadata_Mode_of_Action"].unique():
#     if moa == "DMSO":
#         continue
#     adata_small = adata[adata.obs["Metadata_Mode_of_Action"].isin([moa, "DMSO"])].copy()
#     if adata_small.obs["Metadata_target"].nunique() < 2:
#         continue
#     print(moa)
#     adata_small.obs.loc[adata_small.obs["Metadata_Mode_of_Action"] == "DMSO", "Metadata_target"] = "DMSO"
#     print(adata_small.obs["Metadata_target"].value_counts())
#     gene_expression_per_group(adata_small, "Metadata_target", "DMSO")

In [37]:
df = pd.read_csv("full_raw_matrix_and_meta.csv", index_col=0)
leiden_dict = pickle.load(open("leiden_dict_lyon.pkl", "rb"))
df_meta = pd.read_csv("transcriptomics_moa_target 1.csv", index_col=0)
df_meta.loc[df_meta["Pathway / Target"] == "DMSO", "MoA_tr"] = "DMSO"
df_meta.loc[df_meta["Pathway / Target"] == "DMSO", "MoA_tr"] = "DMSO"
# df_meta.columns = [ "Metadata_" + x for x in df_meta.columns]
moa_dict = dict(zip(df_meta["BCS-code"], df_meta["MoA_tr"]))
target_dict = dict(zip(df_meta["BCS-code"], df_meta["MoA_tr"]))
moa_dict["DMSO"] = "DMSO"
target_dict["DMSO"] = "DMSO"

df["Metadata_MoA"] = np.nan
df["Metadata_target"] = np.nan
df["Metadata_MOA"] = df["Metadata_treatments"].map(moa_dict)
df["Metadata_target"] = df["Metadata_treatments"].map(target_dict)
df["Metadata_leiden"] = df["Metadata_treatments"].map(leiden_dict)

gene_mapping = pickle.load(open("gene_mapping_botrytis.pkl", "rb"))
print(df.shape)
df.rename(
    columns={
        "Metadata_MOA": "Metadata_Mode_of_Action",
        "Metadata_treatments": "Metadata_treatments",
    },
    inplace=True,
)
df.loc[df["Metadata_treatments"] == "DMSO", "Metadata_concentration"] = 1

(1440, 12205)


In [38]:
df["Metadata_leiden"].unique()

array(['Leiden cluster 3', 'Leiden cluster 6', 'Leiden cluster 5',
       'Leiden cluster 10', 'Leiden cluster 20', 'Leiden cluster 15',
       'Leiden cluster 0', 'Leiden cluster 8', 'Leiden cluster 13',
       'Leiden cluster 17', 'Leiden cluster 1', 'Leiden cluster 7',
       'Leiden cluster 12', 'Leiden cluster 14', 'Leiden cluster 16',
       'Leiden cluster 2', 'Leiden cluster 19', 'Leiden cluster 18',
       'Leiden cluster 11', 'Leiden cluster 4', 'Leiden cluster 9', nan],
      dtype=object)

In [39]:
s = df.columns.to_series().groupby(df.columns)

df.columns = np.where(
    s.transform("size") > 1, df.columns + s.cumcount().add(1).astype(str), df.columns
)

In [40]:
gene_name_counts = df.columns.value_counts()
gene_name_counts[gene_name_counts > 1]

Series([], Name: count, dtype: int64)

In [41]:
from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats


def deseq(df: pd.DataFrame, group: str, reference: str) -> pd.DataFrame:
    """
    Perform differential expression analysis using DESeq2.

    This function takes a DataFrame containing gene expression counts and metadata,
    and performs differential expression analysis using the DESeq2 method. It prepares
    the data, fits the DESeq2 model, and returns a summary of the results sorted by
    adjusted p-values.

    Parameters:
    ----------
    df : pandas.DataFrame
        A DataFrame where rows represent genes and columns represent samples. The DataFrame
        should include metadata columns prefixed with "Metadata_" and a column for the
        grouping variable.

    group : str
        The name of the column in the DataFrame that specifies the grouping variable for
        the differential expression analysis. This variable should contain the treatment
        groups to be compared.

    reference : str
        The reference level for the contrast in the differential expression analysis. This
        should be one of the levels present in the `group` column.

    Returns:
    -------
    pandas.DataFrame
        A DataFrame containing the results of the differential expression analysis, sorted
        by adjusted p-values.
    """

    # if os.path.isfile(f"differential_expression_tables/{treatment}_differential_expression.csv"):
    #     print("File exists, skipping")
    #     return
    # Identify gene columns by excluding metadata columns
    gene_columns = df.columns[~df.columns.str.startswith("Metadata_")]

    # Drop rows with NA values in specified columns
    df.dropna(subset=["Metadata_plate", group], inplace=True)

    # Prepare counts DataFrame with genes as rows and samples as columns, filling NA with 0
    counts = df[gene_columns].fillna(0).astype(int)

    # Retain only genes with non-zero variance
    counts = counts.loc[:, counts.var() != 0]

    # Prepare metadata DataFrame
    metadata = df[["Metadata_plate", group]]
    print(counts.shape)  # Print shape of counts DataFrame
    print(metadata.shape)  # Print shape of metadata DataFrame

    # Initialize inference with specified number of CPUs
    inference = DefaultInference(n_cpus=14)

    # Create DESeqDataSet object
    dds = DeseqDataSet(
        counts=counts.reset_index(drop=True),
        metadata=metadata.reset_index(drop=True),
        design=f"~ Metadata_plate + {group}",
        refit_cooks=True,
        inference=inference,
    )

    # Fit size factors and dispersions
    dds.fit_size_factors()
    dds.fit_genewise_dispersions()
    dds.fit_dispersion_trend()
    dds.fit_dispersion_prior()
    dds.fit_MAP_dispersions()
    dds.fit_LFC()
    dds.calculate_cooks()

    # Refitting outlier counts if specified
    if dds.refit_cooks:
        dds.refit()

    # Set up and run differential expression statistics
    ds = DeseqStats(
        dds,
        contrast=[group, treatment, reference],
        alpha=0.05,
        cooks_filter=True,
        independent_filter=True,
    )
    ds.run_wald_test()

    # Perform independent filtering or p-value adjustment
    if ds.independent_filter:
        ds._independent_filtering()
    else:
        ds._p_value_adjustment()

    ds.summary()  # Print summary of results

    # Sort results by adjusted p-values
    summary_df = ds.results_df.copy()
    summary_df.sort_values("padj", inplace=True)

    return summary_df  # Return sorted results DataFrame

In [42]:
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
import os


def make_volcano(data_frame, treatment):
    """
    Make an interactive volcano plot as an html. Attach meta data to individual genes.

    Parameters
    ----------
    treatment : string
        Name of the treatment

    Returns
    -------
    fig : object
        The figure object of the volcano plot

    pd.Series : object
        Pandas series containing the log2FoldChange. Will be used later for
        colouring in the plot_gene_atlas function
    """
    # if os.path.isfile(f"differential_expression_tables/{treatment}_differential_expression.csv"):
    #     print("File exists, skipping")
    #     return


    # Convert p-values into -Log10p
    data_frame["-Log10P"] = np.log10(data_frame["padj"]) * -1
    data_frame.loc[data_frame["-Log10P"] < 0, "-Log10P"] = 0

    # Filter for significant Up-regulated and Down-regulated genes
    data_frame["Type hit"] = "Under Threshold"
    data_frame.loc[
        (data_frame["log2FoldChange"] > 1) & (data_frame["padj"] < 0.05), "Type hit"
    ] = "Up-regulated over threshold of P < 0.05 \n and log(2) Fold-Change > 1"
    data_frame.loc[
        (data_frame["log2FoldChange"] < -1) & (data_frame["padj"] < 0.05), "Type hit"
    ] = "Down-regulated over threshold of P < 0.05 \n and log(2) Fold-Change > 1"

    # Filter out non-significant genes
    data_frame.sort_values("-Log10P", ascending=False, inplace=True)
    print(data_frame.head())
    data_frame.to_csv(f"{treatment}_differential_expression.csv")

    data_frame.rename(
        columns={
            "log2FoldChange": "Log(2) Fold-Change",
            "-Log10P": "-Log(10) of adjusted P-value",
        },
        inplace=True,
    )

    all_significant = data_frame[
        data_frame["Type hit"] != "Under Threshold"
    ].sort_values("-Log(10) of adjusted P-value", ascending=False)
    filtered_data = all_significant.iloc[:15]

    fig = plt.figure(figsize=(18, 12))

    colour_dict = {
        "Under Threshold": "lightgrey",
        "Up-regulated over threshold of P < 0.05 \n and log(2) Fold-Change > 1": "cornflowerblue",
        "Down-regulated over threshold of P < 0.05 \n and log(2) Fold-Change > 1": "crimson",
    }

    # Make figure
    g = sns.scatterplot(
        data=data_frame,
        x="Log(2) Fold-Change",
        y="-Log(10) of adjusted P-value",
        hue="Type hit",
        alpha=0.6,
        palette=colour_dict,
        legend=True,
        edgecolor="none",
    )

    # Add lines to show thresholds
    ax = plt.gca()
    ax.axhline(y=np.log10(0.05) * -1, lw=2, ls="--", c="grey")
    ax.axvline(x=1, lw=2, ls="--", c="grey")
    ax.axvline(x=-1, lw=2, ls="--", c="grey")
    ax.spines[["right", "top"]].set_visible(False)
    ax.grid(True, alpha=0.2)
    ax.tick_params(axis="both", which="major", labelsize=12)

    texts = []
    for gene, row in filtered_data.iterrows():
        # Check the log2 fold change to determine text position
        if row["Log(2) Fold-Change"] < 1:
            x_offset = 0.05  # Offset to the right
        else:
            x_offset = -0.05  # Offset to the left for better visibility

        texts.append(
            ax.text(
                row["Log(2) Fold-Change"] + x_offset,
                row["-Log(10) of adjusted P-value"] * 1.01,
                f"{gene_mapping[gene]}",
                fontsize=14,
                color="black",
            )
        )

    # Use adjustText to avoid overlapping labels
    adjust_text(
        texts,
        expand=(1.1, 1.1),
        # only_move='y',  # Move only in the y direction
        arrowprops=dict(color="silver", lw=1),
    )

    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    plt.title(f"Volcano plot for {treatment} vs. DMSO", fontsize=18)
    plt.savefig(f"volcano_{treatment}.png")
    plt.close()
    # plt.show()


In [43]:
df = df[~df["Metadata_leiden"].isna()]

In [44]:
group = "Metadata_target"
group = "Metadata_Mode_of_Action"
#group = "Metadata_treatments"
group = "Metadata_leiden"

In [45]:
dmso_leidens = df.loc[df["Metadata_target"] == "DMSO", "Metadata_leiden"].unique()
dmso_leiden_dict = dict([(x, "DMSO") for x in dmso_leidens])
df["Metadata_leiden"] = df["Metadata_leiden"].replace(dmso_leiden_dict)

In [46]:
for group in ["Metadata_target", "Metadata_treatments", "Metadata_leiden"]:
    for treatment in df[group].unique():
        if treatment == "DMSO":
            continue
        df_small = df[df[group].isin([treatment, "DMSO"])].copy()
        summary_df = deseq(df_small, group, "DMSO")
        make_volcano(summary_df, treatment)


(200, 11746)
(200, 2)


Fitting size factors...
... done in 0.07 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 32.38 seconds.

Fitting dispersion trend curve...
... done in 0.33 seconds.

Fitting MAP dispersions...
... done in 29.79 seconds.

Fitting LFCs...
... done in 2.45 seconds.

Calculating cook's distance...
... done in 0.18 seconds.

Replacing 7 outlier genes.

Fitting dispersions...
... done in 0.06 seconds.

Fitting MAP dispersions...
... done in 0.04 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Running Wald tests...
... done in 1.21 seconds.



Log2 fold change & Wald test p-value: Metadata_target lipid other  vs DMSO
                  baseMean  log2FoldChange     lfcSE      stat        pvalue  \
Bcin01g00010     24.830269       -2.251783  0.307282 -7.328071  2.334896e-13   
Bcin01g00020     15.499798       -1.929769  0.326937 -5.902577  3.578668e-09   
Bcin01g00030     81.027390       -2.625814  0.347506 -7.556165  4.151259e-14   
Bcin01g00040     27.295532       -2.583620  0.336919 -7.668359  1.742103e-14   
Bcin01g00050     14.464725       -1.373388  0.233373 -5.884955  3.981615e-09   
...                    ...             ...       ...       ...           ...   
ENSRNA049513861   0.044430        0.331456  4.959641  0.066831  9.467165e-01   
ENSRNA049513883   0.347546       -0.283505  0.516350 -0.549056  5.829667e-01   
ENSRNA049514038   0.215926       -0.612377  0.639439 -0.957678  3.382250e-01   
ENSRNA049514141   0.104523        0.203421  1.717707  0.118426  9.057304e-01   
ENSRNA049514255   0.220159       -0.450296  0

Fitting size factors...
... done in 0.09 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 46.74 seconds.

Fitting dispersion trend curve...
... done in 0.37 seconds.

Fitting MAP dispersions...
... done in 43.34 seconds.

Fitting LFCs...
... done in 2.89 seconds.

Calculating cook's distance...
... done in 0.28 seconds.

Replacing 26 outlier genes.

Fitting dispersions...
... done in 0.14 seconds.

Fitting MAP dispersions...
... done in 0.09 seconds.

Fitting LFCs...
... done in 0.03 seconds.

Running Wald tests...
... done in 1.15 seconds.



Log2 fold change & Wald test p-value: Metadata_target lipid SBI  vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat    pvalue  \
Bcin01g00010      67.949715        0.879477  0.296370  2.967495  0.003002   
Bcin01g00020      43.114182        0.828876  0.286293  2.895199  0.003789   
Bcin01g00030     211.640091        0.641467  0.280848  2.284040  0.022369   
Bcin01g00040      70.952797        0.698617  0.284455  2.455980  0.014050   
Bcin01g00050      28.541906        0.575963  0.234457  2.456582  0.014027   
...                     ...             ...       ...       ...       ...   
ENSRNA049513861    0.030497        0.267874  4.465506  0.059987  0.952166   
ENSRNA049513883    0.430430        0.052329  0.326812  0.160120  0.872787   
ENSRNA049514038    0.369934        0.287693  0.400914  0.717593  0.473008   
ENSRNA049514141    0.205839        0.344642  0.766075  0.449880  0.652797   
ENSRNA049514255    0.279365       -0.123719  0.485880 -0.254628  0.799011   

  

Fitting size factors...
... done in 0.05 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 23.24 seconds.

Fitting dispersion trend curve...
... done in 0.28 seconds.

Fitting MAP dispersions...
... done in 22.16 seconds.

Fitting LFCs...
... done in 2.24 seconds.

Calculating cook's distance...
... done in 0.10 seconds.

Replacing 7 outlier genes.

Fitting dispersions...
... done in 0.04 seconds.

Fitting MAP dispersions...
... done in 0.03 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Running Wald tests...
... done in 1.25 seconds.

  result = getattr(ufunc, method)(*inputs, **kwargs)


Log2 fold change & Wald test p-value: Metadata_target prot metabolism  vs DMSO
                  baseMean  log2FoldChange     lfcSE       stat        pvalue  \
Bcin01g00010     16.631866       -2.962161  0.220871 -13.411243  5.196066e-41   
Bcin01g00020     22.143028        0.448576  0.253305   1.770891  7.657876e-02   
Bcin01g00030     61.338985       -3.265744  0.238265 -13.706359  9.300982e-43   
Bcin01g00040     21.362955       -3.349372  0.267873 -12.503600  7.134529e-36   
Bcin01g00050     11.071600       -1.419013  0.214069  -6.628770  3.384945e-11   
...                    ...             ...       ...        ...           ...   
ENSRNA049513861   0.049370        1.888931  4.295027   0.439795  6.600857e-01   
ENSRNA049513883   0.955641        2.306129  0.541292   4.260413  2.040496e-05   
ENSRNA049514038   0.228123        0.567258  1.021743   0.555187  5.787667e-01   
ENSRNA049514141   0.167177        1.632212  2.538605   0.642956  5.202526e-01   
ENSRNA049514255   1.474402    

posx and posy should be finite values
posx and posy should be finite values
  return points[1, 1] - points[0, 1]
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values


(255, 11754)
(255, 2)


Fitting size factors...
... done in 0.09 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 38.25 seconds.

Fitting dispersion trend curve...
... done in 0.34 seconds.

Fitting MAP dispersions...
... done in 37.13 seconds.

Fitting LFCs...
... done in 2.49 seconds.

Calculating cook's distance...
... done in 0.21 seconds.

Replacing 43 outlier genes.

Fitting dispersions...
... done in 0.19 seconds.

Fitting MAP dispersions...
... done in 0.15 seconds.

Fitting LFCs...
... done in 0.02 seconds.

Running Wald tests...
... done in 1.23 seconds.



Log2 fold change & Wald test p-value: Metadata_target respiration  vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat    pvalue  \
Bcin01g00010      69.545990        0.334474  0.278560  1.200726  0.229858   
Bcin01g00020      39.887661        0.457207  0.257903  1.772785  0.076264   
Bcin01g00030     220.220932        0.237123  0.269070  0.881269  0.378172   
Bcin01g00040      68.967293        0.253373  0.274167  0.924155  0.355405   
Bcin01g00050      30.632471        0.486086  0.206328  2.355891  0.018478   
...                     ...             ...       ...       ...       ...   
ENSRNA049513861    0.032574        0.234466  4.074557  0.057544  0.954112   
ENSRNA049513883    0.564496        0.337406  0.320189  1.053771  0.291988   
ENSRNA049514038    0.194776       -0.574689  0.550264 -1.044388  0.296306   
ENSRNA049514141    0.105885        0.195382  0.978977  0.199577  0.841811   
ENSRNA049514255    0.226951        0.012547  0.509440  0.024629  0.980351   



Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 20.87 seconds.

Fitting dispersion trend curve...
... done in 0.39 seconds.

Fitting MAP dispersions...
... done in 19.07 seconds.

Fitting LFCs...
... done in 2.32 seconds.

Calculating cook's distance...
... done in 0.06 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.18 seconds.



Log2 fold change & Wald test p-value: Metadata_target IMPDH vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat    pvalue  \
Bcin01g00010      40.512295       -0.100411  0.270379 -0.371373  0.710360   
Bcin01g00020      30.553429        0.194323  0.269621  0.720726  0.471078   
Bcin01g00030     153.941910       -0.333775  0.262643 -1.270831  0.203789   
Bcin01g00040      57.920860       -0.128718  0.265800 -0.484264  0.628198   
Bcin01g00050      22.829159       -0.047238  0.212140 -0.222671  0.823791   
...                     ...             ...       ...       ...       ...   
ENSRNA049513861    0.057758        1.125603  3.555680  0.316565  0.751574   
ENSRNA049513883    0.404849        0.608123  0.595871  1.020563  0.307462   
ENSRNA049514038    0.204051        0.708808  1.004354  0.705735  0.480353   
ENSRNA049514141    0.129058        1.267291  1.875954  0.675545  0.499330   
ENSRNA049514255    0.176431        0.427736  1.097137  0.389865  0.696636   

       

Fitting size factors...
... done in 0.05 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 23.65 seconds.

Fitting dispersion trend curve...
... done in 0.32 seconds.

Fitting MAP dispersions...
... done in 23.66 seconds.

Fitting LFCs...
... done in 2.17 seconds.

Calculating cook's distance...
... done in 0.09 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.22 seconds.



Log2 fold change & Wald test p-value: Metadata_target cell wal  vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat    pvalue  \
Bcin01g00010      43.905602       -0.341886  0.232916 -1.467853  0.142144   
Bcin01g00020      33.583134       -0.178443  0.223243 -0.799321  0.424104   
Bcin01g00030     188.741241       -0.273665  0.245187 -1.116146  0.264360   
Bcin01g00040      71.145363       -0.216734  0.245747 -0.881940  0.377809   
Bcin01g00050      28.406347       -0.233789  0.196688 -1.188628  0.234586   
...                     ...             ...       ...       ...       ...   
ENSRNA049513861    0.056467       -0.238441  3.924049 -0.060764  0.951547   
ENSRNA049513883    0.550826       -0.156983  0.468950 -0.334754  0.737810   
ENSRNA049514038    0.252265        0.313915  0.871099  0.360367  0.718573   
ENSRNA049514141    0.147235       -0.027341  1.959475 -0.013953  0.988867   
ENSRNA049514255    0.197416       -0.586398  0.947954 -0.618594  0.536184   

   

Fitting size factors...
... done in 0.04 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 22.01 seconds.

Fitting dispersion trend curve...
... done in 0.38 seconds.

Fitting MAP dispersions...
... done in 20.18 seconds.

Fitting LFCs...
... done in 2.23 seconds.

Calculating cook's distance...
... done in 0.07 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.22 seconds.



Log2 fold change & Wald test p-value: Metadata_target Cell wall  vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat    pvalue  \
Bcin01g00010      48.671753        0.188198  0.202956  0.927285  0.353778   
Bcin01g00020      41.225513        0.553291  0.207895  2.661399  0.007782   
Bcin01g00030     220.398911        0.450815  0.194781  2.314469  0.020642   
Bcin01g00040      78.731827        0.384636  0.181867  2.114932  0.034436   
Bcin01g00050      30.633778        0.212699  0.167520  1.269694  0.204194   
...                     ...             ...       ...       ...       ...   
ENSRNA049513861    0.063577       -0.145355  3.918647 -0.037093  0.970411   
ENSRNA049513883    0.534639        0.150656  0.485173  0.310521  0.756165   
ENSRNA049514038    0.338306        0.697529  0.827624  0.842809  0.399335   
ENSRNA049514141    0.163462        0.137426  1.823251  0.075374  0.939917   
ENSRNA049514255    0.203255       -0.499966  1.016346 -0.491925  0.622773   

  

Fitting size factors...
... done in 0.05 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 30.26 seconds.

Fitting dispersion trend curve...
... done in 0.38 seconds.

Fitting MAP dispersions...
... done in 26.64 seconds.

Fitting LFCs...
... done in 2.83 seconds.

Calculating cook's distance...
... done in 0.14 seconds.

Replacing 6 outlier genes.

Fitting dispersions...
... done in 0.04 seconds.

Fitting MAP dispersions...
... done in 0.03 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Running Wald tests...
... done in 1.21 seconds.



Log2 fold change & Wald test p-value: Metadata_target cytoskeleton  vs DMSO
                  baseMean  log2FoldChange     lfcSE      stat    pvalue  \
Bcin01g00010     24.576702       -1.296579  0.343983 -3.769311  0.000164   
Bcin01g00020     18.323969       -1.104318  0.325900 -3.388519  0.000703   
Bcin01g00030     95.694176       -1.422698  0.351712 -4.045069  0.000052   
Bcin01g00040     36.470383       -1.304039  0.351841 -3.706333  0.000210   
Bcin01g00050     16.811169       -0.724881  0.197205 -3.675767  0.000237   
...                    ...             ...       ...       ...       ...   
ENSRNA049513861   0.038263        0.421258  4.261995  0.098841  0.921265   
ENSRNA049513883   0.396675        0.232981  0.475203  0.490278  0.623937   
ENSRNA049514038   0.342001        1.482361  0.680178  2.179373  0.029304   
ENSRNA049514141   0.232137        1.208773  0.772575  1.564603  0.117676   
ENSRNA049514255   0.107718       -0.011682  1.072294 -0.010895  0.991308   

           

Fitting size factors...
... done in 0.04 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 26.50 seconds.

Fitting dispersion trend curve...
... done in 0.34 seconds.

Fitting MAP dispersions...
... done in 23.97 seconds.

Fitting LFCs...
... done in 2.38 seconds.

Calculating cook's distance...
... done in 0.10 seconds.

Replacing 6 outlier genes.

Fitting dispersions...
... done in 0.03 seconds.

Fitting MAP dispersions...
... done in 0.04 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Running Wald tests...
... done in 1.23 seconds.



Log2 fold change & Wald test p-value: Metadata_target DHODH vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat        pvalue  \
Bcin01g00010      33.369751       -1.465445  0.229400 -6.388154  1.679000e-10   
Bcin01g00020      23.667122       -1.203882  0.233116 -5.164303  2.413368e-07   
Bcin01g00030     128.198044       -1.529432  0.240748 -6.352838  2.113785e-10   
Bcin01g00040      46.041630       -1.619297  0.240295 -6.738784  1.597172e-11   
Bcin01g00050      21.880724       -0.794759  0.171965 -4.621638  3.807225e-06   
...                     ...             ...       ...       ...           ...   
ENSRNA049513861    0.049590        0.107845  4.003591  0.026937  9.785099e-01   
ENSRNA049513883    0.393442       -0.307641  0.479704 -0.641314  5.213188e-01   
ENSRNA049514038    0.257032        0.626508  0.847931  0.738866  4.599881e-01   
ENSRNA049514141    0.161565        0.579184  0.957761  0.604727  5.453603e-01   
ENSRNA049514255    0.164251       -0.2421

Fitting size factors...
... done in 0.06 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 29.54 seconds.

Fitting dispersion trend curve...
... done in 0.40 seconds.

Fitting MAP dispersions...
... done in 27.62 seconds.

Fitting LFCs...
... done in 2.37 seconds.

Calculating cook's distance...
... done in 0.14 seconds.

Replacing 30 outlier genes.

Fitting dispersions...
... done in 0.08 seconds.

Fitting MAP dispersions...
... done in 0.11 seconds.

Fitting LFCs...
... done in 0.02 seconds.

Running Wald tests...
... done in 1.20 seconds.



Log2 fold change & Wald test p-value: Metadata_target GPI vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat        pvalue  \
Bcin01g00010      92.695740        1.694896  0.265265  6.389435  1.665003e-10   
Bcin01g00020      72.386002        1.785965  0.246977  7.231315  4.783379e-13   
Bcin01g00030     341.849121        1.473305  0.223381  6.595497  4.238346e-11   
Bcin01g00040     124.292480        1.459724  0.247340  5.901689  3.597995e-09   
Bcin01g00050      50.324451        1.442963  0.218185  6.613481  3.753856e-11   
...                     ...             ...       ...       ...           ...   
ENSRNA049513861    0.041646        0.246783  4.342075  0.056835  9.546764e-01   
ENSRNA049513883    0.584550        0.437885  0.396159  1.105327  2.690177e-01   
ENSRNA049514038    0.328520        1.212113  0.624237  1.941751  5.216722e-02   
ENSRNA049514141    0.155371        0.546217  0.984166  0.555005  5.788911e-01   
ENSRNA049514255    0.272433        0.754479

Fitting size factors...
... done in 0.04 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 21.84 seconds.

Fitting dispersion trend curve...
... done in 0.46 seconds.

Fitting MAP dispersions...
... done in 20.78 seconds.

Fitting LFCs...
... done in 2.27 seconds.

Calculating cook's distance...
... done in 0.08 seconds.

Replacing 1 outlier genes.

Fitting dispersions...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Running Wald tests...
... done in 1.26 seconds.



Log2 fold change & Wald test p-value: Metadata_target HDAC vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat        pvalue  \
Bcin01g00010      25.633125       -1.827829  0.251885 -7.256614  3.969003e-13   
Bcin01g00020      19.758617       -1.468987  0.254313 -5.776303  7.635985e-09   
Bcin01g00030     101.310417       -2.321982  0.258237 -8.991677  2.434868e-19   
Bcin01g00040      36.823555       -2.210744  0.268229 -8.242006  1.693467e-16   
Bcin01g00050      17.629440       -1.172277  0.213439 -5.492319  3.966897e-08   
...                     ...             ...       ...       ...           ...   
ENSRNA049513861    0.043816        0.896928  3.927959  0.228345  8.193784e-01   
ENSRNA049513883    1.054442        0.459175  0.698112  0.657739  5.107061e-01   
ENSRNA049514038    0.361117        2.278762  0.877750  2.596140  9.427767e-03   
ENSRNA049514141    0.253538        0.909439  3.337276  0.272509  7.852304e-01   
ENSRNA049514255    0.143633        0.89985

Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 22.91 seconds.

Fitting dispersion trend curve...
... done in 0.41 seconds.

Fitting MAP dispersions...
... done in 21.08 seconds.

Fitting LFCs...
... done in 2.66 seconds.

Calculating cook's distance...
... done in 0.08 seconds.

Replacing 17 outlier genes.

Fitting dispersions...
... done in 0.03 seconds.

Fitting MAP dispersions...
... done in 0.05 seconds.

Fitting LFCs...
... done in 0.03 seconds.

Running Wald tests...
... done in 1.24 seconds.



Log2 fold change & Wald test p-value: Metadata_target HOG vs DMSO
                  baseMean  log2FoldChange     lfcSE      stat    pvalue  \
Bcin01g00010     15.540810       -0.128263  0.276844 -0.463304  0.643146   
Bcin01g00020     11.707269       -0.079172  0.286508 -0.276334  0.782292   
Bcin01g00030     59.017170       -0.577087  0.257079 -2.244780  0.024782   
Bcin01g00040     22.916466       -0.526460  0.274393 -1.918635  0.055031   
Bcin01g00050     10.734890        0.049900  0.244579  0.204025  0.838334   
...                    ...             ...       ...       ...       ...   
ENSRNA049513861   0.037536        3.967910  3.939424  1.007231  0.313824   
ENSRNA049513883   0.196723        2.022989  0.657090  3.078709  0.002079   
ENSRNA049514038   0.115538        3.560255  1.152829  3.088277  0.002013   
ENSRNA049514141   0.323182        3.632094  3.541513  1.025577  0.305091   
ENSRNA049514255   0.079593        3.005960  1.160980  2.589157  0.009621   

                     

Fitting size factors...
... done in 0.05 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 27.13 seconds.

Fitting dispersion trend curve...
... done in 0.27 seconds.

Fitting MAP dispersions...
... done in 24.75 seconds.

Fitting LFCs...
... done in 2.27 seconds.

Calculating cook's distance...
... done in 0.10 seconds.

Replacing 1 outlier genes.

Fitting dispersions...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Running Wald tests...
... done in 1.32 seconds.



Log2 fold change & Wald test p-value: Metadata_target amino acid  vs DMSO
                  baseMean  log2FoldChange     lfcSE      stat        pvalue  \
Bcin01g00010     22.561056       -1.635648  0.319290 -5.122762  3.010921e-07   
Bcin01g00020     15.925217       -1.323806  0.283749 -4.665413  3.079982e-06   
Bcin01g00030     88.985519       -1.663859  0.318008 -5.232131  1.675666e-07   
Bcin01g00040     30.744354       -1.509181  0.323829 -4.660428  3.155531e-06   
Bcin01g00050     16.714336       -0.906204  0.215457 -4.205965  2.599708e-05   
...                    ...             ...       ...       ...           ...   
ENSRNA049513861   0.038673        0.404892  4.455341  0.090878  9.275896e-01   
ENSRNA049513883   0.812979        0.122673  0.509416  0.240811  8.097014e-01   
ENSRNA049514038   0.227493        0.099900  0.815930  0.122437  9.025531e-01   
ENSRNA049514141   0.212830        0.526485  1.195216  0.440494  6.595794e-01   
ENSRNA049514255   0.132210       -0.093975  1.

Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 21.57 seconds.

Fitting dispersion trend curve...
... done in 0.32 seconds.

Fitting MAP dispersions...
... done in 19.19 seconds.

Fitting LFCs...
... done in 2.37 seconds.

Calculating cook's distance...
... done in 0.07 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.22 seconds.



Log2 fold change & Wald test p-value: Metadata_target ATP proton pump  vs DMSO
                   baseMean  log2FoldChange     lfcSE       stat  \
Bcin01g00010      99.664558        3.030281  0.296825  10.208990   
Bcin01g00020      72.789970        3.034250  0.303237  10.006197   
Bcin01g00030     364.085175        2.865329  0.275572  10.397735   
Bcin01g00040     143.299606        3.250796  0.309299  10.510210   
Bcin01g00050      51.246094        2.701767  0.265073  10.192542   
...                     ...             ...       ...        ...   
ENSRNA049513883    0.507166        0.621095  0.596956   1.040437   
ENSRNA049514038    0.465488        1.319338  0.769382   1.714802   
ENSRNA049514141    0.198266        0.428148  2.212733   0.193493   
ENSRNA049514212    0.006145        0.607783  4.307754   0.141090   
ENSRNA049514255    0.170313       -0.389977  1.425904  -0.273495   

                       pvalue          padj  
Bcin01g00010     1.807371e-24  5.259451e-22  
Bcin01g00020

Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 21.89 seconds.

Fitting dispersion trend curve...
... done in 0.34 seconds.

Fitting MAP dispersions...
... done in 19.05 seconds.

Fitting LFCs...
... done in 2.37 seconds.

Calculating cook's distance...
... done in 0.07 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.27 seconds.



Log2 fold change & Wald test p-value: Metadata_target GPI  vs DMSO
                   baseMean  log2FoldChange     lfcSE       stat  \
Bcin01g00010      90.611389        3.215010  0.239797  13.407198   
Bcin01g00020      67.490135        3.216112  0.256789  12.524351   
Bcin01g00030     346.105133        3.047380  0.234657  12.986527   
Bcin01g00040     133.283447        3.322814  0.235391  14.116156   
Bcin01g00050      47.119049        2.836537  0.214422  13.228732   
...                     ...             ...       ...        ...   
ENSRNA049513861    0.067227        1.057824  4.351015   0.243121   
ENSRNA049513883    0.477996        0.765847  0.619380   1.236474   
ENSRNA049514038    0.292437        0.416070  1.065299   0.390566   
ENSRNA049514141    0.167294        1.210366  1.963421   0.616458   
ENSRNA049514255    0.424937        1.065427  1.032075   1.032316   

                       pvalue          padj  
Bcin01g00010     5.487340e-41  1.184818e-38  
Bcin01g00020     5.49390

Fitting size factors...
... done in 0.05 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 24.24 seconds.

Fitting dispersion trend curve...
... done in 0.40 seconds.

Fitting MAP dispersions...
... done in 24.10 seconds.

Fitting LFCs...
... done in 2.30 seconds.

Calculating cook's distance...
... done in 0.09 seconds.

Replacing 28 outlier genes.

Fitting dispersions...
... done in 0.07 seconds.

Fitting MAP dispersions...
... done in 0.06 seconds.

Fitting LFCs...
... done in 0.02 seconds.

Running Wald tests...
... done in 1.18 seconds.



Log2 fold change & Wald test p-value: Metadata_target proteins  vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat        pvalue  \
Bcin01g00010     101.071632        2.272039  0.430732  5.274825  1.328826e-07   
Bcin01g00020      70.453117        2.106042  0.433847  4.854339  1.207890e-06   
Bcin01g00030     333.708435        1.876680  0.403403  4.652120  3.285397e-06   
Bcin01g00040     119.651634        2.032773  0.420296  4.836527  1.321275e-06   
Bcin01g00050      50.356564        1.856028  0.390778  4.749575  2.038446e-06   
...                     ...             ...       ...       ...           ...   
ENSRNA049513861    0.050460       -0.174770  4.301723 -0.040628  9.675925e-01   
ENSRNA049513883    0.473560       -0.209671  0.503481 -0.416442  6.770867e-01   
ENSRNA049514038    0.309974       -0.030248  0.668309 -0.045261  9.638994e-01   
ENSRNA049514141    0.179082        0.036948  1.468898  0.025154  9.799323e-01   
ENSRNA049514255    0.247826        0.

Fitting size factors...
... done in 0.04 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 22.12 seconds.

Fitting dispersion trend curve...
... done in 0.33 seconds.

Fitting MAP dispersions...
... done in 20.58 seconds.

Fitting LFCs...
... done in 2.07 seconds.

Calculating cook's distance...
... done in 0.07 seconds.

Replacing 3 outlier genes.

Fitting dispersions...
... done in 0.03 seconds.

Fitting MAP dispersions...
... done in 0.02 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Running Wald tests...
... done in 1.32 seconds.



Log2 fold change & Wald test p-value: Metadata_target Multi vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat        pvalue  \
Bcin01g00010      59.532852        1.354684  0.244044  5.550976  2.840794e-08   
Bcin01g00020      48.326462        1.652089  0.250378  6.598368  4.157077e-11   
Bcin01g00030     248.008163        1.452298  0.240525  6.038038  1.559997e-09   
Bcin01g00040      88.203705        1.587878  0.243337  6.525426  6.780867e-11   
Bcin01g00050      37.954979        1.443099  0.220508  6.544421  5.972646e-11   
...                     ...             ...       ...       ...           ...   
ENSRNA049513861    0.066802        0.003194  4.203390  0.000760  9.993937e-01   
ENSRNA049513883    0.567927        0.634072  0.455015  1.393519  1.634628e-01   
ENSRNA049514038    0.239567       -0.409030  0.853227 -0.479392  6.316599e-01   
ENSRNA049514141    0.136935       -0.191360  2.225112 -0.086000  9.314662e-01   
ENSRNA049514255    0.280231        0.3353

Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 18.25 seconds.

Fitting dispersion trend curve...
... done in 0.27 seconds.

Fitting MAP dispersions...
... done in 16.59 seconds.

Fitting LFCs...
... done in 1.87 seconds.

Calculating cook's distance...
... done in 0.05 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.22 seconds.



Log2 fold change & Wald test p-value: Metadata_treatments BCS-AK75639 vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat        pvalue  \
Bcin01g00010      50.238426       -2.242940  0.376401 -5.958914  2.539194e-09   
Bcin01g00020      35.899536       -1.549285  0.428173 -3.618357  2.964787e-04   
Bcin01g00030     194.596222       -2.069022  0.393723 -5.255022  1.480070e-07   
Bcin01g00040      68.095604       -2.025985  0.404054 -5.014148  5.326891e-07   
Bcin01g00050      28.065269       -1.127219  0.334306 -3.371816  7.467443e-04   
...                     ...             ...       ...       ...           ...   
ENSRNA049513861    0.120283       -0.092657  6.066298 -0.015274  9.878135e-01   
ENSRNA049513883    0.586549       -0.910195  1.390516 -0.654573  5.127425e-01   
ENSRNA049514038    0.378312       -1.545692  1.857643 -0.832071  4.053687e-01   
ENSRNA049514141    0.217103       -0.378974  6.061322 -0.062523  9.501461e-01   
ENSRNA049514255    0.361215    

Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 18.51 seconds.

Fitting dispersion trend curve...
... done in 0.26 seconds.

Fitting MAP dispersions...
... done in 16.70 seconds.

Fitting LFCs...
... done in 1.88 seconds.

Calculating cook's distance...
... done in 0.05 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.25 seconds.



Log2 fold change & Wald test p-value: Metadata_treatments BCS-CZ69404 vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat        pvalue  \
Bcin01g00010      49.176617       -3.109085  0.424392 -7.325982  2.371561e-13   
Bcin01g00020      34.967892       -3.040081  0.526623 -5.772784  7.797246e-09   
Bcin01g00030     189.267227       -4.202197  0.432577 -9.714341  2.619391e-22   
Bcin01g00040      66.523430       -3.767485  0.490191 -7.685752  1.521005e-14   
Bcin01g00050      27.366066       -2.077472  0.393788 -5.275609  1.323159e-07   
...                     ...             ...       ...       ...           ...   
ENSRNA049513861    0.119086        0.081967  6.055696  0.013535  9.892006e-01   
ENSRNA049513883    0.600375       -0.202538  1.206958 -0.167809  8.667337e-01   
ENSRNA049514038    0.374821       -1.390065  1.915867 -0.725554  4.681124e-01   
ENSRNA049514141    0.215119       -0.206459  6.057931 -0.034081  9.728127e-01   
ENSRNA049514255    0.301358    

Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 18.30 seconds.

Fitting dispersion trend curve...
... done in 0.29 seconds.

Fitting MAP dispersions...
... done in 16.65 seconds.

Fitting LFCs...
... done in 1.85 seconds.

Calculating cook's distance...
... done in 0.05 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.25 seconds.

  result = getattr(ufunc, method)(*inputs, **kwargs)


Log2 fold change & Wald test p-value: Metadata_treatments BCS-DG83654 vs DMSO
                   baseMean  log2FoldChange     lfcSE       stat  \
Bcin01g00010      49.344002       -3.367699  0.432389  -7.788594   
Bcin01g00020      35.125645       -3.089112  0.525047  -5.883502   
Bcin01g00030     190.001282       -4.175914  0.407749 -10.241385   
Bcin01g00040      66.756523       -3.953671  0.505907  -7.815013   
Bcin01g00050      27.437544       -2.153494  0.397270  -5.420734   
...                     ...             ...       ...        ...   
ENSRNA049513861    0.119562       -0.002911  6.057105  -0.000481   
ENSRNA049513883    0.584964       -0.858598  1.269915  -0.676107   
ENSRNA049514038    0.392812       -0.890349  1.681336  -0.529548   
ENSRNA049514141    0.228483        0.382353  5.989124   0.063841   
ENSRNA049514255    0.338303       -0.180180  1.635024  -0.110201   

                       pvalue          padj  
Bcin01g00010     6.775878e-15  1.882089e-13  
Bcin01g00020 

posx and posy should be finite values
  return points[1, 1] - points[0, 1]
posx and posy should be finite values
posx and posy should be finite values


(65, 11688)
(65, 2)


Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 18.29 seconds.

Fitting dispersion trend curve...
... done in 0.29 seconds.

Fitting MAP dispersions...
... done in 16.66 seconds.

Fitting LFCs...
... done in 1.88 seconds.

Calculating cook's distance...
... done in 0.05 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.21 seconds.



Log2 fold change & Wald test p-value: Metadata_treatments BCS-AA31476 vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat        pvalue  \
Bcin01g00010      51.064087       -1.925452  0.467364 -4.119810  3.791852e-05   
Bcin01g00020      35.075336       -1.839030  0.479902 -3.832096  1.270561e-04   
Bcin01g00030     192.586182       -2.356082  0.444269 -5.303281  1.137399e-07   
Bcin01g00040      67.642944       -1.939674  0.468712 -4.138309  3.498755e-05   
Bcin01g00050      27.392111       -1.535696  0.407380 -3.769691  1.634496e-04   
...                     ...             ...       ...       ...           ...   
ENSRNA049513861    0.118204        0.207479  6.056922  0.034255  9.726740e-01   
ENSRNA049513883    0.619625        0.648729  0.917075  0.707389  4.793248e-01   
ENSRNA049514038    0.372353       -1.264445  1.921486 -0.658056  5.105022e-01   
ENSRNA049514141    0.213493       -0.080656  6.053800 -0.013323  9.893699e-01   
ENSRNA049514255    0.326530    

Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 18.29 seconds.

Fitting dispersion trend curve...
... done in 0.29 seconds.

Fitting MAP dispersions...
... done in 16.41 seconds.

Fitting LFCs...
... done in 1.85 seconds.

Calculating cook's distance...
... done in 0.05 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.17 seconds.



Log2 fold change & Wald test p-value: Metadata_treatments BCS-AI81874 vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat        pvalue  \
Bcin01g00010      47.305325       -3.561433  0.525879 -6.772345  1.267115e-11   
Bcin01g00020      33.212162       -3.246106  0.599679 -5.413075  6.195165e-08   
Bcin01g00030     183.418854       -2.915262  0.462682 -6.300790  2.961328e-10   
Bcin01g00040      65.181442       -2.449575  0.513786 -4.767692  1.863486e-06   
Bcin01g00050      26.517929       -1.576800  0.409895 -3.846840  1.196511e-04   
...                     ...             ...       ...       ...           ...   
ENSRNA049513861    0.113554        1.033085  6.107289  0.169156  8.656739e-01   
ENSRNA049513883    0.551105        0.101451  1.162962  0.087235  9.304846e-01   
ENSRNA049514038    0.365434        0.100975  1.614897  0.062527  9.501432e-01   
ENSRNA049514141    0.204780        0.823965  6.102291  0.135025  8.925917e-01   
ENSRNA049514255    0.310883    

Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 18.25 seconds.

Fitting dispersion trend curve...
... done in 0.32 seconds.

Fitting MAP dispersions...
... done in 16.41 seconds.

Fitting LFCs...
... done in 1.86 seconds.

Calculating cook's distance...
... done in 0.05 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.40 seconds.



Log2 fold change & Wald test p-value: Metadata_treatments BCS-AD77064 vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat    pvalue  \
Bcin01g00010      52.156540       -0.583559  0.383094 -1.523277  0.127689   
Bcin01g00020      36.865421       -0.340527  0.436996 -0.779245  0.435836   
Bcin01g00030     199.188889       -0.688582  0.395163 -1.742528  0.081416   
Bcin01g00040      69.769783       -0.607940  0.410914 -1.479480  0.139012   
Bcin01g00050      28.736700       -0.242483  0.346663 -0.699479  0.484253   
...                     ...             ...       ...       ...       ...   
ENSRNA049513861    0.119079        0.037050  6.060235  0.006114  0.995122   
ENSRNA049513883    0.600106       -0.292354  1.060800 -0.275598  0.782857   
ENSRNA049514038    0.375831       -1.440003  1.918129 -0.750733  0.452813   
ENSRNA049514141    0.215329       -0.246537  6.052625 -0.040732  0.967509   
ENSRNA049514255    0.333143       -0.631095  1.895225 -0.332992  0.739140  

Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 18.37 seconds.

Fitting dispersion trend curve...
... done in 0.32 seconds.

Fitting MAP dispersions...
... done in 16.45 seconds.

Fitting LFCs...
... done in 1.86 seconds.

Calculating cook's distance...
... done in 0.05 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.24 seconds.



Log2 fold change & Wald test p-value: Metadata_treatments BCS-CX66720 vs DMSO
                   baseMean  log2FoldChange     lfcSE       stat  \
Bcin01g00010      94.399063        3.296880  0.323962  10.176747   
Bcin01g00020      58.180901        3.302034  0.371923   8.878284   
Bcin01g00030     302.581116        3.059050  0.356760   8.574540   
Bcin01g00040     104.219513        3.094244  0.363115   8.521385   
Bcin01g00050      39.696167        2.687734  0.292704   9.182422   
...                     ...             ...       ...        ...   
ENSRNA049513861    0.118491        0.123154  6.061476   0.020318   
ENSRNA049513883    0.581623       -0.703399  1.306136  -0.538534   
ENSRNA049514038    0.425594       -0.305899  1.493992  -0.204753   
ENSRNA049514141    0.214231       -0.160565  6.062758  -0.026484   
ENSRNA049514255    0.317102       -0.063571  1.689185  -0.037634   

                       pvalue          padj  
Bcin01g00010     2.518389e-24  1.913394e-22  
Bcin01g00020 

Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 18.62 seconds.

Fitting dispersion trend curve...
... done in 0.30 seconds.

Fitting MAP dispersions...
... done in 17.18 seconds.

Fitting LFCs...
... done in 2.05 seconds.

Calculating cook's distance...
... done in 0.05 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.12 seconds.

  result = getattr(ufunc, method)(*inputs, **kwargs)


Log2 fold change & Wald test p-value: Metadata_treatments BCS-CU93685 vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat        pvalue  \
Bcin01g00010      41.652573       -2.171865  0.335434 -6.474797  9.493929e-11   
Bcin01g00020      35.234482        0.739029  0.344099  2.147724  3.173568e-02   
Bcin01g00030     159.004623       -2.595106  0.356356 -7.282343  3.280720e-13   
Bcin01g00040      55.769630       -2.527636  0.371344 -6.806727  9.984422e-12   
Bcin01g00050      23.201717       -1.514430  0.341370 -4.436327  9.150689e-06   
...                     ...             ...       ...       ...           ...   
ENSRNA049513861    0.098036        1.409709  4.856148  0.290294  7.715916e-01   
ENSRNA049513883    0.889162        2.465560  0.718996  3.429172  6.054259e-04   
ENSRNA049514038    0.348136        0.438006  1.361067  0.321811  7.475962e-01   
ENSRNA049514141    0.177129        1.156163  4.850098  0.238379  8.115870e-01   
ENSRNA049514255    0.892011    

posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
  return points[1, 1] - points[0, 1]
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy

(65, 11691)
(65, 2)


Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 18.39 seconds.

Fitting dispersion trend curve...
... done in 0.33 seconds.

Fitting MAP dispersions...
... done in 16.53 seconds.

Fitting LFCs...
... done in 2.01 seconds.

Calculating cook's distance...
... done in 0.05 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.19 seconds.



Log2 fold change & Wald test p-value: Metadata_treatments BCS-DA15669 vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat        pvalue  \
Bcin01g00010      43.377655       -2.763178  0.597368 -4.625588  3.735373e-06   
Bcin01g00020      35.028904        1.047080  0.429108  2.440133  1.468184e-02   
Bcin01g00030     167.518219       -3.416897  0.517156 -6.607091  3.919459e-11   
Bcin01g00040      59.317093       -2.940901  0.662184 -4.441213  8.945304e-06   
Bcin01g00050      24.674110       -0.673236  0.480842 -1.400118  1.614779e-01   
...                     ...             ...       ...       ...           ...   
ENSRNA049513861    0.104820        2.516481  6.102861  0.412345  6.800869e-01   
ENSRNA049513883    0.586769        2.164817  1.255418  1.724380  8.463933e-02   
ENSRNA049514038    0.568665        1.650167  1.697677  0.972015  3.310433e-01   
ENSRNA049514141    0.189420        2.290154  6.079601  0.376695  7.064004e-01   
ENSRNA049514255    0.725976    

Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 18.29 seconds.

Fitting dispersion trend curve...
... done in 0.26 seconds.

Fitting MAP dispersions...
... done in 16.73 seconds.

Fitting LFCs...
... done in 1.96 seconds.

Calculating cook's distance...
... done in 0.05 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.23 seconds.



Log2 fold change & Wald test p-value: Metadata_treatments BCS-AU85103 vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat        pvalue  \
Bcin01g00010      50.532028       -1.842177  0.400101 -4.604282  4.138913e-06   
Bcin01g00020      35.319958       -2.002517  0.475173 -4.214291  2.505644e-05   
Bcin01g00030     193.389191       -1.804058  0.380469 -4.741669  2.119652e-06   
Bcin01g00040      67.255539       -2.280073  0.439494 -5.187948  2.126239e-07   
Bcin01g00050      27.868528       -0.984964  0.340621 -2.891674  3.831951e-03   
...                     ...             ...       ...       ...           ...   
ENSRNA049513861    0.119032        0.040308  6.164412  0.006539  9.947828e-01   
ENSRNA049513883    0.622146        1.083337  0.649345  1.668353  9.524566e-02   
ENSRNA049514038    0.393972       -0.361843  1.516789 -0.238558  8.114480e-01   
ENSRNA049514141    0.215223       -0.270537  6.127664 -0.044150  9.647847e-01   
ENSRNA049514255    0.322814    

Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 18.24 seconds.

Fitting dispersion trend curve...
... done in 0.29 seconds.

Fitting MAP dispersions...
... done in 16.56 seconds.

Fitting LFCs...
... done in 2.03 seconds.

Calculating cook's distance...
... done in 0.05 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 1.23 seconds.



Log2 fold change & Wald test p-value: Metadata_treatments BCS-DG52208 vs DMSO
                   baseMean  log2FoldChange     lfcSE      stat    pvalue  \
Bcin01g00010      55.615284        0.215721  0.338990  0.636364  0.524539   
Bcin01g00020      38.664478        0.284204  0.398121  0.713863  0.475312   
Bcin01g00030     211.007950        0.230497  0.352927  0.653102  0.513691   
Bcin01g00040      75.002197        0.488184  0.371394  1.314464  0.188690   
Bcin01g00050      30.269388        0.446506  0.308964  1.445171  0.148410   
...                     ...             ...       ...       ...       ...   
ENSRNA049513861    0.121117       -0.284311  6.053867 -0.046964  0.962542   
ENSRNA049513883    0.670726        0.792050  0.944137  0.838914  0.401517   
ENSRNA049514038    0.394430       -1.192599  1.712483 -0.696415  0.486169   
ENSRNA049514141    0.218949       -0.568261  6.050495 -0.093920  0.925173   
ENSRNA049514255    0.362476       -0.031517  1.626626 -0.019376  0.984541  

Fitting size factors...
... done in 0.03 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...


KeyboardInterrupt: 

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

def volcano_comparison(treatments):
    """
    Create an interactive volcano plot for multiple treatments and save as HTML. 
    Attach meta data to individual genes.

    Parameters
    ----------
    treatments : list of strings
        List of treatment names.

    Returns
    -------
    fig : object
        The figure object of the volcano plot.
    """
    fig, ax = plt.subplots(figsize=(22, 12))
    target, treatments = treatments
    colour_dict = {
        "Under Threshold": "lightgrey",
        "Up-regulated": "cornflowerblue",
        "Down-regulated": "crimson"
    }

    markers = ['o', 'X', 'D', '^', 'v', 'p', '*']
    filtered_dataframes, overlapping_significant_genes = [], []
    

    for i, treatment in enumerate(treatments):
        table_path = f"{treatment}_differential_expression.csv"
        if not os.path.isfile(table_path):
            print(f"File {table_path} not found.")
            continue
        
        data_frame = pd.read_csv(table_path, index_col=0)
        
        # Check for duplicate indices
        duplicates = data_frame.index.duplicated(keep=False)

        # Create a new index with suffixes for duplicates
        if duplicates.any():
            # Create a dictionary to count occurrences of each index
            count_dict = {}
            new_index = []

            for idx in data_frame.index:
                if idx in count_dict:
                    count_dict[idx] += 1
                    new_index.append(f"{idx}_{count_dict[idx]}")
                else:
                    count_dict[idx] = 1
                    new_index.append(idx)

            # Assign the new unique index to the DataFrame
            data_frame.index = new_index
        
        data_frame["-Log10P"] = np.maximum(-np.log10(data_frame["padj"]), 0)
        data_frame["Type hit"] = "Under Threshold"
        data_frame.loc[(data_frame["log2FoldChange"].abs() > 1) & (data_frame["padj"] < 0.05), "Type hit"] = "Up-regulated"
        data_frame.rename(columns={
            "log2FoldChange": "Log(2) Fold-Change",
            "-Log10P": "-Log(10) of adjusted P-value"
        }, inplace=True)
        
        significant_data = data_frame[data_frame["Type hit"] != "Under Threshold"].sort_values("-Log(10) of adjusted P-value", ascending=False)
        significant_data["Treatment"] = treatment
        filtered_dataframes.append(significant_data)
        overlapping_significant_genes.append(set(significant_data.index))

        sns.scatterplot(data=data_frame, x="Log(2) Fold-Change", y="-Log(10) of adjusted P-value", alpha=0.6, color="gainsboro", 
                        edgecolor='none', marker=markers[i % len(markers)], ax=ax)
    if len(overlapping_significant_genes) < 2:
        print("No overlapping genes found.")
        return
    intersection_of_genes = set.intersection(*overlapping_significant_genes)
    n = min(len(intersection_of_genes), 10)
    if n <= 1:
        print("No overlapping genes found.")
        return
    colors = plt.get_cmap("tab10")(np.linspace(0, 1, n))
    
    overlapping_gene_data = pd.concat([df[df.index.isin(intersection_of_genes)] for df in filtered_dataframes], axis=0)

    if not overlapping_gene_data.empty:
        overlapping_gene_data["Top 10 overlapping\ngenes by significance"] = overlapping_gene_data.index
        top_genes = overlapping_gene_data.groupby("Top 10 overlapping\ngenes by significance")["-Log(10) of adjusted P-value"].max().nlargest(10).index
        overlapping_gene_data = overlapping_gene_data[overlapping_gene_data["Top 10 overlapping\ngenes by significance"].isin(top_genes)]

        # Sort the overlapping_gene_data by -Log(10) of adjusted P-value in descending order
        overlapping_gene_data = overlapping_gene_data.sort_values("-Log(10) of adjusted P-value", ascending=False)

        # Create a sorted list of unique genes based on their order in the sorted DataFrame
        sorted_genes = overlapping_gene_data["Top 10 overlapping\ngenes by significance"].unique()

        for i, gene in enumerate(sorted_genes):
            subset = overlapping_gene_data[overlapping_gene_data["Top 10 overlapping\ngenes by significance"] == gene]
            ax.plot(subset["Log(2) Fold-Change"], subset["-Log(10) of adjusted P-value"], ls="--", alpha=0.4, c=colors[i])
        
        #overlapping_gene_data["Top 10 overlapping\ngenes by significance"] = [gene_mapping[x].split(",")[0] for x in overlapping_gene_data["Top 10 overlapping\ngenes by significance"]]
        overlapping_gene_data = overlapping_gene_data.drop_duplicates(subset=["Top 10 overlapping\ngenes by significance", "Treatment"], keep="first")
        sns.scatterplot(
            data=overlapping_gene_data.reset_index(drop = True),
            x="Log(2) Fold-Change",
            y="-Log(10) of adjusted P-value",
            hue="Top 10 overlapping\ngenes by significance", 
            style="Treatment",
            markers=markers[:len(intersection_of_genes)],
            palette=colors, 
            s=100,
            edgecolor="white",
            linewidth=1, ax=ax
        )
    # Add threshold lines
    ax.axhline(y=-np.log10(0.05), lw=2, ls="--", c="grey")
    ax.axvline(x=1, lw=2, ls="--", c="grey")
    ax.axvline(x=-1, lw=2, ls="--", c="grey")
    
    ax.spines[['right', 'top']].set_visible(False)
    ax.grid(True, alpha=0.2)
    ax.tick_params(axis='both', which='major', labelsize=12)
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

    ax.set_title(f"Volcano Plot of Target {target}", fontsize=16)
    plt.savefig(f"volcano_plot_{target}.png", bbox_inches='tight', dpi=300)
    plt.savefig(f"volcano_plot_{target}.svg")
    plt.show()

In [None]:
treatment_groupings = list(df.groupby(["Metadata_target"])["Metadata_treatments"])
treatment_groupings = [(x[0][0],list(x[1].unique())) for x in treatment_groupings if len(x[1]) > 1]
treatment_groupings = [x for x in treatment_groupings if len(x[1]) > 1]

In [None]:
for grouping in treatment_groupings:
    print(grouping)
    volcano_comparison(grouping)