# Input preparation for examples
- Run DESeq2 on count data
- For simple example filter not significant foldchanges
---
Author: Felix Offensperger

In [1]:
# Preprocessing script for DESeq2 analysis of yeast heat shock data
# This script prepares the data for DESeq2 analysis by extracting relevant samples,
# creating a count matrix, and generating sample information.

import pandas as pd
# Load the yeast heat shock expression data
data = pd.read_csv('./complex_yeast_heatshock.tsv',sep='\t')

# Prepare data for DESeq2 analysis for 25 degree conditions
# Select columns for 25_0_* samples (25°C, 0 min timepoint)
cols_25 = [col for col in data.columns if col.startswith('25_0_')]
data_25 = data[['gene_id'] + cols_25]

# Prepare count matrix and sample information for DESeq2
count_matrix = data_25.set_index('gene_id', drop=True)
count_matrix.index.name=None

# Assign condition based on sample name (extract genotype/perturbation)
def get_condition(col):
    if 'Wildtype' in col:
        return 'Wildtype'
    elif 'Double.KDKO' in col:
        return 'Double.KDKO'
    elif 'MSN24.KO' in col:
        return 'MSN24.KO'
    elif 'HSF1.KD' in col:
        return 'HSF1.KD'
    else:
        return 'Other'

# Create a sample information table for DESeq2
sample_info = pd.DataFrame({
    'sample': count_matrix.columns,
    'condition': [get_condition(col) for col in count_matrix.columns]
})

# Write count matrix and sample info to files for R/DESeq2
count_matrix.to_csv('./deseq2_counts_25deg.tsv', sep='\t')
sample_info.to_csv('./deseq2_sample_info_25deg.tsv', sep='\t', index=False)

# R code for DESeq2 analysis (to be run in an R environment)
r_code = """
library(DESeq2)
# Define comparisons of interest (KO vs WT)
comparisons <- list(
    c('Double.KDKO', 'Wildtype'),
    c('MSN24.KO', 'Wildtype'),
    c('HSF1.KD', 'Wildtype')
)
# Read count matrix and sample info
dounts <- read.table('./deseq2_counts_25deg.tsv', header=TRUE, row.names=1, sep='\t',check.names=FALSE)
coldata <- read.table('./deseq2_sample_info_25deg.tsv', header=TRUE, sep='\t', row.names=1)
# Create DESeq2 object
dds <- DESeqDataSetFromMatrix(countData=counts, colData=coldata, design=~condition)
dds <- DESeq(dds)

# Run all comparisons and collect results
all_res <- list()
for (cmp in comparisons) {
    res <- results(dds, contrast=c('condition', cmp[1], cmp[2]))
    res_df <- as.data.frame(res)
    res_df$comparison <- paste(cmp[1], 'vs', cmp[2])
    res_df$gene_id <- rownames(res_df)
    all_res[[paste(cmp[1], cmp[2], sep='_vs_')]] <- res_df
}
combined_res <- do.call(rbind, all_res)
write.csv(combined_res, file='./deseq2_results_25deg_all_comparisons.csv', row.names=FALSE,quote=FALSE)
"""
# Print the R code for user to run in R
print(r_code)


library(DESeq2)
# Define comparisons of interest (KO vs WT)
comparisons <- list(
    c('Double.KDKO', 'Wildtype'),
    c('MSN24.KO', 'Wildtype'),
    c('HSF1.KD', 'Wildtype')
)
# Read count matrix and sample info
dounts <- read.table('./deseq2_counts_25deg.tsv', header=TRUE, row.names=1, sep='	',check.names=FALSE)
coldata <- read.table('./deseq2_sample_info_25deg.tsv', header=TRUE, sep='	', row.names=1)
# Create DESeq2 object
dds <- DESeqDataSetFromMatrix(countData=counts, colData=coldata, design=~condition)
dds <- DESeq(dds)

# Run all comparisons and collect results
all_res <- list()
for (cmp in comparisons) {
    res <- results(dds, contrast=c('condition', cmp[1], cmp[2]))
    res_df <- as.data.frame(res)
    res_df$comparison <- paste(cmp[1], 'vs', cmp[2])
    res_df$gene_id <- rownames(res_df)
    all_res[[paste(cmp[1], cmp[2], sep='_vs_')]] <- res_df
}
combined_res <- do.call(rbind, all_res)
write.csv(combined_res, file='./deseq2_results_25deg_all_comparisons.csv', row.names=FAL

In [2]:
# Preprocessing script for DESeq2 analysis of yeast heat shock data (37°C, 10 min)
import pandas as pd
# Load the yeast heat shock expression data
data = pd.read_csv('./complex_yeast_heatshock.tsv', sep='\t')

# Prepare data for DESeq2 analysis for 37 degree, 10 min conditions
cols_37_10 = [col for col in data.columns if col.startswith('37_10_')]
data_37_10 = data[['gene_id'] + cols_37_10]

# Prepare count matrix and sample information for DESeq2
count_matrix_37_10 = data_37_10.set_index('gene_id', drop=True)
count_matrix_37_10.index.name = None

# Assign condition based on sample name (extract genotype/perturbation)
def get_condition(col):
    if 'Wildtype' in col:
        return 'Wildtype'
    elif 'Double.KDKO' in col:
        return 'Double.KDKO'
    elif 'MSN24.KO' in col:
        return 'MSN24.KO'
    elif 'HSF1.KD' in col:
        return 'HSF1.KD'
    else:
        return 'Other'

# Create a sample information table for DESeq2
sample_info_37_10 = pd.DataFrame({
    'sample': count_matrix_37_10.columns,
    'condition': [get_condition(col) for col in count_matrix_37_10.columns]
})

# Write count matrix and sample info to files for R/DESeq2
count_matrix_37_10.to_csv('./deseq2_counts_37deg_10min.tsv', sep='\t')
sample_info_37_10.to_csv('./deseq2_sample_info_37deg_10min.tsv', sep='\t', index=False)

# R code for DESeq2 analysis (to be run in an R environment)
r_code_37_10 = """
library(DESeq2)
# Define comparisons of interest (KO vs WT)
comparisons <- list(
    c('Double.KDKO', 'Wildtype'),
    c('MSN24.KO', 'Wildtype'),
    c('HSF1.KD', 'Wildtype')
)
# Read count matrix and sample info
counts <- read.table('./deseq2_counts_37deg_10min.tsv', header=TRUE, row.names=1, sep='\\t',check.names=FALSE)
coldata <- read.table('./deseq2_sample_info_37deg_10min.tsv', header=TRUE, sep='\\t', row.names=1)
# Create DESeq2 object
dds <- DESeqDataSetFromMatrix(countData=counts, colData=coldata, design=~condition)
dds <- DESeq(dds)

# Run all comparisons and collect results
all_res <- list()
for (cmp in comparisons) {
    res <- results(dds, contrast=c('condition', cmp[1], cmp[2]))
    res_df <- as.data.frame(res)
    res_df$comparison <- paste(cmp[1], 'vs', cmp[2])
    res_df$gene_id <- rownames(res_df)
    all_res[[paste(cmp[1], cmp[2], sep='_vs_')]] <- res_df
}
combined_res <- do.call(rbind, all_res)
write.csv(combined_res, file='./deseq2_results_37deg_10min_all_comparisons.csv', row.names=FALSE,quote=FALSE)
"""
print(r_code_37_10)



library(DESeq2)
# Define comparisons of interest (KO vs WT)
comparisons <- list(
    c('Double.KDKO', 'Wildtype'),
    c('MSN24.KO', 'Wildtype'),
    c('HSF1.KD', 'Wildtype')
)
# Read count matrix and sample info
counts <- read.table('./deseq2_counts_37deg_10min.tsv', header=TRUE, row.names=1, sep='\t',check.names=FALSE)
coldata <- read.table('./deseq2_sample_info_37deg_10min.tsv', header=TRUE, sep='\t', row.names=1)
# Create DESeq2 object
dds <- DESeqDataSetFromMatrix(countData=counts, colData=coldata, design=~condition)
dds <- DESeq(dds)

# Run all comparisons and collect results
all_res <- list()
for (cmp in comparisons) {
    res <- results(dds, contrast=c('condition', cmp[1], cmp[2]))
    res_df <- as.data.frame(res)
    res_df$comparison <- paste(cmp[1], 'vs', cmp[2])
    res_df$gene_id <- rownames(res_df)
    all_res[[paste(cmp[1], cmp[2], sep='_vs_')]] <- res_df
}
combined_res <- do.call(rbind, all_res)
write.csv(combined_res, file='./deseq2_results_37deg_10min_all_comparisons

In [3]:
# Reading the DESeq2 results into a Polars DataFrame

import polars as pl

# Explicitly specify the schema for the columns to ensure correct types
schema = {
    "baseMean": pl.Float32,
    "log2FoldChange": pl.Float32,
    "lfcSE": pl.Float32,
    "stat": pl.Float32,
    "pvalue": pl.Float32,
    "padj": pl.Float32,
    "comparison": pl.Utf8,
    "gene_id": pl.Utf8
}

# Read the DESeq2 results CSV file
deseq_results = pl.read_csv(
    './deseq2_results_25deg_all_comparisons.csv',
    null_values=['NA'],
    schema=schema
)
# Show the first few rows for inspection
deseq_results.head()

baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,comparison,gene_id
f32,f32,f32,f32,f32,f32,str,str
3.228615,1.634825,1.341761,1.218417,0.223065,1.0,"""Double.KDKO vs Wildtype""","""15S_rRNA"""
29.572617,-0.269511,0.644189,-0.418373,0.675675,1.0,"""Double.KDKO vs Wildtype""","""21S_rRNA"""
0.961595,2.566992,2.213386,1.159758,0.246147,1.0,"""Double.KDKO vs Wildtype""","""HRA1"""
190.579895,0.629549,0.36173,1.740382,0.081792,1.0,"""Double.KDKO vs Wildtype""","""ICR1"""
10.889631,-1.626008,0.817116,-1.989936,0.046598,1.0,"""Double.KDKO vs Wildtype""","""LSR1"""


In [4]:
# Post-processing the DESeq2 results: cleaning up log2FoldChange values
# Set log2FoldChange to 0 for non-significant genes (p > 0.05)
deseq_results = deseq_results.with_columns(
    pl.when(pl.col("pvalue") > 0.05)
      .then(0)
      .otherwise(pl.col("log2FoldChange"))
      .alias("log2FoldChange")
)
# Fill any remaining null log2FoldChange values with 0
deseq_results = deseq_results.with_columns(
    pl.col("log2FoldChange").fill_null(0),
)
# Save the cleaned results
deseq_results.write_csv('./deseq2_results_25deg_all_comparisons_cleaned.csv')
# Print null value counts for each column
print(deseq_results.null_count())
# Show the first few rows
deseq_results.head()

shape: (1, 8)
┌──────────┬────────────────┬───────┬──────┬────────┬──────┬────────────┬─────────┐
│ baseMean ┆ log2FoldChange ┆ lfcSE ┆ stat ┆ pvalue ┆ padj ┆ comparison ┆ gene_id │
│ ---      ┆ ---            ┆ ---   ┆ ---  ┆ ---    ┆ ---  ┆ ---        ┆ ---     │
│ u32      ┆ u32            ┆ u32   ┆ u32  ┆ u32    ┆ u32  ┆ u32        ┆ u32     │
╞══════════╪════════════════╪═══════╪══════╪════════╪══════╪════════════╪═════════╡
│ 0        ┆ 0              ┆ 2181  ┆ 2181 ┆ 2184   ┆ 2184 ┆ 0          ┆ 0       │
└──────────┴────────────────┴───────┴──────┴────────┴──────┴────────────┴─────────┘


baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,comparison,gene_id
f32,f32,f32,f32,f32,f32,str,str
3.228615,0.0,1.341761,1.218417,0.223065,1.0,"""Double.KDKO vs Wildtype""","""15S_rRNA"""
29.572617,0.0,0.644189,-0.418373,0.675675,1.0,"""Double.KDKO vs Wildtype""","""21S_rRNA"""
0.961595,0.0,2.213386,1.159758,0.246147,1.0,"""Double.KDKO vs Wildtype""","""HRA1"""
190.579895,0.0,0.36173,1.740382,0.081792,1.0,"""Double.KDKO vs Wildtype""","""ICR1"""
10.889631,-1.626008,0.817116,-1.989936,0.046598,1.0,"""Double.KDKO vs Wildtype""","""LSR1"""
