In [26]:
%pip install bioinfokit

import pandas as pd
import scipy.stats
import os
from bioinfokit import analys, visuz
from IPython.display import Image
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import logging

# Send INFO-and-above records to processing.log (opened in "w" mode),
# each line formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filemode="w",
    filename="processing.log",
)

Note: you may need to restart the kernel to use updated packages.


In [28]:


def list_dataset_files(path: str) -> list:
    """Return the sorted names of every entry in ``path`` ending in ".tsv"."""
    tsv_names = []
    for entry in os.listdir(path):
        if entry.endswith(".tsv"):
            tsv_names.append(entry)
    tsv_names.sort()
    return tsv_names

def load_and_parse_file(file_path: str) -> pd.DataFrame:
    """Read a tab-separated file into a DataFrame.

    Side effect: after a successful read, pandas' global
    ``display.float_format`` option is set to six significant digits.
    """
    table = pd.read_csv(file_path, sep='\t')
    # Global display setting; it is applied on every call, not just the first.
    pd.options.display.float_format = '{:.6g}'.format
    return table

def expand_gene_symbols(df: pd.DataFrame) -> pd.DataFrame:
    """Expand gene symbols split by '///' into separate rows.

    Each input row whose ``GENE_SYMBOL`` is a string yields one output row per
    '///'-separated symbol (whitespace-stripped), carrying that row's ``logFC``
    and ``P.Value`` unchanged.  Rows whose ``GENE_SYMBOL`` is not a string
    (e.g. NaN for unannotated probes) are skipped and reported in the log.

    Args:
        df: Table with at least ``GENE_SYMBOL``, ``logFC`` and ``P.Value``
            columns.  An ``ID`` column, if present, is used to identify
            skipped rows in the log message; otherwise the index label is used.

    Returns:
        A new DataFrame with columns ``GENE_SYMBOL``, ``logFC``, ``P.Value``
        and a fresh 0..n-1 index.  ``df`` itself is not modified.

    Raises:
        KeyError: if one of the three required columns is missing.  (The
            previous bare ``except`` hid this and returned an empty table.)
    """
    columns = ['GENE_SYMBOL', 'logFC', 'P.Value']
    row_ids = df['ID'] if 'ID' in df.columns else df.index

    # Collect plain tuples and build the frame once: appending with
    # df.loc[len(df)] re-allocates on every row and leaves object dtypes.
    records = []
    skipped_ids = []
    for row_id, symbols, logfc, pval in zip(row_ids, df['GENE_SYMBOL'],
                                            df['logFC'], df['P.Value']):
        if not isinstance(symbols, str):
            # Only the "no usable symbol" case is tolerated; anything else
            # should surface as a real error.
            skipped_ids.append(row_id)
            continue
        for symbol in symbols.split('///'):
            records.append((symbol.strip(), logfc, pval))

    if skipped_ids:
        logging.warning(
            "expand_gene_symbols: skipped %d row(s) without a text GENE_SYMBOL (first IDs: %s)",
            len(skipped_ids), skipped_ids[:5],
        )

    return pd.DataFrame(records, columns=columns)

def combine_duplicate_genes(df: pd.DataFrame) -> pd.DataFrame:
    """Average logFC and combine p-values using Fisher's method for duplicated genes.

    Rows whose ``GENE_SYMBOL`` occurs exactly once are kept as they are.  Every
    symbol that occurs more than once is collapsed into a single row holding
    the arithmetic mean of its ``logFC`` values and the Fisher-combined
    ``P.Value`` (``scipy.stats.combine_pvalues(..., method='fisher')``).

    Args:
        df: Table with ``GENE_SYMBOL``, ``logFC`` and ``P.Value`` columns.
            It is not modified.

    Returns:
        A new DataFrame sorted by ascending ``P.Value``.  When at least one
        symbol was collapsed the index is renumbered before sorting; when
        nothing was duplicated the original index labels are retained.
        Columns other than the three above are NaN on collapsed rows.
    """
    # One pass to bucket values per symbol; dict order keeps genes in order
    # of first appearance, which fixes the order of the collapsed rows.
    values_by_gene = {}
    for gene, logfc, pval in zip(df['GENE_SYMBOL'], df['logFC'], df['P.Value']):
        bucket = values_by_gene.setdefault(gene, {'logFC': [], 'P.Value': []})
        bucket['logFC'].append(logfc)
        bucket['P.Value'].append(pval)

    repeated = {gene: vals for gene, vals in values_by_gene.items()
                if len(vals['logFC']) > 1}
    if not repeated:
        return df.sort_values(by="P.Value")

    # Positional boolean mask instead of dropping rows one by one by label:
    # linear time, and unaffected by repeated index labels.
    is_single = [gene not in repeated for gene in df['GENE_SYMBOL']]
    singles = df.loc[is_single]

    combined_rows = []
    for gene, vals in repeated.items():
        _, combined_p = scipy.stats.combine_pvalues(vals['P.Value'], method='fisher')
        combined_rows.append({
            'GENE_SYMBOL': gene,
            'logFC': sum(vals['logFC']) / len(vals['logFC']),
            'P.Value': combined_p,
        })

    # Single concat instead of one concat per gene.
    merged = pd.concat([singles, pd.DataFrame(combined_rows)], ignore_index=True)
    return merged.sort_values(by="P.Value")

def plot_all(df: pd.DataFrame, output_prefix: str,
             lfc_col='logFC', pval_col='P.Value',
             lfc_thresh=0.2, pval_thresh=0.05):
    """Generate volcano, boxplot, and p-value histogram for a dataset.

    Three PNG files are written at 300 dpi:
    ``<output_prefix>_volcano.png``, ``<output_prefix>_logFC_boxplot.png`` and
    ``<output_prefix>_pvalue_histogram.png``.

    Args:
        df: Table containing the ``lfc_col`` and ``pval_col`` columns.  It is
            only read; no helper columns are added to it.
        output_prefix: Path prefix the three file suffixes are appended to.
        lfc_col: Name of the fold-change column.
        pval_col: Name of the p-value column.
        lfc_thresh: A point is drawn red when ``|lfc| > lfc_thresh`` and
            ``p < pval_thresh``; also the position of the vertical guides.
        pval_thresh: P-value cut-off; also the position of the horizontal guide.

    Returns:
        None.
    """
    lfc = df[lfc_col]
    pvals = df[pval_col]

    # Derived values stay local so the caller's frame is left untouched.
    neg_log_p = -np.log10(pvals)
    significant = (lfc.abs() > lfc_thresh) & (pvals < pval_thresh)
    point_colors = significant.map({True: 'red', False: 'grey'})

    # Volcano plot
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        ax.scatter(lfc, neg_log_p, c=point_colors, alpha=0.7, edgecolors='k')
        ax.axhline(-np.log10(pval_thresh), linestyle='--', color='blue')
        ax.axvline(-lfc_thresh, linestyle='--', color='blue')
        ax.axvline(lfc_thresh, linestyle='--', color='blue')
        ax.set_title("Volcano Plot")
        ax.set_xlabel("log2 Fold Change")
        ax.set_ylabel("-log10(p-value)")
        fig.tight_layout()
        fig.savefig(output_prefix + "_volcano.png", dpi=300)
    finally:
        # Close even on failure so batch runs do not accumulate open figures.
        plt.close(fig)

    # Boxplot of logFC
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        sns.boxplot(x=lfc, ax=ax)
        ax.set_title("logFC Distribution")
        fig.tight_layout()
        fig.savefig(output_prefix + "_logFC_boxplot.png", dpi=300)
    finally:
        plt.close(fig)

    # Histogram of p-values
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        ax.hist(pvals, bins=50, color='skyblue', edgecolor='black')
        ax.set_title("P-value Distribution")
        ax.set_xlabel("P-value")
        ax.set_ylabel("Frequency")
        fig.tight_layout()
        fig.savefig(output_prefix + "_pvalue_histogram.png", dpi=300)
    finally:
        plt.close(fig)


def process_dataset(data_path: str, file: str, results_root: str) -> pd.DataFrame:
    """Run the full pipeline for one dataset file and save output in results folder with stats.

    Steps: load the TSV, write the diagnostic plots, expand multi-symbol rows,
    collapse duplicated genes, and write ``<stem>_processed.csv`` into
    ``<results_root>/<dataset_name>/`` (created if needed).  Summary counts are
    printed and logged.

    Args:
        data_path: Directory holding the dataset; its last path component is
            used as the dataset name.
        file: Name of the file inside ``data_path``.
        results_root: Directory under which the per-dataset output folder is
            created.

    Returns:
        The processed DataFrame, or an empty DataFrame if any step raised.
        Failures are logged with a traceback and printed; they are never
        re-raised.
    """
    # normpath drops a trailing separator, which would otherwise make
    # basename() return '' and send outputs straight into results_root.
    dataset_name = os.path.basename(os.path.normpath(data_path))
    output_dir = os.path.join(results_root, dataset_name)
    os.makedirs(output_dir, exist_ok=True)

    file_path = os.path.join(data_path, file)
    # Strip only the extension; str.replace would remove ".tsv" anywhere in the name.
    file_stem = os.path.splitext(file)[0]
    output_prefix = os.path.join(output_dir, file_stem)

    logging.info(f"Start processing file: {file} in dataset: {dataset_name}")
    print(f"→ Processing {dataset_name}/{file}")
    try:
        df = load_and_parse_file(file_path)
        initial_gene_count = len(df)

        plot_all(df, output_prefix)
        df_expanded = expand_gene_symbols(df)
        expanded_count = len(df_expanded)

        unique_genes = df_expanded['GENE_SYMBOL'].nunique()
        # Rows beyond the first occurrence of each symbol.
        duplicates = expanded_count - unique_genes

        df_cleaned = combine_duplicate_genes(df_expanded)
        cleaned_count = len(df_cleaned)

        output_file = os.path.join(output_dir, file_stem + "_processed.csv")
        df_cleaned.to_csv(output_file, index=False)

        # Print and log stats
        stats_msg = (
            f"{dataset_name}/{file}:\n"
            f"  Original genes: {initial_gene_count}\n"
            f"  Expanded rows: {expanded_count}\n"
            f"  Unique genes before deduplication: {unique_genes}\n"
            f"  Duplicated genes: {duplicates}\n"
            f"  Final unique genes after combining: {cleaned_count}\n"
        )
        print(stats_msg)
        logging.info(stats_msg)

        logging.info(f"Finished processing file: {file}")
        return df_cleaned

    except Exception as e:
        error_msg = f"Failed to process {file} in {dataset_name}: {e}"
        # logging.exception records at ERROR level and appends the traceback,
        # so the failing step can be located from the log file.
        logging.exception(error_msg)
        print(error_msg)
        return pd.DataFrame()  # return empty dataframe on failure

def batch_process_all_datasets(data_root: str, results_root: str = "../results") -> dict:
    """Process all datasets and save to results_root with logging and printed stats.

    Every immediate sub-directory of ``data_root`` is treated as one dataset;
    each file returned by ``list_dataset_files`` for it is passed to
    ``process_dataset``.

    Args:
        data_root: Directory whose sub-directories are the datasets.
        results_root: Output root; created if it does not exist.

    Returns:
        Dict mapping ``"<dataset_name>/<file>"`` to whatever
        ``process_dataset`` returned for that file, inserted in sorted
        dataset order and then in the file order ``list_dataset_files`` yields.
    """
    os.makedirs(results_root, exist_ok=True)
    results = {}

    # os.listdir order is arbitrary; sort so runs (and the log) are reproducible,
    # matching the sorted file order inside each dataset.
    dataset_dirs = sorted(
        d for d in os.listdir(data_root)
        if os.path.isdir(os.path.join(data_root, d))
    )

    for dataset_name in dataset_dirs:
        dataset_path = os.path.join(data_root, dataset_name)
        for file in list_dataset_files(dataset_path):
            results[f"{dataset_name}/{file}"] = process_dataset(dataset_path, file, results_root)

    return results






In [None]:
# Process every dataset folder under ../data; outputs go to the default results root.
results = batch_process_all_datasets(data_root="../data")

→ Processing GSE116280/GSE116280_Rot100nM_24h_8d.tsv
GSE116280/GSE116280_Rot100nM_24h_8d.tsv:
  Original genes: 21727
  Expanded rows: 21330
  Unique genes before deduplication: 21255
  Duplicated genes: 75
  Final unique genes after combining: 21255

→ Processing GSE116280/GSE116280_Rot50nM_24h_8d.tsv
GSE116280/GSE116280_Rot50nM_24h_8d.tsv:
  Original genes: 21729
  Expanded rows: 21332
  Unique genes before deduplication: 21257
  Duplicated genes: 75
  Final unique genes after combining: 21257

→ Processing GSE90122/GSE90122-Rifampicin.tsv
GSE90122/GSE90122-Rifampicin.tsv:
  Original genes: 42545
  Expanded rows: 30436
  Unique genes before deduplication: 22070
  Duplicated genes: 8366
  Final unique genes after combining: 22070

→ Processing GSE90122/GSE90122-SR12813.tsv
GSE90122/GSE90122-SR12813.tsv:
  Original genes: 42545
  Expanded rows: 30436
  Unique genes before deduplication: 22070
  Duplicated genes: 8366
  Final unique genes after combining: 22070

→ Processing GSE90122/GS