# Pronoms: Proteomics Normalization Examples

This notebook demonstrates the usage of the Pronoms library for normalizing proteomics data using various methods.

## Setup

First, let's import the necessary libraries and fix the random seed so that the synthetic data generated below is reproducible.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

# Import normalizers from pronoms
from pronoms.normalizers import (
    MedianNormalizer,
    QuantileNormalizer,
    L1Normalizer,
    DirectLFQNormalizer,
    VSNNormalizer
)

# Set random seed for reproducibility
# This seeds NumPy's legacy global RNG. generate_sample_data (defined in the
# next section) draws from it through np.random.normal / np.random.uniform,
# so a fresh-kernel "Run All" regenerates the same synthetic dataset.
np.random.seed(42)

## Generate Sample Data

Let's create a synthetic proteomics dataset with systematic biases to demonstrate normalization.

In [None]:
def generate_sample_data(n_proteins=1000, n_samples=6, bias_factors=None):
    """
    Generate synthetic proteomics intensities with per-sample systematic biases.

    Each protein gets its own mean and standard deviation on the natural-log
    scale; log-abundances are drawn from a normal distribution with those
    parameters and exponentiated, so the unbiased intensities are log-normal.
    Systematic bias is then applied as a multiplicative factor on the raw
    intensity of each sample (column).

    All random draws come from NumPy's global ``np.random`` state, so seed it
    beforehand (``np.random.seed``) for reproducible output.

    Parameters
    ----------
    n_proteins : int, optional
        Number of proteins (rows), by default 1000
    n_samples : int, optional
        Number of samples (columns), by default 6
    bias_factors : sequence of float, optional
        Multiplicative intensity bias per sample: entry ``j`` scales column
        ``j``. Factors should be positive so the result stays a valid
        intensity matrix (e.g. for a later ``log2``). If fewer factors than
        samples are given, only the leading columns are scaled; more factors
        than samples raises ``IndexError``. By default None (no bias).

    Returns
    -------
    np.ndarray
        Synthetic proteomics data with shape (n_proteins, n_samples)
    """
    # Per-protein location and spread of the log-abundance distribution
    protein_means = np.random.normal(10, 2, n_proteins)
    protein_stds = np.random.uniform(0.1, 0.5, n_proteins)

    # One broadcasted draw replaces the per-protein Python loop: row i is
    # sampled from N(protein_means[i], protein_stds[i]).
    log_abundance = np.random.normal(
        protein_means[:, np.newaxis],
        protein_stds[:, np.newaxis],
        size=(n_proteins, n_samples),
    )

    # Exponentiate to get raw intensities
    data = np.exp(log_abundance)

    # Apply the systematic bias on the intensity scale. Multiplying the
    # log-abundances instead would turn each factor into an exponent
    # (intensity ** factor), distorting the spread of the sample rather than
    # shifting it by a constant factor.
    if bias_factors is not None:
        for j, factor in enumerate(bias_factors):
            data[:, j] *= factor

    return data

# One systematic bias factor per sample column
bias_factors = [0.7, 1.0, 1.3, 0.8, 1.2, 0.9]
data = generate_sample_data(bias_factors=bias_factors)

# Label the columns "Sample 1" ... "Sample N", one per column of `data`
sample_names = [f"Sample {i+1}" for i in range(data.shape[1])]

# Report the matrix shape and the generated labels
print(
    f"Data shape: {data.shape}",
    f"Sample names: {sample_names}",
    sep="\n",
)

# Per-sample summary statistics; the trailing expression is the cell output
df = pd.DataFrame(data=data, columns=sample_names)
df.describe()

## Visualize Raw Data

Let's visualize the raw data to see the systematic biases.

In [None]:
# Draw the per-sample boxplot twice: once on the raw intensity scale and once
# on a log2 scale, where the wide dynamic range of the intensities is readable.
raw_views = [
    (data, "Raw Data Distribution", "Intensity"),
    (np.log2(data), "Log2 Raw Data Distribution", "Log2 Intensity"),
]

for values, title, ylabel in raw_views:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot(values)
    # Label the ticks after plotting rather than through boxplot's `labels=`
    # keyword, which Matplotlib 3.9 deprecated in favour of `tick_labels`;
    # set_xticklabels behaves the same on older and newer releases.
    ax.set_xticklabels(sample_names, rotation=45)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    plt.show()

## 1. Median Normalization

Median normalization scales each sample by its median value.

In [None]:
# Create and apply median normalizer
# `data` is the raw intensity matrix from the "Generate Sample Data" section,
# shaped (n_proteins, n_samples).
# NOTE(review): the pronoms API is not visible from this notebook — confirm
# which axis MedianNormalizer.normalize treats as samples.
median_normalizer = MedianNormalizer()
median_normalized_data = median_normalizer.normalize(data)

# Plot comparison
# The return value of plot_comparison is discarded here; the normalized
# matrix is kept under its own name because the "Comparison of All Methods"
# cell reads `median_normalized_data`.
median_normalizer.plot_comparison(data, median_normalized_data, sample_names=sample_names)
plt.show()

## 2. Quantile Normalization

Quantile normalization makes the distribution of intensities identical across all samples.

In [None]:
# Create and apply quantile normalizer
# `data` is the raw intensity matrix from the "Generate Sample Data" section,
# shaped (n_proteins, n_samples).
# NOTE(review): confirm which axis QuantileNormalizer.normalize treats as
# samples — the pronoms API is not visible from this notebook.
quantile_normalizer = QuantileNormalizer()
quantile_normalized_data = quantile_normalizer.normalize(data)

# Plot comparison
# The result is unpacked into two names, so plot_comparison must return
# exactly two objects here (presumably two figures — confirm against pronoms).
# Note that `fig` is a notebook-global name reused by later cells.
fig, fig2 = quantile_normalizer.plot_comparison(data, quantile_normalized_data, sample_names=sample_names)
plt.show()

## 3. L1 Normalization

L1 normalization scales each sample to have a sum of 1.

In [None]:
# Create and apply L1 normalizer
# `data` is the raw intensity matrix from the "Generate Sample Data" section,
# shaped (n_proteins, n_samples).
# NOTE(review): confirm which axis L1Normalizer.normalize sums over — the
# pronoms API is not visible from this notebook.
l1_normalizer = L1Normalizer()
l1_normalized_data = l1_normalizer.normalize(data)

# Plot comparison
# The return value of plot_comparison is discarded here; the normalized
# matrix is kept as `l1_normalized_data` for the "Comparison of All Methods"
# cell.
l1_normalizer.plot_comparison(data, l1_normalized_data, sample_names=sample_names)
plt.show()

## 4. DirectLFQ Normalization (R-based)

DirectLFQ is a label-free quantification method implemented in R.

In [None]:
# Best-effort cell: any failure below is reported and swallowed so the rest of
# the notebook can still run.
try:
    # Create and apply DirectLFQ normalizer
    directlfq_normalizer = DirectLFQNormalizer(impute_missing=True)
    directlfq_normalized_data = directlfq_normalizer.normalize(data, sample_ids=sample_names)
    
    # Plot comparison
    # If normalize() succeeded but plotting raises, `directlfq_normalized_data`
    # is already bound and will still be picked up by the comparison cell.
    directlfq_normalizer.plot_comparison(data, directlfq_normalized_data, sample_names=sample_names)
    plt.show()
except Exception as e:
    # `except Exception` catches every error, not only a missing backend, so
    # the installation hint below is a guess — read the printed message first.
    # NOTE(review): verify that the install instructions printed here match
    # what pronoms' DirectLFQNormalizer actually requires.
    print(f"DirectLFQ normalization failed: {str(e)}")
    print("This is likely because the R package 'DirectLFQ' is not installed.")
    print("To install it, run the following in R:")
    print("if (!require(\"BiocManager\", quietly = TRUE)) install.packages(\"BiocManager\")")
    print("BiocManager::install(\"DirectLFQ\")")

## 5. VSN Normalization (R-based)

Variance Stabilizing Normalization (VSN) stabilizes the variance across the intensity range.

In [None]:
# Best-effort cell: any failure below is reported and swallowed so the rest of
# the notebook can still run.
try:
    # Create and apply VSN normalizer
    vsn_normalizer = VSNNormalizer()
    vsn_normalized_data = vsn_normalizer.normalize(data, sample_ids=sample_names)
    
    # Plot comparison
    # The result is unpacked into two names, so plot_comparison must return
    # exactly two objects; if it does not, the resulting error is caught below
    # while `vsn_normalized_data` stays bound for the comparison cell.
    fig, fig2 = vsn_normalizer.plot_comparison(data, vsn_normalized_data, sample_names=sample_names)
    plt.show()
except Exception as e:
    # `except Exception` catches every error, not only a missing R package, so
    # the installation hint below is a guess — read the printed message first.
    print(f"VSN normalization failed: {str(e)}")
    print("This is likely because the R package 'vsn' is not installed.")
    print("To install it, run the following in R:")
    print("if (!require(\"BiocManager\", quietly = TRUE)) install.packages(\"BiocManager\")")
    print("BiocManager::install(\"vsn\")")

## Comparison of All Methods

Let's compare all normalization methods side by side.

In [None]:
# Create a function to plot boxplots of all methods
def plot_all_methods(raw_data, normalized_data_dict, sample_names=None):
    """
    Plot per-sample boxplots of the raw data and of every normalized dataset.

    Panels are laid out row by row in a three-column grid: the raw data comes
    first, followed by one panel per entry of ``normalized_data_dict`` in
    insertion order. The grid has at least two rows (the 2x3 layout used for
    up to six panels) and grows by one row for every three additional panels,
    so no method is silently left out.

    Parameters
    ----------
    raw_data : array-like
        Raw intensities; each column is drawn as one box.
    normalized_data_dict : dict
        Maps a method name to its normalized data (same layout as
        ``raw_data``). The name is used in the panel title.
    sample_names : list of str, optional
        Tick labels for the boxes, one per column. If None, Matplotlib's
        default numeric tick labels are kept.

    Returns
    -------
    matplotlib.figure.Figure
        The figure containing all panels.
    """
    n_cols = 3
    n_panels = len(normalized_data_dict) + 1  # +1 for raw data
    # Ceiling division; never fewer than two rows to preserve the 2x3 layout.
    n_rows = max(2, -(-n_panels // n_cols))

    fig = plt.figure(figsize=(15, 5 * n_rows))
    gs = GridSpec(n_rows, n_cols, figure=fig)

    # Raw data first, then each normalization method in dictionary order
    panels = [("Raw Data", raw_data)]
    panels.extend(
        (f"{method_name} Normalized", norm_data)
        for method_name, norm_data in normalized_data_dict.items()
    )

    for index, (title, values) in enumerate(panels):
        row, col = divmod(index, n_cols)
        ax = fig.add_subplot(gs[row, col])
        ax.boxplot(values)
        # Label ticks after plotting: boxplot's `labels=` keyword is
        # deprecated since Matplotlib 3.9 (renamed `tick_labels`), while
        # set_xticklabels works across versions.
        if sample_names is not None:
            ax.set_xticklabels(sample_names)
        ax.set_title(title)
        ax.tick_params(axis='x', rotation=45)
        # Only the left-most panel of each row carries the y-axis label
        if col == 0:
            ax.set_ylabel("Intensity")

    fig.tight_layout()
    return fig

# Results of the pure-Python methods, which always run in this notebook
normalized_data = {
    "Median": median_normalized_data,
    "Quantile": quantile_normalized_data,
    "L1": l1_normalized_data,
}

# The R-based cells are best-effort: their result variables exist only if
# normalization succeeded, so look each one up by name and skip it if absent.
optional_results = (
    ("DirectLFQ", "directlfq_normalized_data"),
    ("VSN", "vsn_normalized_data"),
)
for label, variable in optional_results:
    if variable in globals():
        normalized_data[label] = globals()[variable]

# Side-by-side boxplots of the raw data and every available method
plot_all_methods(data, normalized_data, sample_names)
plt.show()

## Conclusion

In this notebook, we demonstrated the use of various normalization methods provided by the Pronoms library:

1. **Median Normalization**: Simple and effective for correcting systematic biases.
2. **Quantile Normalization**: Makes the distributions identical across samples.
3. **L1 Normalization**: Scales each sample to have a sum of 1.
4. **DirectLFQ Normalization**: A label-free quantification method that includes normalization (R-based).
5. **VSN Normalization**: Stabilizes variance across the intensity range (R-based).

Each method has its strengths and is suitable for different types of proteomics data and analysis goals. The choice of normalization method should be based on the specific characteristics of your data and the assumptions of your downstream analysis.