# MSI Analysis - MSIsensor-pro Results

This notebook analyzes microsatellite instability (MSI) results from the MSIsensor-pro workflow.

## Overview

- Load and explore MSI detection results
- Classify samples by MSI status (MSI-High, MSI-Low, MSS)
- Visualize MSI score distributions
- Generate summary statistics and reports


In [None]:
# Standard preamble - imports and configuration
# NOTE(review): no other import appears before the helper calls below, so the names
# this notebook uses without defining them (setup_paths, load_config, load_samples,
# load_msi_results, classify_msi_status, summary_statistics, plot_msi_distribution,
# plt) presumably all come from this star import - confirm against
# project_utils.notebookpreamble.
from project_utils.notebookpreamble import *

## Setup Paths and Load Configuration

In [None]:
# Resolve the workflow locations once; later cells read the 'results' entry from `paths`
paths = setup_paths()

# Echo every resolved entry so a wrong location is noticed before any data is loaded
print("Workflow paths:")
for label, location in paths.items():
    print(f"  {label}: {location}")

In [None]:
# Read the workflow configuration; later cells take the aliases and reference genome from it
config = load_config()
print("Configuration loaded:")
print(f"  Sample sheet: {config.get('sample_sheet')}")

# Optional sections fall back to an empty mapping so a missing one prints None
alias_section = config.get('aliases', {})
print(f"  Tumor alias: {alias_section.get('tumor')}")

reference_section = config.get('ref', {})
print(
    f"  Reference: {reference_section.get('species')} {reference_section.get('build')} "
    f"(release {reference_section.get('release')})"
)

## Load Sample Information

In [None]:
# Read the sample sheet into `samples`
samples = load_samples()

# Report the cohort size, then how many samples carry each alias
print(f"Total samples: {len(samples)}")
print("\nSample distribution by alias:")
alias_counts = samples['alias'].value_counts()
print(alias_counts)

# Preview the first rows as the cell output
samples.head()

## Load MSI Results

Load the merged MSI results from the MSIsensor-pro analysis.

In [None]:
# Pick the result set: a 'baseline' entry under aliases selects the tumor-only
# results, otherwise the tumor/normal results are read
if 'baseline' in config.get('aliases', {}):
    workflow_mode = 'tumor_only'
else:
    workflow_mode = 'tumor_normal'

# The reference section is mandatory here: a missing key raises KeyError
species, build, release = (
    config['ref'][field] for field in ('species', 'build', 'release')
)
genome_version = f"genome.dna.{species}.{build}.{release}"

# Merged per-sample table written by the workflow for this mode and genome
results_name = f"{workflow_mode}.{genome_version}.all_samples.tsv"
results_file = paths['results'] / results_name
print(f"Loading results from: {results_file}")

# Read the table and preview its first rows as the cell output
msi_results = load_msi_results(results_file)
print(f"\nLoaded {len(msi_results)} samples")
msi_results.head()

## Classify MSI Status

Classify samples based on MSI score thresholds:
- **MSI-High**: MSI score > 20% (0.2)
- **MSI-Low**: MSI score > 0% but ≤ 20%
- **MSS** (Microsatellite Stable): MSI score = 0%

In [None]:
# Label every sample from its MSI score using the cutoffs listed in the text above
status_labels = classify_msi_status(
    msi_results['msi_score'],
    threshold_high=0.2,
    threshold_low=0.0,
)
msi_results['msi_status'] = status_labels

# Absolute counts per status, ordered by status label
status_counts = msi_results['msi_status'].value_counts().sort_index()
print("MSI Status Distribution:")
print(status_counts)

# The same breakdown expressed as percentages of all samples
status_fractions = msi_results['msi_status'].value_counts(normalize=True).sort_index()
print("\nPercentage breakdown:")
print(status_fractions * 100)

# Show the annotated table as the cell output
msi_results

## Summary Statistics

In [None]:
# Summarise the MSI score column over all loaded samples
stats = summary_statistics(
    msi_results,
    score_col='msi_score',
)
print("MSI Score Summary Statistics:")
# Left as the final expression so the summary is rendered as the cell output
stats

In [None]:
# Per-status breakdown: which aggregates to compute for each column
per_status_aggregates = {
    'msi_score': ['count', 'mean', 'median', 'std', 'min', 'max'],
    'n_all_sites': ['mean', 'median'],
    'n_unstable_sites': ['mean', 'median'],
}

print("\nDetailed statistics by MSI status:")
# One row per MSI status; rendered as the cell output
msi_results.groupby('msi_status').agg(per_status_aggregates)

## Visualizations

### MSI Score Distribution

In [None]:
# Distribution of MSI scores, drawn with the same cutoffs used for classification
distribution_options = dict(
    score_col='msi_score',
    threshold_high=0.2,
    threshold_low=0.0,
    title='MSI Score Distribution - All Samples',
)
fig = plot_msi_distribution(msi_results, **distribution_options)
plt.show()

### MSI Status by Sample

In [None]:
# Bar plot of MSI scores by sample
from matplotlib.patches import Patch

# Score above which a sample counts as MSI-High; same value as the
# threshold_high passed to classify_msi_status earlier in the notebook
msi_high_threshold = 0.2

fig, ax = plt.subplots(figsize=(14, 6))

# Color code by MSI status (this mapping is reused by the scatter plot cell below)
colors = {'MSI-High': 'red', 'MSI-Low': 'orange', 'MSS': 'green'}
bar_colors = [colors[status] for status in msi_results['msi_status']]

# One bar per sample, in table order
ax.bar(range(len(msi_results)), msi_results['msi_score'], color=bar_colors)

# Keep the line handle so the threshold can be listed in the legend
threshold_line = ax.axhline(
    y=msi_high_threshold,
    color='red',
    linestyle='--',
    label=f'MSI-High threshold ({msi_high_threshold:.0%})',
)
ax.set_xlabel('Sample Index')
ax.set_ylabel('MSI Score')
ax.set_title('MSI Score by Sample')

# Build a single legend holding the status colors AND the threshold line.
# Calling ax.legend() twice keeps only the last legend, which previously
# dropped the threshold entry.
legend_elements = [Patch(facecolor=colors[status], label=status)
                   for status in ['MSI-High', 'MSI-Low', 'MSS']]
legend_elements.append(threshold_line)
ax.legend(handles=legend_elements, loc='upper right')

plt.tight_layout()
plt.show()

### Microsatellite Sites Analysis

In [None]:
# Two side-by-side views of how unstable-site counts relate to the MSI score
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: unstable sites against total sites, shaded by MSI score
scatter = ax1.scatter(
    msi_results['n_all_sites'],
    msi_results['n_unstable_sites'],
    c=msi_results['msi_score'],
    cmap='RdYlGn_r',
    s=100,
    alpha=0.7,
)
ax1.set(
    xlabel='Total Number of Sites',
    ylabel='Number of Unstable Sites',
    title='Unstable Sites vs Total Sites',
)
plt.colorbar(scatter, ax=ax1, label='MSI Score')

# Right panel: MSI score against unstable sites, one color per MSI status.
# `colors` is the status -> color mapping defined in the bar plot cell above.
colors_status = [colors[label] for label in msi_results['msi_status']]
ax2.scatter(
    msi_results['n_unstable_sites'],
    msi_results['msi_score'],
    c=colors_status,
    s=100,
    alpha=0.7,
)
ax2.set(
    xlabel='Number of Unstable Sites',
    ylabel='MSI Score',
    title='MSI Score vs Unstable Sites',
)
# Dashed guide at the MSI-High cutoff
ax2.axhline(y=0.2, color='red', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()

## Export Results

Save the annotated results with MSI status classification.

In [None]:
# Write the status-annotated table into the results directory as
# tab-separated text, without the row index
annotated_name = f"{workflow_mode}.{genome_version}.annotated_results.tsv"
output_file = paths['results'] / annotated_name
msi_results.to_csv(output_file, sep='\t', index=False)
print(f"Annotated results saved to: {output_file}")

## Summary Report

In [None]:
# Plain-text recap of the run: mode, genome, per-status counts and score summary
report_rule = "=" * 60
total_samples = len(msi_results)

print(report_rule)
print("MSI ANALYSIS SUMMARY REPORT")
print(report_rule)
print(f"\nWorkflow mode: {workflow_mode}")
print(f"Genome version: {genome_version}")
print(f"\nTotal samples analyzed: {total_samples}")

# Count and share of samples in each status, always listed in this fixed order
print("\nMSI Status Distribution:")
for status in ('MSI-High', 'MSI-Low', 'MSS'):
    count = msi_results['msi_status'].eq(status).sum()
    pct = count / total_samples * 100
    print(f"  {status}: {count} ({pct:.1f}%)")

# Range and central tendency of the MSI score
score_values = msi_results['msi_score']
print(f"\nMSI Score Range: {score_values.min():.3f} - {score_values.max():.3f}")
print(f"Mean MSI Score: {score_values.mean():.3f}")
print(f"Median MSI Score: {score_values.median():.3f}")
print(report_rule)