# Step 1: Stacked Histograms Workflow

This notebook creates basic stacked histogram visualizations for multi-dimensional genomic data across chromosomes.

## Overview
- Generate histogram data with multiple values per genomic bin
- Create Circos-compatible data formats
- Configure visualization parameters
- Execute Circos to generate circular plots

In [None]:
import pandas as pd
import numpy as np
import os
import subprocess
import matplotlib.pyplot as plt

# Banner plus the current working directory; the later cells write their
# output files using relative names, so they land in this directory.
print("Step 1: Stacked Histograms Workflow")
print("Working directory:", os.getcwd())

In [None]:
# Configuration
CHROMOSOMES = [f'hs{i}' for i in range(1, 23)] + ['hsX', 'hsY']
BIN_SIZE = 5000000  # 5MB bins
NUM_TRACKS = 5  # Number of histogram tracks
SEED = 42  # Seed for the demo data so Restart & Run All regenerates identical files

# The histogram generator draws from NumPy's global random state; seed it once
# here so the generated files are reproducible.
np.random.seed(SEED)

# Chromosome lengths in base pairs (GRCh38/hg38 assembly).
# hs1 previously carried the hg19 length (249250621) while every other entry
# was GRCh38; it is now consistent with the rest of the table.
CHROM_LENGTHS = {
    'hs1': 248956422, 'hs2': 242193529, 'hs3': 198295559, 'hs4': 190214555,
    'hs5': 181538259, 'hs6': 170805979, 'hs7': 159345973, 'hs8': 145138636,
    'hs9': 138394717, 'hs10': 133797422, 'hs11': 135086622, 'hs12': 133275309,
    'hs13': 114364328, 'hs14': 107043718, 'hs15': 101991189, 'hs16': 90338345,
    'hs17': 83257441, 'hs18': 80373285, 'hs19': 58617616, 'hs20': 64444167,
    'hs21': 46709983, 'hs22': 50818468, 'hsX': 156040895, 'hsY': 57227415
}

print(f"Configured for {len(CHROMOSOMES)} chromosomes with {BIN_SIZE/1000000}MB bins")

In [None]:
# Generate stacked histogram data
def generate_histogram_data(chromosomes, chrom_lengths, bin_size, num_tracks, seed=None):
    """Generate multi-track histogram data for Circos.

    Each chromosome is tiled with consecutive bins of ``bin_size`` bases, and
    every bin receives ``num_tracks`` uniform random values as placeholder
    data (real analyses would compute these from measurements).

    Args:
        chromosomes: Iterable of chromosome names (e.g. ``'hs1'``), in output order.
        chrom_lengths: Mapping of chromosome name to length in base pairs.
            Chromosomes absent from the mapping are skipped.
        bin_size: Bin width in base pairs; must be positive.
        num_tracks: Number of comma-separated values emitted per bin.
        seed: Optional seed. When given, values are drawn from a private
            ``np.random.default_rng(seed)`` so repeated calls return identical
            data. When ``None`` (default) the global NumPy random state is
            used, exactly as before.

    Returns:
        List of strings of the form ``"<chrom> <start> <end> v1,v2,..."``,
        where ``end`` is ``start + bin_size - 1`` capped at the chromosome length.

    Raises:
        ValueError: If ``bin_size`` is not positive.
    """
    if bin_size <= 0:
        # A negative step would make range() yield nothing and silently
        # produce an empty data set; fail loudly instead.
        raise ValueError(f"bin_size must be positive, got {bin_size}")

    # One callable for both modes: legacy global state or a seeded generator.
    draw = np.random.random if seed is None else np.random.default_rng(seed).random

    histogram_data = []

    for chrom in chromosomes:
        if chrom not in chrom_lengths:
            continue

        max_length = chrom_lengths[chrom]

        for start in range(0, max_length, bin_size):
            end = min(start + bin_size - 1, max_length)

            # Single vectorised draw of all track values for this bin
            values = draw(num_tracks)
            value_str = ','.join(f"{v:.6f}" for v in values)

            histogram_data.append(f"{chrom} {start} {end} {value_str}")

    return histogram_data

# Build the demo bins for every configured chromosome
hist_data = generate_histogram_data(CHROMOSOMES, CHROM_LENGTHS, BIN_SIZE, NUM_TRACKS)
print(f"Generated {len(hist_data)} histogram bins")

# Show the leading records as a quick check of the line format
print("\nFirst 5 histogram entries:")
for entry in hist_data[:5]:
    print(entry)

In [None]:
# Write histogram data to file
def write_histogram_file(data, filename='histogram.stacked2.txt'):
    """Save histogram records to ``filename``, one record per line.

    Each record is written with one leading and one trailing space before the
    newline. The file is overwritten if it already exists, and a short
    confirmation with the record count is printed afterwards.
    """
    with open(filename, 'w') as handle:
        # Pad each record with a space on both sides, as the existing files do
        handle.writelines(f" {record} \n" for record in data)
    print(f"Written {len(data)} entries to {filename}")

# Write the generated bins under the function's default file name
# ('histogram.stacked2.txt').
write_histogram_file(hist_data)

# Also create the original format file
# (same records and same line layout, written as 'histogram.stacked.txt')
write_histogram_file(hist_data, 'histogram.stacked.txt')

In [None]:
# Create trisomy-specific histogram (if needed)
def create_trisomy_histogram(data, filename='trisomyHistogram.txt',
                             trisomy_chroms=('hs13', 'hs18', 'hs21')):
    """Write the subset of histogram records that belong to selected chromosomes.

    Args:
        data: Iterable of record strings whose first whitespace-separated
            field is the chromosome name.
        filename: Output path; overwritten if it exists.
        trisomy_chroms: Chromosome names to keep. Defaults to chromosomes
            13, 18 and 21 (common trisomies).

    Returns:
        None. The selected records are written to ``filename`` (each padded
        with one leading and one trailing space) and a count is printed.
    """
    wanted = set(trisomy_chroms)

    # Compare the chromosome field exactly. A plain prefix test on the line
    # would let e.g. 'hs1' also select hs10-hs19, or 'hs13' select 'hs130'.
    trisomy_data = []
    for line in data:
        fields = line.split(None, 1)
        if fields and fields[0] in wanted:
            trisomy_data.append(line)

    with open(filename, 'w') as f:
        f.writelines(f" {line} \n" for line in trisomy_data)

    print(f"Created trisomy histogram with {len(trisomy_data)} entries")

# Write the hs13/hs18/hs21 subset of the generated bins to the function's
# default output file, 'trisomyHistogram.txt'.
create_trisomy_histogram(hist_data)

In [None]:
# Visualize data distribution
def plot_data_preview(data, num_samples=100, num_tracks=None):
    """Create a two-panel preview plot of the histogram data.

    The left panel overlays a 20-bin value histogram per track; the right
    panel shows a stacked bar chart of up to the first 20 sampled records.
    The figure is saved as ``histogram_preview.png`` and then shown.

    Args:
        data: Sequence of record strings ``"<chrom> <start> <end> v1,v2,..."``.
        num_samples: Number of leading records to parse for the preview.
        num_tracks: Number of tracks to plot. Defaults to the notebook-level
            ``NUM_TRACKS`` when ``None``.
    """
    if num_tracks is None:
        num_tracks = NUM_TRACKS

    # Parse sample data for visualization
    sample_data = data[:num_samples]
    values_by_track = [[] for _ in range(num_tracks)]

    for line in sample_data:
        parts = line.split()
        if len(parts) >= 4:
            values = parts[3].split(',')
            for i, val in enumerate(values[:num_tracks]):
                try:
                    values_by_track[i].append(float(val))
                except ValueError:
                    # Unparsable value is dropped; later values of this track
                    # then shift one position relative to the bins.
                    continue

    # Create stacked histogram plot
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Cycle the base palette so more than five tracks no longer raises
    # IndexError; the first five tracks keep their original colours.
    base_colors = ['red', 'orange', 'yellow', 'green', 'blue']
    colors = [base_colors[i % len(base_colors)] for i in range(num_tracks)]

    # Plot 1: Distribution of values by track
    for i, values in enumerate(values_by_track):
        if values:
            ax1.hist(values, alpha=0.7, label=f'Track {i+1}', color=colors[i], bins=20)

    ax1.set_xlabel('Value')
    ax1.set_ylabel('Frequency')
    ax1.set_title('Distribution of Histogram Values by Track')
    ax1.legend()

    # Plot 2: Sample stacked values
    x_pos = range(min(20, len(sample_data)))
    bottom = np.zeros(len(x_pos))

    for i in range(num_tracks):
        # Pad with zeros when a track has fewer parsed values than bins shown
        track_values = [values_by_track[i][j] if j < len(values_by_track[i]) else 0
                        for j in range(len(x_pos))]
        ax2.bar(x_pos, track_values, bottom=bottom, label=f'Track {i+1}',
                color=colors[i], alpha=0.8)
        # Raise the baseline so the next track stacks on top of this one
        bottom += track_values

    ax2.set_xlabel('Genomic Bin')
    ax2.set_ylabel('Stacked Values')
    ax2.set_title('Sample Stacked Histogram (First 20 bins)')
    ax2.legend()

    plt.tight_layout()
    plt.savefig('histogram_preview.png', dpi=150, bbox_inches='tight')
    plt.show()

# Render the two-panel preview of the generated bins; the figure is also
# saved as 'histogram_preview.png'.
plot_data_preview(hist_data)

In [None]:
# Execute Circos (if available)
def run_circos():
    """Run the ``circos`` command against ``circos.conf`` in the current directory.

    Nothing is returned; progress and problems are reported with ``print``:
    a missing ``circos.conf``, a missing ``circos`` executable, a non-zero
    exit status (with stderr and stdout), or, on success, the size of each
    ``circos.png`` / ``circos.svg`` file that exists.
    """
    # Guard: without a configuration file there is nothing to run
    if not os.path.exists('circos.conf'):
        print("✗ circos.conf not found")
        print("   Please ensure circos.conf exists in the current directory")
        return

    command = ['circos', '-conf', 'circos.conf']
    try:
        print("Executing Circos...")
        completed = subprocess.run(command, capture_output=True, text=True)

        # Failure path first: show what Circos reported and stop
        if completed.returncode != 0:
            print("✗ Circos execution failed:")
            print(f"   Error: {completed.stderr}")
            print(f"   Output: {completed.stdout}")
            return

        print("✓ Circos executed successfully")

        # Report whichever of the expected output images are present
        for image_path in ('circos.png', 'circos.svg'):
            if not os.path.exists(image_path):
                continue
            print(f"  → Generated: {image_path}")
            size_kb = os.path.getsize(image_path) / 1024
            print(f"     Size: {size_kb:.1f} KB")

    except FileNotFoundError:
        print("✗ Circos not found in PATH")
        print("   Install Circos to generate circular visualizations")
        print("   Data files have been prepared and are ready for Circos")

# Attempt the Circos run; a missing circos.conf or circos executable is
# reported with a printed message rather than an exception.
run_circos()

## Summary

This notebook successfully creates stacked histogram data for Circos visualization:

### Generated Files:
- ✓ `histogram.stacked2.txt` - Main histogram data
- ✓ `histogram.stacked.txt` - Same records and format as `histogram.stacked2.txt`, written under the original file name
- ✓ `trisomyHistogram.txt` - Trisomy-focused data
- ✓ `histogram_preview.png` - Data visualization preview

### Features:
- **Multi-track data**: 5 values per genomic bin (random placeholder values until real measurements are substituted)
- **Genome-wide coverage**: All human chromosomes with 5MB bins
- **Flexible configuration**: Easy to modify bin sizes and track numbers
- **Data visualization**: Preview plots to validate data distribution
- **Circos integration**: Ready for circular plot generation

### Next Steps:
1. Ensure `circos.conf` is properly configured
2. Run Circos to generate the circular visualization
3. Customize colors and styling in the configuration
4. Replace random data with actual genomic measurements

This Python implementation provides the same functionality as the original Perl-based workflow with improved maintainability and visualization capabilities.