In [1]:
# WindowCollapse50v02 - Collapse overlapping windows into insertion regions
"""
NONPARENTAL HAPLOTYPE INSERTION ANALYSIS - WINDOW OVERLAP DETECTOR

Scientific Purpose:
Identifies and collapses overlapping sliding windows to find genuine nonparental
haplotype insertion regions in 1000 Genomes trio families.

Key Discovery:
HG01505 shows 5 overlapping windows (206 max NPAs) representing a single massive
alien DNA insertion in the chr3:75.5Mb region.

Method:
- Processes top 50 significant windows (≥5 NPAs each)  
- Collapses windows within 50kb into single insertion regions
- Classifies families: Hybrid (strong insertions) vs Normal (baseline)

Input: window_ranking_COMPLETE_*.json (from sliding window analysis)
Output: Family classifications and overlap statistics

Dataset: 1000 Genomes NYGC 30x, 602 validated trios, chr3 analysis
Author: [Your name] | Date: 2025-05-29 | Version: WindowCollapse50v02
"""

import os
import json
from datetime import datetime

print(f"WindowCollapse50v02 - COLLAPSE OVERLAPPING WINDOWS - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Directory setup: this script addresses its files with paths relative to the
# working directory, so when it is launched from the programs/ subfolder we
# step up one level to the project root.
current_dir = os.path.normpath(os.getcwd())
# Compare only the final path component. A substring test on the whole path
# would also fire for unrelated locations (e.g. ".../programs_old/project")
# and move the process into the wrong directory.
if os.path.basename(current_dir) == 'programs':
    project_root = os.path.dirname(current_dir)
    os.chdir(project_root)
    print("Changed from programs/ to project root")
print(f"Working directory: {os.getcwd()}")

print("Loading window ranking data...")

# Load the window ranking file written by the sliding window analysis.
# NOTE: the path is pinned to one specific run (suffix 103602); it is not
# discovered automatically, so update it after re-running the upstream step.
latest_file = "outputs/window_ranking_COMPLETE_103602.json"
print(f"Using file: {latest_file}")

try:
    with open(latest_file, 'r') as f:
        ranking_data = json.load(f)
    print("✅ Window ranking data loaded successfully")
except FileNotFoundError:
    print(f"❌ ERROR: {latest_file} not found")
    # Help the user pick a replacement by listing the ranking files present.
    print("Available ranking files:")
    import glob
    for candidate in glob.glob("outputs/window_ranking_COMPLETE_*.json"):
        print(f"  {candidate}")
    # `exit()` is a convenience the `site` module injects for interactive
    # shells; raising SystemExit directly does not depend on it and lets the
    # process report failure (status 1) instead of success.
    raise SystemExit(1) from None

print(f"Total windows in file: {ranking_data['total_windows']}")
print("Top 50 windows available for analysis")

# The window records to analyse are stored under 'top_50_windows'.
windows = ranking_data['top_50_windows']
print(f"Analyzing {len(windows)} windows")

# Bucket the window records by their 'child_id' so that every family can be
# collapsed independently later on.
family_windows = {}
for record in windows:
    # Fetch (or create) this child's bucket first, then store a trimmed copy
    # of the record under the shorter key names used by the collapse step.
    bucket = family_windows.setdefault(record['child_id'], [])
    bucket.append({
        'center_position': record['window_center'],
        'npa_count': record['npa_count'],        # nonparental allele count
        'density': record['window_density'],
        'start': record['window_start'],
        'end': record['window_end'],
    })

print(f"Windows grouped across {len(family_windows)} families")

# Per-family window tally before any collapsing, for the run log.
print("\nInitial window counts per family:")
for child, child_windows in family_windows.items():
    print(f"  {child}: {len(child_windows)} windows")

def _start_region(window):
    """Return a new single-window region record seeded from *window*."""
    center = window['center_position']
    return {
        'start_pos': window['start'],
        'end_pos': window['end'],
        'center_pos': center,
        'total_npas': window['npa_count'],
        'max_npas': window['npa_count'],
        'max_density': window['density'],
        'window_count': 1,
        'positions': [center],
    }


def collapse_overlapping_windows(windows, overlap_threshold=50000):
    """
    Collapse sliding windows that represent the same insertion region.

    Scientific rationale: Multiple overlapping windows with high NPA counts
    likely represent a single nonparental haplotype insertion, not multiple
    separate events. This function identifies genuine insertion regions.

    Windows are sorted by centre position and merged by chaining: a window
    joins the current region when its centre lies within ``overlap_threshold``
    bp of the centre of the window most recently added to that region.
    Otherwise the current region is closed and a new one is started. Comparing
    against the most recent window (rather than the first window of the
    region) keeps a long run of neighbouring windows together even when the
    run as a whole is wider than the threshold.

    Args:
        windows: Sequence of dicts with the keys 'center_position',
            'npa_count', 'density', 'start' and 'end'. The input is not
            modified.
        overlap_threshold: Max distance (bp) between the centres of two
            neighbouring windows for them to be merged (inclusive).

    Returns:
        List of region dicts in ascending order of centre position. Each
        region holds:
            start_pos / end_pos: smallest 'start' / largest 'end' of its windows
            center_pos: centre of the first (left-most) window in the region
            total_npas: plain sum of the windows' 'npa_count' values (not
                de-duplicated across windows)
            max_npas / max_density: largest per-window value in the region
            window_count: number of windows merged into the region
            positions: centre positions of those windows, ascending
        An empty input yields an empty list.
    """
    if not windows:
        return []

    # Sort by genomic position so that neighbours in the list are neighbours
    # on the chromosome.
    sorted_windows = sorted(windows, key=lambda w: w['center_position'])
    if not sorted_windows:
        # Covers an exhausted iterator, which is truthy but yields nothing.
        return []

    collapsed_regions = []
    current_region = _start_region(sorted_windows[0])
    # Centre of the window most recently merged into current_region; this is
    # the anchor for the next gap test and advances with every window.
    previous_center = sorted_windows[0]['center_position']

    for window in sorted_windows[1:]:
        center_pos = window['center_position']
        npas = window['npa_count']
        density = window['density']

        if center_pos - previous_center <= overlap_threshold:
            # Same insertion region: widen the span and fold in the counts.
            current_region['end_pos'] = max(current_region['end_pos'], window['end'])
            current_region['start_pos'] = min(current_region['start_pos'], window['start'])
            current_region['total_npas'] += npas
            current_region['max_npas'] = max(current_region['max_npas'], npas)
            current_region['max_density'] = max(current_region['max_density'], density)
            current_region['window_count'] += 1
            current_region['positions'].append(center_pos)
        else:
            # Gap too large: close the current region and open a new one.
            collapsed_regions.append(current_region)
            current_region = _start_region(window)

        previous_center = center_pos

    # The region still being built when the loop ends has not been stored yet.
    collapsed_regions.append(current_region)

    return collapsed_regions

print("\nCollapsing overlapping windows...")

# Run the collapse step once per family, keep the resulting regions, and log
# whether any of that family's windows were merged.
family_regions = {}
for child, child_windows in family_windows.items():
    merged = collapse_overlapping_windows(child_windows)
    family_regions[child] = merged

    n_windows, n_regions = len(child_windows), len(merged)
    if n_regions < n_windows:
        # Fewer regions than windows means at least one merge happened.
        status_line = f"🔄 {child}: {n_windows} windows → {n_regions} regions"
    else:
        status_line = f"✅ {child}: {n_windows} windows (no overlap)"
    print(status_line)

# Condense every family's regions into a single summary record. Families
# without any region are left out.
family_stats = [
    {
        'family_id': child,
        'total_regions': len(child_regions),
        'total_npas': sum(region['total_npas'] for region in child_regions),
        # Largest single-window NPA count found in any region of the family.
        'max_npas_in_region': max(region['max_npas'] for region in child_regions),
        'max_density': max(region['max_density'] for region in child_regions),
        # Size of the family's biggest merge (1 means nothing was merged).
        'max_windows_in_region': max(region['window_count'] for region in child_regions),
        'regions': child_regions,
    }
    for child, child_regions in family_regions.items()
    if child_regions
]

# Rank strongest first. The sort is stable, so families tied on the key keep
# the order in which they were summarised.
family_stats.sort(key=lambda entry: entry['max_npas_in_region'], reverse=True)

print(f"\n{'=' * 80}")
print("🔥 OVERLAPPING WINDOW ANALYSIS - TOP FAMILIES")
print("=" * 80)

# One report section per family, in ranked order. Every entry of family_stats
# is printed, not just a leading subset.
for rank, entry in enumerate(family_stats, start=1):
    print(f"\n{rank}. {entry['family_id']}:")
    print(f"   Insertion regions: {entry['total_regions']}")
    print(f"   Total NPAs: {entry['total_npas']}")
    print(f"   Strongest region: {entry['max_npas_in_region']} NPAs (density: {entry['max_density']:.2f})")
    print(f"   Max overlapping windows: {entry['max_windows_in_region']}")

    # Coordinates and per-region figures for each collapsed region.
    for number, reg in enumerate(entry['regions'], start=1):
        first_bp, last_bp = reg['start_pos'], reg['end_pos']
        n_merged = reg['window_count']
        span_kb = (last_bp - first_bp) / 1000
        # Mean NPA count over the windows that were merged into this region.
        mean_npas = reg['total_npas'] / n_merged

        print(f"     Region {number}: {first_bp:,}-{last_bp:,} bp ({span_kb:.1f}kb)")
        print(f"       Overlapping windows: {n_merged} | Max NPAs: {reg['max_npas']} | Avg: {mean_npas:.1f}")

# Split the ranked families into the two comparison groups.
print(f"\n{'=' * 80}")
print("📊 FAMILY CLASSIFICATION FOR DETAILED ANALYSIS")
print("=" * 80)

# "Hybrid" group: the four highest-ranked families.
hybrid_families = family_stats[:4]

# "Normal" group: the four lowest-ranked families. With fewer than eight
# families, ranks 5-8 are taken instead so the two groups never share a family.
if len(family_stats) >= 8:
    normal_families = family_stats[-4:]
else:
    normal_families = family_stats[4:8]

# Both groups are listed in the same format; only the heading differs.
for heading, group in (
    ("\n🔥 HYBRID FAMILIES (Strong Alien Insertions):", hybrid_families),
    ("\n✅ NORMAL FAMILIES (Weaker Insertions):", normal_families),
):
    print(heading)
    for rank, entry in enumerate(group, start=1):
        n_merged = entry['max_windows_in_region']
        # Only mention overlap when at least two windows were merged.
        suffix = f" ({n_merged} overlapping)" if n_merged > 1 else ""
        print(f"  {rank}. {entry['family_id']}: {entry['max_npas_in_region']} max NPAs{suffix}")

# Highlight the families whose largest region was built from three or more
# merged windows.
print(f"\n{'=' * 80}")
print("🔍 OVERLAPPING WINDOW INSIGHTS")
print("=" * 80)

heavily_overlapped = [
    entry for entry in family_stats
    if entry['max_windows_in_region'] >= 3
]
n_heavy = len(heavily_overlapped)
print(f"Families with 3+ overlapping windows: {n_heavy}")
for entry in heavily_overlapped:
    family_label, n_merged = entry['family_id'], entry['max_windows_in_region']
    print(f"  {family_label}: {n_merged} overlapping windows in strongest region")

# Assemble the results document section by section; key order is kept as-is
# because it determines the layout of the written JSON.
results = {}

# Run provenance.
# NOTE(review): 'overlap_threshold_bp' is a literal here; it matches the
# default of collapse_overlapping_windows but is not read from it.
results['analysis_metadata'] = {
    'version': 'WindowCollapse50v02',
    'analysis_date': datetime.now().isoformat(),
    'dataset': '1000 Genomes NYGC 30x high-coverage trio data',
    'method': 'Sliding window overlap detection for nonparental haplotype insertions',
    'overlap_threshold_bp': 50000,
    'significance_threshold': '≥5 NPAs per window',
}
results['source_data'] = 'top_50_windows_only'

# Headline numbers. family_stats is ranked, so its first entry (if any) is
# the strongest candidate.
have_families = len(family_stats) > 0
results['scientific_findings'] = {
    'total_families_analyzed': len(family_stats),
    'families_with_overlaps': sum(1 for entry in family_stats if entry['max_windows_in_region'] > 1),
    'heavily_overlapped_families': len(heavily_overlapped),
    'strongest_insertion_candidate': family_stats[0]['family_id'] if have_families else None,
    'max_npas_found': family_stats[0]['max_npas_in_region'] if have_families else 0,
}

# IDs of the two comparison groups, followed by the full per-family records.
results['selected_families'] = {
    'hybrid_families': [entry['family_id'] for entry in hybrid_families],
    'normal_families': [entry['family_id'] for entry in normal_families],
}
results['detailed_stats'] = family_stats

# NOTE(review): the file name carries only the time of day (HHMMSS), so runs
# made at the same clock time on different days would share a name.
output_file = f"outputs/window_overlap_analysis_{datetime.now().strftime('%H%M%S')}.json"
with open(output_file, 'w') as out_handle:
    json.dump(results, out_handle, indent=2)

print(f"\n💾 Analysis saved: {output_file}")
print("✅ Identified overlapping window patterns in alien insertion families")
print("✅ Ready for hybrid vs normal family comparison")

# Closing summary: headline figures for the top-ranked family. Nothing but
# the banner is printed when no family statistics exist.
print(f"\n{'=' * 80}")
print("📊 SCIENTIFIC SUMMARY FOR PUBLICATION")
print("=" * 80)
if family_stats:
    strongest = family_stats[0]
    summary_lines = (
        f"Strongest insertion candidate: {strongest['family_id']}",
        f"Maximum NPAs in single region: {strongest['max_npas_in_region']}",
        f"Maximum overlapping windows: {strongest['max_windows_in_region']}",
        f"Families showing significant overlap: {len(heavily_overlapped)}/{len(family_stats)}",
    )
    print(*summary_lines, sep="\n")

WindowCollapse50v02 - COLLAPSE OVERLAPPING WINDOWS - 2025-05-29 14:34:56
Changed from programs/ to project root
Working directory: C:\Users\mremp\00XG1py\20250528Trios1k
Loading window ranking data...
Using file: outputs/window_ranking_COMPLETE_103602.json
✅ Window ranking data loaded successfully
Total windows in file: 128
Top 50 windows available for analysis
Analyzing 50 windows
Windows grouped across 21 families

Initial window counts per family:
  HG01505: 6 windows
  HG02596: 3 windows
  HG02293: 4 windows
  HG02809: 4 windows
  HG01955: 3 windows
  HG02602: 2 windows
  HG01611: 1 windows
  HG03763: 2 windows
  NA19705: 3 windows
  HG00480: 3 windows
  HG03191: 3 windows
  HG02812: 3 windows
  NA10830: 1 windows
  HG00585: 1 windows
  HG01135: 3 windows
  HG02222: 1 windows
  HG02148: 1 windows
  HG03540: 1 windows
  HG02644: 3 windows
  HG02886: 1 windows
  HG01514: 1 windows

Collapsing overlapping windows...
🔄 HG01505: 6 windows → 2 regions
🔄 HG02596: 3 windows → 1 regions
🔄 H