# Variant Effect Analysis with Chorus

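This notebook demonstrates how to analyze the effects of genetic variants on regulatory activity using Chorus oracles. The effect sizes and scores shown below are simulated with random numbers for demonstration; replace the simulated values with the oracle calls shown in the code comments to obtain real predictions.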

## Setup

In [None]:
import random
from pathlib import Path

import chorus
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## 1. Create Sample VCF File

In [None]:
# Create a sample VCF file with some variants.
# Columns are joined with an explicit "\t" rather than literal tab characters,
# so the file stays valid VCF even if an editor or notebook export converts
# tabs in the source to spaces.
vcf_header_lines = [
    "##fileformat=VCFv4.3",
    "##reference=hg38",
    "\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]),
]

# One tuple per record: (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO).
# rs125 is multi-allelic, rs126 is a deletion and rs127 is an insertion.
vcf_records = [
    ("chr1", 1000100, "rs123", "A", "G", 30, "PASS", "AF=0.1"),
    ("chr1", 1000200, "rs124", "C", "T", 40, "PASS", "AF=0.2"),
    ("chr1", 1000300, "rs125", "G", "C,T", 50, "PASS", "AF=0.15,0.05"),
    ("chr1", 1000500, "rs126", "ATG", "A", 35, "PASS", "AF=0.08"),
    ("chr1", 1000600, "rs127", "T", "TAAA", 45, "PASS", "AF=0.12"),
]

vcf_record_lines = ["\t".join(str(field) for field in record) for record in vcf_records]
# The file ends with a single trailing newline after the last record.
vcf_content = "\n".join(vcf_header_lines + vcf_record_lines) + "\n"

with open('example_variants.vcf', 'w') as f:
    f.write(vcf_content)

print("Created example_variants.vcf")

In [None]:
# Load the example VCF through Chorus' VCF parser
variants = chorus.parse_vcf('example_variants.vcf')

n_variants = len(variants)
print(f"Loaded {n_variants} variants")

# Heading and preview emitted by one call; `sep` keeps them on separate lines
print("\nFirst few variants:", variants.head(), sep="\n")

## 2. Single Variant Analysis

In [None]:
# Build the Enformer oracle
oracle = chorus.create_oracle('enformer')

# The first record of the parsed VCF serves as the worked example
variant = variants.iloc[0]
variant_label = f"{variant['chrom']}:{variant['pos']} {variant['ref']}->{variant['alt']}"
print(f"Analyzing variant: {variant_label}")

In [None]:
# Define the region around the variant (±100kb).
# The start is clamped to 1 so a variant close to the chromosome start never
# produces a negative coordinate in the region string.
window_size = 100000
region_start = max(1, variant['pos'] - window_size)
region_end = variant['pos'] + window_size
region = f"{variant['chrom']}:{region_start}-{region_end}"

# Define assays to analyze
assays = [
    "DNase:K562",
    "ATAC-seq:GM12878",
    "ChIP-seq_H3K27ac:K562",
    "CAGE:HepG2"
]

# Predict variant effects (requires loaded model and reference genome)
# results = oracle.predict_variant_effect(
#     genomic_region=region,
#     variant_position=f"{variant['chrom']}:{variant['pos']}",
#     alleles=[variant['ref'], variant['alt']],
#     assay_ids=assays,
#     genome='hg38.fa'
# )

# Simulated results for demonstration.
# Standard deviation of the simulated effect for each assay; keyed by assay id
# so the simulated results always cover exactly the entries of `assays`.
simulated_effect_scale = {
    "DNase:K562": 0.5,
    "ATAC-seq:GM12878": 0.3,
    "ChIP-seq_H3K27ac:K562": 0.4,
    "CAGE:HepG2": 0.6,
}

# Seeded generator: re-running the cell reproduces the same simulated effects
rng = np.random.default_rng(0)

results = {
    'effect_sizes': {
        'alt_1': {
            assay: float(rng.standard_normal() * simulated_effect_scale[assay])
            for assay in assays
        }
    },
    'variant_info': {
        'position': f"{variant['chrom']}:{variant['pos']}",
        'ref': variant['ref'],
        'alts': [variant['alt']]
    }
}

In [None]:
# Horizontal bar chart of the per-assay effects for the first alternate allele
variant_info = results['variant_info']
effect_data = results['effect_sizes']['alt_1']
assay_names = list(effect_data)
effect_values = [effect_data[name] for name in assay_names]

# Negative effects are drawn in red, everything else in green
colors = ['red' if value < 0 else 'green' for value in effect_values]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(assay_names, effect_values, color=colors)
ax.set_xlabel('Effect Size (Alt - Ref)')
ax.set_title(f"Variant Effect: {variant_info['position']} {variant_info['ref']}->{variant_info['alts'][0]}")
# Dashed reference line at zero effect
ax.axvline(x=0, color='black', linestyle='--', alpha=0.5)
fig.tight_layout()
fig.savefig('variant_effect_single.png')
plt.show()

## 3. Multiple Variant Analysis

In [None]:
# Analyze all variants in the VCF file.
# The loop variable is called `record` so it does not overwrite the `variant`
# selected for the single-variant analysis earlier in the notebook.

# Standard deviation of the simulated effect for each output column
effect_scales = {
    'DNase_effect': 0.5,
    'ATAC_effect': 0.4,
    'H3K27ac_effect': 0.3,
    'CAGE_effect': 0.6,
}

# Seeded generator: re-running the cell reproduces the same simulated effects
rng = np.random.default_rng(1)

all_effects = []
for _, record in variants.iterrows():
    # Skip multi-allelic variants for simplicity
    if ',' in record['alt']:
        continue

    # Simulate effect prediction (replace with actual prediction)
    effects = {
        'variant_id': record['id'],
        'position': f"{record['chrom']}:{record['pos']}",
        'ref': record['ref'],
        'alt': record['alt'],
    }
    for column, scale in effect_scales.items():
        effects[column] = float(rng.standard_normal() * scale)
    all_effects.append(effects)

# Explicit columns keep the frame's schema intact even when every record was
# skipped, so cells that select the effect columns still work on empty input.
effects_df = pd.DataFrame(
    all_effects,
    columns=['variant_id', 'position', 'ref', 'alt', *effect_scales],
)
print(f"Analyzed {len(effects_df)} variants")
print(effects_df)

In [None]:
# Heatmap of variant effects: one row per assay, one column per variant
effect_cols = ['DNase_effect', 'ATAC_effect', 'H3K27ac_effect', 'CAGE_effect']
effect_matrix = effects_df[effect_cols].values

heatmap_options = dict(
    xticklabels=effects_df['variant_id'],
    yticklabels=['DNase', 'ATAC', 'H3K27ac', 'CAGE'],
    cmap='RdBu_r',
    center=0,  # diverging colormap centred on zero effect
    cbar_kws={'label': 'Effect Size'},
)

fig, ax = plt.subplots(figsize=(8, 6))
# Transposed so that assays run along the y-axis
sns.heatmap(effect_matrix.T, ax=ax, **heatmap_options)
ax.set_title('Variant Effects Across Assays')
ax.set_xlabel('Variant')
ax.set_ylabel('Assay')
fig.tight_layout()
fig.savefig('variant_effects_heatmap.png')
plt.show()

## 4. Variant Prioritization

In [None]:
# Summarise each variant's impact from the absolute per-assay effects
abs_effects = effects_df[effect_cols].abs()
effects_df['max_abs_effect'] = abs_effects.max(axis=1)
effects_df['mean_abs_effect'] = abs_effects.mean(axis=1)
# An assay counts as affected when its absolute effect exceeds 0.3
effects_df['n_affected_assays'] = abs_effects.gt(0.3).sum(axis=1)

# Largest maximum effect first
prioritized = effects_df.sort_values(by='max_abs_effect', ascending=False)

report_columns = ['variant_id', 'position', 'max_abs_effect', 'n_affected_assays']
print("Top variants by maximum effect size:")
print(prioritized[report_columns].head())

In [None]:
# Two-panel figure: impact scatter on the left, ranked bar chart on the right
fig, (ax_scatter, ax_ranked) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: max effect vs number of affected assays, coloured by mean effect
points = ax_scatter.scatter(
    effects_df['max_abs_effect'],
    effects_df['n_affected_assays'],
    c=effects_df['mean_abs_effect'],
    cmap='viridis',
    s=100,
    alpha=0.6,
)
ax_scatter.set_xlabel('Maximum Absolute Effect')
ax_scatter.set_ylabel('Number of Affected Assays (|effect| > 0.3)')
ax_scatter.set_title('Variant Impact Analysis')
fig.colorbar(points, ax=ax_scatter, label='Mean Absolute Effect')

# Right panel: the five highest-ranked variants
top_five = prioritized.head(5)
ax_ranked.barh(top_five['variant_id'], top_five['max_abs_effect'])
ax_ranked.set_xlabel('Maximum Absolute Effect')
ax_ranked.set_title('Top 5 High-Impact Variants')

fig.tight_layout()
fig.savefig('variant_prioritization.png')
plt.show()

## 5. In Silico Mutagenesis

In [None]:
# Perform systematic mutagenesis of a regulatory element
# Example: TATA box region
reference_seq = "GGCATATAAAAGGCAG"
tata_motif = "TATAAA"

# Locate the motif in the sequence instead of hard-coding its offset, so the
# printed annotation and the in_tata flag always line up with the real motif
# (TATAAA starts at index 4 of this sequence).
tata_start = reference_seq.find(tata_motif)
if tata_start < 0:
    raise ValueError(f"Motif {tata_motif} not found in reference sequence")
tata_end = tata_start + len(tata_motif)  # exclusive end index

seq_label = "Reference sequence: "
print(f"{seq_label}{reference_seq}")
# Pad by the label width plus the motif offset so the motif prints directly
# underneath its occurrence in the reference sequence.
print(f"{' ' * (len(seq_label) + tata_start)}{tata_motif}")

# Generate all single nucleotide variants (3 alternatives per position),
# ordered by position and then by alternate base A, C, G, T.
mutations = []
for pos, ref_base in enumerate(reference_seq):
    for alt_base in 'ACGT':
        if alt_base == ref_base:
            continue
        mutations.append({
            'position': pos,
            'ref': ref_base,
            'alt': alt_base,
            'mutant_seq': reference_seq[:pos] + alt_base + reference_seq[pos+1:],
            'in_tata': tata_start <= pos < tata_end  # TATA box positions
        })

mutations_df = pd.DataFrame(mutations)
print(f"\nGenerated {len(mutations_df)} single nucleotide variants")

In [None]:
# Simulate predicted effects for each mutation
# In practice, you would use the oracle to predict actual effects

# Seeded generator: re-running the cell reproduces the same simulated effects
rng = np.random.default_rng(2)

# One standard-normal draw per mutation, scaled in a single vectorised step:
# mutations flagged in_tata get stronger effects biased toward negative values.
in_tata = mutations_df['in_tata'].to_numpy(dtype=bool)
noise = rng.standard_normal(len(mutations_df))
mutations_df['promoter_effect'] = np.where(in_tata, noise * 2.0 - 1.0, noise * 0.5)

# Create mutation effect matrix: rows are alternate bases, columns are sequence
# positions. Cells for the reference base at each position are left at 0.
effect_matrix = np.zeros((4, len(reference_seq)))
base_to_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
matrix_rows = mutations_df['alt'].map(base_to_idx).to_numpy()
matrix_cols = mutations_df['position'].to_numpy(dtype=int)
effect_matrix[matrix_rows, matrix_cols] = mutations_df['promoter_effect'].to_numpy()

# Visualize
fig, ax = plt.subplots(figsize=(12, 4))
im = ax.imshow(effect_matrix, cmap='RdBu_r', aspect='auto', vmin=-3, vmax=3)
fig.colorbar(im, ax=ax, label='Effect on Promoter Activity')
ax.set_yticks([0, 1, 2, 3])
ax.set_yticklabels(['A', 'C', 'G', 'T'])
ax.set_xticks(range(len(reference_seq)))
ax.set_xticklabels(list(reference_seq))
ax.set_xlabel('Position in Sequence')
ax.set_ylabel('Mutant Base')
ax.set_title('In Silico Mutagenesis: TATA Box Region')

# Highlight TATA box. The band is derived from the in_tata flags so it always
# covers exactly the flagged columns (each column spans position ± 0.5).
tata_positions = mutations_df.loc[mutations_df['in_tata'], 'position']
if not tata_positions.empty:
    ax.axvspan(
        tata_positions.min() - 0.5,
        tata_positions.max() + 0.5,
        alpha=0.2,
        color='yellow',
        label='TATA box',
    )
    ax.legend()
fig.tight_layout()
fig.savefig('mutagenesis_heatmap.png')
plt.show()

## 6. Sequence Optimization

In [None]:
# Example: Optimize a sequence to maximize CAGE signal
# Start with a random sequence. Both random sources are seeded local
# generators, so re-running the notebook reproduces the same trajectory.
seq_rng = random.Random(42)            # drives the start sequence and mutations
score_rng = np.random.default_rng(42)  # drives the simulated scores

seq_length = 200
current_seq = ''.join(seq_rng.choices('ACGT', k=seq_length))

# Simulated optimization (in practice, use actual predictions)
optimization_history = []
best_score = -np.inf
best_seq = current_seq

for iteration in range(50):
    # Make random mutation: pick a position and a base different from the current one
    pos = seq_rng.randrange(seq_length)
    old_base = current_seq[pos]
    new_base = seq_rng.choice([b for b in 'ACGT' if b != old_base])

    # Create mutant sequence
    mutant_seq = current_seq[:pos] + new_base + current_seq[pos+1:]

    # Simulate prediction (replace with actual oracle prediction)
    # score = oracle.predict_sequence_score(mutant_seq, 'CAGE:K562')
    # The simulated score ignores the sequence; the small per-iteration drift
    # mimics gradual improvement.
    score = float(score_rng.standard_normal()) + 0.01 * iteration

    # Accept if better: the mutant becomes the starting point for later mutations
    if score > best_score:
        best_score = score
        best_seq = mutant_seq
        current_seq = mutant_seq

    # Every proposal is recorded, whether or not it was accepted
    optimization_history.append({
        'iteration': iteration,
        'score': score,
        'best_score': best_score,
        'position': pos,
        'mutation': f"{old_base}->{new_base}"
    })

opt_df = pd.DataFrame(optimization_history)

In [None]:
# Score of every proposal and the running best, per iteration
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(opt_df['iteration'], opt_df['score'], 'o-', alpha=0.5, label='Current score')
ax.plot(opt_df['iteration'], opt_df['best_score'], 'r-', linewidth=2, label='Best score')
ax.set_xlabel('Iteration')
ax.set_ylabel('CAGE Signal Score')
ax.set_title('Sequence Optimization Progress')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('optimization_progress.png')
plt.show()

# Improvement is measured against the score of the very first proposal
initial_score = optimization_history[0]['score']
print(f"Initial score: {initial_score:.3f}")
print(f"Final best score: {best_score:.3f}")
print(f"Improvement: {best_score - initial_score:.3f}")

## 7. Export Results

In [None]:
# Save all variant analysis results
output_dir = Path('variant_analysis_results')
output_dir.mkdir(parents=True, exist_ok=True)

# (frame, file name, description used in the confirmation message)
csv_exports = [
    (effects_df, 'variant_effects.csv', 'variant effects'),
    (prioritized, 'prioritized_variants.csv', 'prioritized variants'),
    (mutations_df, 'mutagenesis_results.csv', 'mutagenesis results'),
]
for frame, filename, description in csv_exports:
    csv_path = output_dir / filename
    frame.to_csv(csv_path, index=False)
    print(f"Saved {description} to {csv_path}")

# Create summary report.
# Guard the "top variant" section: with no analyzed variants there is no first
# row to report, and indexing it would raise IndexError.
if prioritized.empty:
    top_variant_report = "Top variant: none (no variants were analyzed)\n"
else:
    top_variant = prioritized.iloc[0]
    top_variant_report = (
        f"Top variant: {top_variant['variant_id']}\n"
        f"  Position: {top_variant['position']}\n"
        f"  Max effect: {top_variant['max_abs_effect']:.3f}\n"
        f"  Affected assays: {top_variant['n_affected_assays']}\n"
    )

summary = f"""Variant Analysis Summary
========================
Total variants analyzed: {len(effects_df)}
High-impact variants (|effect| > 1.0): {(effects_df['max_abs_effect'] > 1.0).sum()}
Multi-assay variants (≥3 assays affected): {(effects_df['n_affected_assays'] >= 3).sum()}

{top_variant_report}"""

# The summary contains a non-ASCII character (≥); write it as UTF-8 explicitly
# because the platform default encoding (e.g. cp1252) may be unable to encode it.
with open(output_dir / 'analysis_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary)

print("\n" + summary)

## Summary

This notebook demonstrated:
1. Loading and parsing VCF files
2. Analyzing single variant effects across multiple assays
3. Batch analysis of multiple variants
4. Variant prioritization based on effect sizes
5. In silico mutagenesis of regulatory elements
6. Sequence optimization to maximize regulatory signals
7. Exporting comprehensive analysis results

These techniques can be applied to:
- Interpret disease-associated variants
- Design synthetic regulatory elements
- Understand sequence-function relationships
- Prioritize variants for experimental validation