# Enformer Oracle Example

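This notebook demonstrates advanced usage of the Enformer oracle for predicting genomic regulatory activity. Cells that need the pre-trained model or a reference genome are left commented out; uncomment them once the model has been loaded in Section 1.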

## Setup

In [ ]:
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import chorus
from chorus.utils import get_genome

## 1. Initialize and Load Enformer

In [None]:
# Instantiate the Enformer oracle
enformer = chorus.EnformerOracle()

# Report the model's input/output geometry, one specification per line
print(
    f"Sequence length: {enformer.sequence_length:,} bp",
    f"Prediction window: {enformer.center_length:,} bp",
    f"Output bins: {enformer.target_length}",
    f"Bin size: {enformer.bin_size} bp",
    sep="\n",
)

In [None]:
# Load the pre-trained model (this will download the ~1.5 GB model)
# enformer.load_pretrained_model()  # Uncomment to load

## 2. Predict Activity for a Gene Promoter

In [ ]:
# Example: MYC locus on chromosome 8.
# The interval must use the same assembly as the reference genome fetched
# below (hg38). Note that chr8:128748315-128753680 is this locus in
# hg19/GRCh37 and must not be mixed with an hg38 reference (it lies ~1 Mb away
# from MYC in hg38).
# NOTE(review): the hg38 interval keeps the width of the hg19 one and applies
# the local assembly offset — confirm exact bases with liftOver.
myc_promoter = "chr8:127736069-127741434"  # hg38 coordinates

# Select assays of interest ("<assay>:<cell type>" identifiers)
assays = [
    "DNase:K562",
    "CAGE:K562",
    "ChIP-seq_H3K4me3:K562",
    "ChIP-seq_H3K27ac:K562",
    "RNA-seq:K562",
]

# Make predictions (requires model loaded and reference genome)
# # Get reference genome automatically
# reference_fasta = get_genome('hg38')  # Downloads if needed
#
# results = enformer.predict_region_replacement(
#     genomic_region=myc_promoter,
#     seq="",  # Extract from reference
#     assay_ids=assays,
#     create_tracks=True,
#     genome=reference_fasta
# )

## 3. Enhancer Insertion Analysis

In [ ]:
# Example enhancer sequence (~180 bp). It is written over several lines for
# readability; all whitespace is stripped so only nucleotides remain.
enhancer_seq = "".join("""
GGATCCAAGGCTGCAGCAGAGGGGCAAAGTGAGGCAGCACAATCTCCAAAGAAGGCAGACCTGACA
GCAGCTCAGGGAGGGGTGGGAGCCCCGGTGAGGAGACAGAAGGAGAAAATGGGCAGAGACTCAGAG
TGGGGGCTTCTCAGGGACCCAGGCGGGTATAAAGGGAGCCCTCAGCCCC
""".split())
print(f"Enhancer length: {len(enhancer_seq)} bp")

# Test insertion at different positions.
# Positions are hg38 coordinates so that they match get_genome('hg38') below;
# values near chr8:128.74 Mb would be the hg19/GRCh37 position of this locus.
# NOTE(review): distances are relative to a MYC TSS at ~chr8:127,736,069
# (hg38) — confirm with liftOver / a genome browser.
insertion_positions = [
    "chr8:127728000",  # ~8kb upstream of MYC
    "chr8:127733000",  # ~3kb upstream of MYC
    "chr8:127736000",  # Near TSS
]

# The reference genome does not depend on the position, so fetch it once
# before the loop (requires download)
# reference_fasta = get_genome('hg38')  # Downloads if needed

# Analyze each insertion
for position in insertion_positions:
    print(f"\nAnalyzing insertion at {position}")

    # Predict effects (requires loaded model)
    # results = enformer.predict_region_insertion_at(
    #     genomic_position=position,
    #     seq=enhancer_seq,
    #     assay_ids=["CAGE:K562", "RNA-seq:K562"],
    #     genome=reference_fasta
    # )
    #
    # # Analyze impact on gene expression
    # cage_signal = results['normalized_scores']['CAGE:K562'].mean()
    # rna_signal = results['normalized_scores']['RNA-seq:K562'].mean()
    #
    # print(f"  CAGE signal: {cage_signal:.3f}")
    # print(f"  RNA-seq signal: {rna_signal:.3f}")

## 4. Contribution Score Analysis

In [None]:
# Example: computing per-base contribution scores.
# Running it requires the model to be loaded and TensorFlow imported as `tf`
# (the snippet calls tf.constant, which this notebook does not import).

# test_seq = "ATCG" * 98304  # 393,216 bp sequence
# 
# # One-hot encode
# one_hot = enformer._one_hot_encode(test_seq)
# 
# # Create target mask for central region
# target_mask = np.zeros(896)
# target_mask[400:500] = 1  # Focus on central 100 bins
# 
# # Compute contribution scores
# contrib_scores = enformer.compute_contribution_scores(
#     tf.constant(one_hot, dtype=tf.float32),
#     tf.constant(target_mask, dtype=tf.float32)
# )
# 
# # Find most important positions
# top_positions = np.argsort(contrib_scores.numpy())[-100:]
# print(f"Top contributing positions: {top_positions}")

## 5. Batch Prediction from BED File

In [None]:
# Toy regions of interest, one (chrom, start, end, name) tuple per BED record
region_records = [
    ('chr1', 1000000, 1200000, 'Region1'),
    ('chr2', 2000000, 2200000, 'Region2'),
    ('chr3', 3000000, 3200000, 'Region3'),
]
regions_df = pd.DataFrame(
    region_records,
    columns=['chrom', 'start', 'end', 'name'],
)

# Write the table as tab-separated text without header or index
regions_df.to_csv(
    'test_regions.bed',
    sep='\t',
    header=False,
    index=False,
)
print("Created test_regions.bed")
print(regions_df)

In [ ]:
# Batch predict for all regions (requires loaded model)
# # Get reference genome automatically
# reference_fasta = get_genome('hg38')  # Downloads if needed
#
# results = enformer.predict_from_bed_file(
#     bed_file='test_regions.bed',
#     assay_ids=['DNase:K562', 'ATAC-seq:K562'],
#     genome=reference_fasta,
#     output_dir='./enformer_predictions'
# )
# 
# print(f"Generated {len(results)} track predictions")

## 6. Visualizing Enformer Predictions

In [None]:
# Create synthetic Enformer-style predictions for visualization
def create_enformer_style_track(name, pattern_func, n_bins=896, bin_size=128,
                                chrom='chr1', cell_type='K562'):
    """Build a synthetic binned track and export it through ``chorus.Track``.

    Parameters
    ----------
    name : str
        Track name. The text before the first ``'_'`` is used as the assay
        type, and the full name is embedded in the output filename.
    pattern_func : callable
        Called once with the array of bin start positions (in bp); its return
        value becomes the ``value`` column of the track.
    n_bins : int, optional
        Number of bins in the track (default 896).
    bin_size : int, optional
        Width of each bin in bp (default 128).
    chrom : str, optional
        Chromosome label written for every bin (default ``'chr1'``).
    cell_type : str, optional
        Cell type passed to ``chorus.Track`` (default ``'K562'``).

    Returns
    -------
    str
        The filename ``enformer_<name>.bedgraph`` that was passed to
        ``track.to_bedgraph``.
    """
    # Bin i covers [i * bin_size, (i + 1) * bin_size)
    positions = np.arange(n_bins) * bin_size
    values = pattern_func(positions)

    track_data = pd.DataFrame({
        'chrom': [chrom] * n_bins,
        'start': positions,
        'end': positions + bin_size,
        'value': values
    })

    track = chorus.Track(
        name=name,
        assay_type=name.split('_')[0],
        cell_type=cell_type,
        data=track_data
    )

    filename = f"enformer_{name}.bedgraph"
    track.to_bedgraph(filename)
    return filename

# Create different track patterns: each is a Gaussian peak centred on the
# window plus a little uniform noise. The noise comes from a seeded generator
# so the figure is identical on every Restart & Run All.
NOISE_SEED = 42
WINDOW_CENTER_BP = 896 * 128 // 2  # 57,344 bp: midpoint of 896 bins x 128 bp
noise_rng = np.random.default_rng(NOISE_SEED)


def gaussian_peak_with_noise(height, width_bp, noise_scale):
    """Return a function mapping bin positions to a noisy Gaussian peak.

    The peak has amplitude ``height`` and standard deviation ``width_bp`` and
    is centred on ``WINDOW_CENTER_BP``; uniform noise in
    ``[0, noise_scale)`` is added to every bin.
    """
    def pattern(x):
        peak = height * np.exp(-((x - WINDOW_CENTER_BP) ** 2) / (2 * width_bp ** 2))
        return peak + noise_rng.random(len(x)) * noise_scale
    return pattern


# (track name, peak height, peak width in bp, noise amplitude)
track_specs = [
    ('DNase_K562', 1, 20000, 0.1),     # DNase - broad accessibility
    ('H3K4me3_K562', 2, 5000, 0.05),   # H3K4me3 - sharp promoter peak
    ('CAGE_K562', 3, 1000, 0.02),      # CAGE - transcription start site
]

track_files = [
    create_enformer_style_track(
        track_name,
        gaussian_peak_with_noise(height, width_bp, noise_scale),
    )
    for track_name, height, width_bp, noise_scale in track_specs
]

# Visualize
chorus.visualize_tracks(
    tracks_filenames=track_files,
    track_names=['DNase', 'H3K4me3', 'CAGE'],
    colors=['blue', 'green', 'red'],
    scales=[(0, 1.2), (0, 2.5), (0, 3.5)],
    figure_size=(12, 8),
    output_file='enformer_predictions_viz.png'
)

print("Visualization saved to enformer_predictions_viz.png")

## 7. Track Comparison and Correlation

In [None]:
# Correlate the first two generated tracks and save the comparison figure
dnase_file = track_files[0]
h3k4me3_file = track_files[1]

comparison_args = {
    'track1_file': dnase_file,
    'track2_file': h3k4me3_file,
    'track1_name': 'DNase',
    'track2_name': 'H3K4me3',
    'output_file': 'track_correlation.png',
}
stats = chorus.plot_track_comparison(**comparison_args)

# Summarise the returned statistics
print(f"Correlation: {stats['correlation']:.3f}")
print(f"P-value: {stats['p_value']:.3e}")
print(f"Number of intervals: {stats['n_intervals']}")

## 8. Advanced: Custom Sequence Design

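This section shows how you might assemble a candidate regulatory sequence from known promoter motifs and then score it with Enformer, as a starting point for designing sequences that maximize a chosen signal.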

In [ ]:
# Example: Design a synthetic promoter sequence
# Key motifs for promoter activity
tata_box = "TATAAA"
gc_box = "GGGCGG"
caat_box = "CCAAT"
inr = "YYANWYY"  # IUPAC consensus: Y=C/T, N=any, W=A/T

# Fixed seed so the randomly filled spacers are identical on every run
PROMOTER_SEED = 42

# Template of the promoter in reading order; every 'N' is a placeholder that
# is replaced by a random nucleotide below
promoter_template = "".join([
    "N" * 80,     # Spacer
    gc_box * 2,   # GC boxes
    "N" * 50,     # Spacer
    caat_box,     # CAAT box
    "N" * 25,     # Spacer
    tata_box,     # TATA box
    "N" * 25,     # to TSS
    "CCANACT",    # Initiator: one instance of the `inr` consensus, N left free
])

# Replace N with random nucleotides drawn from a dedicated, seeded generator
promoter_rng = random.Random(PROMOTER_SEED)
synthetic_promoter = "".join(
    promoter_rng.choice("ACGT") if base == "N" else base
    for base in promoter_template
)

print(f"Synthetic promoter ({len(synthetic_promoter)}bp):")
print(synthetic_promoter)

# Would predict activity with:
# NOTE(review): the region below spans ~200 bp while the sequence printed above
# is longer — confirm whether predict_region_replacement needs equal lengths.
# # Get reference genome automatically
# reference_fasta = get_genome('hg38')  # Downloads if needed
#
# results = enformer.predict_region_replacement(
#     genomic_region="chr1:1000000-1000200",
#     seq=synthetic_promoter,
#     assay_ids=["CAGE:K562", "RNA-seq:K562", "ChIP-seq_H3K4me3:K562"],
#     genome=reference_fasta
# )

## Summary

This notebook demonstrated:
1. Loading and using the Enformer oracle
2. Predicting regulatory activity for genomic regions
3. Analyzing enhancer insertions
4. Computing contribution scores
5. Batch predictions from BED files
6. Visualizing and comparing tracks
7. Designing synthetic regulatory sequences

For variant analysis, see the `variant_analysis.ipynb` notebook.