# Chorus Basic Usage Tutorial

This notebook demonstrates the basic functionality of the Chorus library for working with genomic sequence oracles.

## Installation

First, make sure Chorus is installed:

```bash
pip install -e /path/to/chorus
```

## 1. Import Chorus and Create an Oracle

In [None]:
import chorus
import numpy as np
import pandas as pd

# Show the oracle names exposed through chorus.ORACLES
oracle_names = list(chorus.ORACLES.keys())
print("Available oracles:", oracle_names)

In [None]:
# Build the Enformer oracle by name and report the class that was instantiated
oracle = chorus.create_oracle('enformer')
oracle_class_name = type(oracle).__name__
print(f"Created {oracle_class_name}")

## 2. Load Pre-trained Model

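**Note:** Loading downloads the Enformer model from TensorFlow Hub (several GB). The call in the cell below is commented out by default; uncomment it when you actually want to load the model.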

In [None]:
# Load the pre-trained Enformer model.
# The call is left commented out because it triggers a multi-GB download;
# the prediction example in section 6 requires it to have been run.
# oracle.load_pretrained_model()  # Uncomment to actually load

## 3. Explore Available Assays and Cell Types

In [None]:
# Query the oracle for its assay types; report the count and preview the first five
assay_types = oracle.list_assay_types()
n_assay_types = len(assay_types)
assay_preview = assay_types[:5]
print(f"Available assay types ({n_assay_types}):")
print(assay_preview, "...")

In [None]:
# Query the oracle for its cell types; report the count, then the full list
cell_types = oracle.list_cell_types()
n_cell_types = len(cell_types)
print(f"Available cell types ({n_cell_types}):", cell_types, sep="\n")

## 4. Working with Tracks

In [None]:
# Create a sample track: ten consecutive 100 bp bins on chr1, each with a
# random value in [0, 10).
# A seeded generator is used so the values -- and therefore the BedGraph file
# and the normalization output produced by the following cells -- are the same
# on every Restart & Run All.
sample_rng = np.random.default_rng(42)

track_data = pd.DataFrame({
    'chrom': ['chr1'] * 10,
    'start': range(0, 1000, 100),
    'end': range(100, 1100, 100),
    'value': sample_rng.random(10) * 10
})

# Wrap the table in a Track, labelling it with its assay and cell type
track = chorus.Track(
    name="sample_track",
    assay_type="DNase",
    cell_type="K562",
    data=track_data
)

print(f"Created track: {track.name}")
print(f"Data shape: {track.data.shape}")

In [None]:
# Save track to BedGraph file.
# The path is bound once so the file that gets written and the file named in
# the confirmation message cannot drift apart.
bedgraph_path = "sample_track.bedgraph"
track.to_bedgraph(bedgraph_path)
print(f"Track saved to {bedgraph_path}")

In [None]:
# Normalize the track with method='zscore', then show the first five values
# of the source track next to the first five of the normalized result
normalized_track = track.normalize(method='zscore')
original_head = track.data['value'].values[:5]
normalized_head = normalized_track.data['value'].values[:5]
print(f"Original values: {original_head}")
print(f"Normalized values: {normalized_head}")

## 5. Sequence Utilities

In [None]:
# DNA sequence used by every utility call in this cell
seq = "ATCGATCGATCGATCGATCGATCGATCGATCG"

# Run Chorus's sequence validation and report the result
is_valid = chorus.validate_sequence(seq)
print(f"Sequence valid: {is_valid}")

# GC content, formatted as a percentage with two decimals
gc_content = chorus.get_gc_content(seq)
print(f"GC content: {gc_content:.2%}")

# Reverse complement, printed directly under the original for comparison
rev_comp = chorus.reverse_complement(seq)
print(f"Original:  {seq}", f"Rev comp:  {rev_comp}", sep="\n")

In [None]:
# Apply a single-allele substitution to a reference sequence
ref_seq = "ATCGATCGATCGATCGATCGATCGATCGATCG"
position, ref_allele, alt_allele = 10, "C", "T"
# NOTE(review): ref_seq[10] is "C", so `position` looks 0-based here -- confirm
# against the coordinate convention documented for chorus.apply_variant.

variant_seq = chorus.apply_variant(ref_seq, position, ref_allele, alt_allele)
print(f"Reference: {ref_seq}", f"Variant:   {variant_seq}", sep="\n")
print(f"Changed position {position}: {ref_allele} -> {alt_allele}")

## 6. Making Predictions (Requires Loaded Model)

In [None]:
# Example of how to make predictions (requires model to be loaded)
# This is commented out since loading the model takes time and resources.
# To run it: uncomment oracle.load_pretrained_model() in section 2, execute
# that cell, then uncomment the lines below.
# NOTE(review): genome="hg38.fa" presumably refers to a local reference FASTA
# -- confirm the expected path before running.

# # Define a genomic region
# genomic_region = "chr1:1000000-1200000"
# 
# # Define assays to predict
# assay_ids = ["DNase:K562", "ATAC-seq:K562", "RNA-seq:K562"]
# 
# # Make predictions
# results = oracle.predict_region_replacement(
#     genomic_region=genomic_region,
#     seq="",  # Empty string will extract from reference
#     assay_ids=assay_ids,
#     create_tracks=True,
#     genome="hg38.fa"
# )
# 
# print("Predictions shape:", results['raw_predictions']['DNase:K562'].shape)
# print("Created tracks:", len(results['track_objects']))

## 7. Visualization

In [None]:
# Create sample tracks for visualization
import matplotlib.pyplot as plt

# Seeded generator so the synthetic noise -- and therefore the BedGraph files
# and the saved figure -- are reproducible across runs.
viz_rng = np.random.default_rng(42)

# Generate sample data
n_positions = 100
bin_size = 128  # 128bp bins
positions = np.arange(n_positions) * bin_size

# One entry per synthetic track: (name, plot colour, signal generator).
# Names and colours live next to the generator so the lists handed to
# visualize_tracks below are derived from this table and cannot get out of
# step with the files written in the loop.
track_specs = [
    ("DNase", "blue", lambda x: np.sin(x/10) + viz_rng.standard_normal(len(x))*0.1),
    ("H3K27ac", "green", lambda x: np.cos(x/15) + viz_rng.standard_normal(len(x))*0.1),
    ("RNA-seq", "red", lambda x: np.exp(-x/50) + viz_rng.standard_normal(len(x))*0.05),
]

# Build each track, write it to a BedGraph file, and collect the file names.
# Loop variables use a viz_ prefix so they do not overwrite `track` and
# `track_data` from section 4; re-running those earlier cells afterwards
# would otherwise silently operate on the last visualization track.
tracks_to_viz = []
for name, _color, pattern in track_specs:
    values = pattern(positions/1000)
    values = (values - values.min()) / (values.max() - values.min())  # Normalize to [0,1]

    viz_track_data = pd.DataFrame({
        'chrom': ['chr1'] * n_positions,
        'start': positions,
        'end': positions + bin_size,
        'value': values
    })

    # Save to file
    filename = f"example_{name}.bedgraph"
    viz_track = chorus.Track(
        name=name,
        assay_type=name,
        cell_type="K562",
        data=viz_track_data
    )
    viz_track.to_bedgraph(filename)
    tracks_to_viz.append(filename)

# Visualize tracks
chorus.visualize_tracks(
    tracks_filenames=tracks_to_viz,
    track_names=[spec_name for spec_name, _, _ in track_specs],
    colors=[spec_color for _, spec_color, _ in track_specs],
    figure_size=(10, 6),
    output_file="example_tracks.png"
)

print("Visualization saved to example_tracks.png")

## 8. Next Steps

For more advanced usage:
- See `enformer_example.ipynb` for detailed Enformer predictions
- See `variant_analysis.ipynb` for variant effect prediction
- Check the documentation for fine-tuning models on custom data