# Helix Triage Dashboard

Quickly inspect cumulative GC skew, open reading frames (ORFs), and approximate k-mer hotspots in a DNA sequence during exploratory analysis.

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

import bioinformatics
from codon import find_orfs

# --- Analysis configuration ---
# Input file; the relative path is resolved against the current working directory.
SEQUENCE_PATH = Path("../input/dna/human.txt")
# k-mer length and allowed difference count handed to
# bioinformatics.find_kmers_with_differences.
K = 5
MAX_DIFF = 1
# Passed as `min_length` to find_orfs.
MIN_ORF_LENGTH = 90

# --- Load the sequence ---
# Read the whole file as UTF-8 text, then strip every whitespace character
# (spaces, tabs, newlines) and upper-case what remains so the sequence is a
# single contiguous string.
# NOTE(review): any non-sequence text in the file (e.g. header lines) is kept
# as part of `sequence` -- confirm the input holds only sequence characters.
with SEQUENCE_PATH.open(encoding="utf-8") as handle:
    raw = handle.read()
sequence = "".join(raw.split()).upper()
print(f"Sequence length: {len(sequence):,}")

In [2]:
# GC skew for the loaded sequence, as returned by the project helper.
# NOTE(review): presumably one cumulative value per position -- confirm
# against bioinformatics.skew.
skew = bioinformatics.skew(sequence)

# k-mer clusters keyed by a canonical k-mer. Each value is indexed below with
# the keys "count", "patterns" (an iterable of strings) and "positions".
clusters = bioinformatics.find_kmers_with_differences(sequence, K, MAX_DIFF)

# One row per cluster, most frequent first.
# The column list is given explicitly so that an empty `clusters` result
# still yields a frame that has a "count" column; without it,
# pd.DataFrame([]) has no columns and sort_values("count") raises KeyError.
cluster_df = pd.DataFrame(
    [
        {
            "canonical": canonical,
            "count": data["count"],
            # Flatten the member patterns into one comma-separated string.
            "patterns": ",".join(data["patterns"]),
            "positions": data["positions"],
        }
        for canonical, data in clusters.items()
    ],
    columns=["canonical", "count", "patterns", "positions"],
).sort_values("count", ascending=False)

# ORF search with the configured minimum length; only the number of hits is
# reported here.
orfs = find_orfs(sequence, min_length=MIN_ORF_LENGTH)
print(f"ORFs found: {len(orfs)}")

In [3]:
# Three stacked panels with independent x-axes:
# GC skew, ORF locations, and the most frequent k-mer clusters.
fig, axes = plt.subplots(3, 1, figsize=(10, 12), sharex=False)
skew_ax, orf_ax, kmer_ax = axes

# Panel 1: GC skew drawn as a single line series.
skew_ax.plot(skew)
skew_ax.set(title="GC Skew", xlabel="Position", ylabel="Cumulative skew")

# Panel 2: each ORF is a thick horizontal segment from its start to its end.
# "+" strand ORFs sit on the upper lane (y = 0.1); every other strand value
# goes to the lower lane (y = -0.1).
orf_ax.set(title="ORFs", xlabel="Position", ylabel="Strand")
for orf in orfs:
    lane_y = -0.1
    if orf.strand == "+":
        lane_y = 0.1
    orf_ax.plot([orf.start, orf.end], [lane_y, lane_y], linewidth=4)
# Label the two lanes with strand symbols instead of numeric ticks.
orf_ax.set_yticks([-0.1, 0.1])
orf_ax.set_yticklabels(["-", "+"])

# Panel 3: bar chart of the first ten rows of cluster_df (fewer if the table
# is shorter); the title reports the number actually shown.
top_clusters = cluster_df.head(10)
kmer_ax.bar(top_clusters["canonical"], top_clusters["count"])
kmer_ax.set_title(f"Top {len(top_clusters)} k-mer clusters (k={K}, diff<={MAX_DIFF})")
kmer_ax.set_ylabel("Count")
# Rotate the k-mer labels on the x-axis by 45 degrees.
kmer_ax.tick_params(axis="x", rotation=45)

fig.tight_layout()
plt.show()