# Reproduce the results presented in "CRISPR-HAWK: Haplotype- and Variant-Aware Guide Design Toolkit for CRISPR-Cas"

In [None]:
from tqdm import tqdm

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import pandas as pd
import numpy as np

import subprocess
import random
import pysam
import os

# from tqdm import tqdm
from time import time
from matplotlib.lines import Line2D
from matplotlib.font_manager import FontProperties
from matplotlib.colors import LinearSegmentedColormap

## Introduction

CRISPR-HAWK is a comprehensive and scalable framework for designing guide RNAs 
(gRNAs) and evaluating the impact of genetic variation on CRISPR-Cas on-target 
activity. Developed as an offline, user-friendly command-line tool, CRISPR-HAWK 
integrates large-scale human variation datasets, including the 1000 Genomes Project, 
the Human Genome Diversity Project (HGDP), and gnomAD, with orthogonal 
genomic annotations to systematically prioritize gRNAs targeting regions of interest.

The framework is Cas-agnostic and supports a broad range of nucleases, such as 
Cas9, SaCas9, and Cpf1 (Cas12a), while also allowing full customization of PAM 
sequences and guide lengths. This flexibility ensures compatibility with emerging 
CRISPR technologies and enables users to tailor gRNA design to specific experimental 
needs.

CRISPR-HAWK incorporates both single-nucleotide variants (SNVs) and small 
insertions and deletions (indels), and it natively handles individual- and 
population-specific haplotypes. This makes it particularly suitable for personalized 
genome editing as well as population-scale analyses. The workflow, from variant-aware 
preprocessing to gRNA discovery, is fully automated, generating ranked candidate 
gRNAs, annotated target sequences, and publication-ready visualizations.

Thanks to its modular architecture, CRISPR-HAWK can be seamlessly integrated with 
downstream tools such as CRISPRme or CRISPRitz for comprehensive off-target prediction 
and follow-up analysis of prioritized guides.

This notebook reproduces the results presented in |||**add paper-link**|||.

## Download data

### Downloading Genetic Variation Datasets (1000 Genomes, HGDP, gnomAD)

CRISPR-HAWK supports large-scale genetic diversity analyses by integrating variation 
from several major population genomics resources. This section provides instructions 
for downloading the VCF files required to reproduce the results presented in the paper.

**Overview of Supported Datasets**

- 1000 Genomes Project (Phase 3)
  <br>2,504 individuals from 26 global populations; whole-genome sequencing at ~30×.

- Human Genome Diversity Project (HGDP)
  <br>929 individuals from globally diverse populations; high-coverage WGS.

- gnomAD (v4.1)
  <br>Population-scale aggregated variation from ~76,000 genomes.

In [None]:
# define datasets urls
url_hgdp = (
    "https://ngs.sanger.ac.uk/production/hgdp/hgdp_wgs.20190516/" 
    "hgdp_wgs.20190516.full.chr{}.vcf.gz"
)
url_1kgp = (
    "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/"
    "1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/" 
    "ALL.chr{}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz"
)
url_gnomad = (
    "https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/" 
    "genomes/gnomad.genomes.v4.1.sites.chr{}.vcf.bgz"
)
variants = {"HGDP": url_hgdp, "1000G": url_1kgp, "GNOMAD": url_gnomad}

# define chromosomes
chroms = [2, 3, 7, 11]

# download files
for dataset, url in variants.items():
    vcfdir = os.path.join("vcf", dataset)
    if dataset == "GNOMAD":
        vcfdir = os.path.join(vcfdir, "raw")
    os.makedirs(vcfdir, exist_ok=True)  # create dataset folder 
    for c in chroms:        
        # download VCF and index
        ! wget -P {vcfdir} {url.format(c)}
        ! wget -P {vcfdir} {url.format(c)}.tbi

Unlike the 1000 Genomes and HGDP callsets, the gnomAD VCFs contain allele frequency 
information only and do not provide individual-level genotype data. Since 
CRISPR-HAWK requires genotypes to reconstruct haplotypes and perform variant-aware 
gRNA discovery, gnomAD VCFs must first be converted into a compatible, pseudo-genotyped 
format. To enable this, we use the CRISPR-HAWK VCF converter, which generates a 
CRISPR-HAWK–ready VCF while preserving population-level allele distributions.

In [None]:
# check gnomad folder exists
gnomad_dir = "vcf/GNOMAD"
gnomad_raw_dir = os.path.join(gnomad_dir, "raw")
assert os.path.isdir(gnomad_raw_dir)

# create converted VCFs folder
gnomad_gt_dir = os.path.join(gnomad_dir, "genotype")
os.makedirs(gnomad_gt_dir, exist_ok=True)

# convert gnoamd VCFs using crisprhawk (it may take some time)
! crisprhawk convert-gnomad-vcf -d {gnomad_raw_dir} -o {gnomad_gt_dir}


### Downloading the hg38 Reference Genome

CRISPR-HAWK requires a reference genome to extract genomic contexts, evaluate 
variant-aware target sequences, and correctly map gRNA target sites. In this 
section, we download the primary assembly FASTA files from the Genome Reference 
Consortium–maintained repositories.

In [None]:
# define chromosomes
chroms = [2, 3, 7, 11]

# create genome folder
genome_dir = "genome"
os.makedirs(genome_dir, exist_ok=True)

# define genome url
url_ucsc = (
    "https://hgdownload.soe.ucsc.edu/goldenpath/hg38/chromosomes/chr{}.fa.gz"
)

# Download and unzip FASTA file for each chromosome
for c in chroms:
    print(f"Downloading FASTA for chromosome {c}")
    ! wget -nc -P {genome_dir} {url_ucsc.format(c)}
    ! gunzip -f {genome_dir}/chr{c}.fa.gz

### Creating BED Files for the Analyzed Regions

CRISPR-HAWK requires BED files to define the genomic intervals where gRNA discovery 
and variant-aware analysis will be performed. Each BED file specifies one or more 
regions of interest in the standard 3-column BED format:
```
chrom   start   end
```

Coordinates must reference the hg38 genome assembly (or whichever reference you 
downloaded in the previous step).

In [None]:
# folder collecting one BED file per target region
regions_dir = "regions"
os.makedirs(regions_dir, exist_ok=True)

# target name -> [chromosome, start, end]
regions = {
    "BCL11A": ["chr2", 60495215, 60496479],
    "EMX1": ["chr2", 72932853, 72934853],
    "CCR5_1": ["chr3", 46372138, 46374138],
    "CCR5_2": ["chr3", 46372162, 46374162],
    "TRBC1": ["chr7", 142791004, 142793004],
    "TRBC2": ["chr7", 142800351, 142802350],
    "HBB_1": ["chr11", 5225803, 5227803],
    "HBB_2": ["chr11", 5225967, 5227967],
    "HBG2_CAS9": ["chr11", 5252879, 5256879],
    "HBG1_CAS9": ["chr11", 5248955, 5250955],
    "HBG2_CPF1": ["chr11", 5253874, 5255874],
    "HBG1_CPF1": ["chr11", 5248950, 5250950],
    "FANCF": ["chr11", 22624785, 22626785],
}

# write every interval as a single tab-separated line to <regions_dir>/<target>.bed
for target_name, interval in regions.items():
    contig, first, last = interval
    bed_path = os.path.join(regions_dir, target_name + ".bed")
    with open(bed_path, mode="w") as bed:
        print(contig, first, last, sep="\t", file=bed)


## Variant-Aware gRNA Retrieval on Defined Regions

With the reference genome, variant datasets, and BED files prepared, CRISPR-HAWK
can now retrieve all candidate gRNAs within the specified regions while accounting
for genetic variation across populations and individuals. This step is central to 
CRISPR-HAWK’s design philosophy: guide discovery must be variant-aware, ensuring 
that both reference and haplotype-specific target sequences are evaluated.

**What This Step Does**

For each genomic interval listed in the BED file(s), CRISPR-HAWK:

- Extracts the reference sequence from hg38.

- Applies all relevant variants (SNVs and indels) from the loaded dataset,
including 1000G, HGDP, or converted gnomAD, to reconstruct individual- and population-specific haplotypes.

- Scans both the reference and haplotype sequences to identify all gRNAs that match the user-defined:

    - PAM sequence (e.g., NGG, NAA, TTTV, …)

    - guide length

    - nuclease type (Cas9, SaCas9, Cpf1/Cas12a, etc.)

- Reports each gRNA along with:

    - its exact reference and alternative alleles

    - per-haplotype presence/absence

    - the samples in which the target site is altered

    - summary metrics for prioritization


In [None]:
# define target regions (each one has a matching <target>.bed file in the regions folder)
targets = [
    "BCL11A",
    "EMX1",
    "CCR5_1",
    "CCR5_2",
    "TRBC1",
    "TRBC2",
    "FANCF",
    "HBB_1",
    "HBB_2",
    "HBG1_CAS9",
    "HBG2_CAS9",
    "HBG1_CPF1",
    # NOTE(review): the list used to contain "HBG1_CPF1" twice; a BED file is
    # defined for HBG2_CPF1, so it is assumed to be the intended entry - confirm
    "HBG2_CPF1",
]

# define genetic variants datasets
datasets = ["1000G", "HGDP", "gnomAD"]

# define pams
pams = ["NGG", "TTTV"] # Cas9, Cas12
guide_lens = [20, 23]
thread = 16

# guide length used with each PAM (NGG -> 20, TTTV -> 23)
guide_len_by_pam = dict(zip(pams, guide_lens))

# define folders 
genome_dir = "genome"
variants_dir = "vcf"
regions_dir = "regions"
results_dir = "results"

# dataset label -> folder (relative to variants_dir) storing the VCFs to search.
# The gnomAD VCFs are downloaded under "GNOMAD" and the converted, genotyped
# files live in its "genotype" subfolder; the folder name differs in case from
# the dataset label, which matters on case-sensitive filesystems.
vcf_subdirs = {
    "1000G": "1000G",
    "HGDP": "HGDP",
    "gnomAD": os.path.join("GNOMAD", "genotype"),
}

# create results folder
os.makedirs(results_dir, exist_ok=True)

# run guide design with crisprhawk
for dataset in datasets:
    vcfdir = os.path.join(variants_dir, vcf_subdirs[dataset])
    results_dataset = os.path.join(results_dir, dataset)
    os.makedirs(results_dataset, exist_ok=True)  # e.g. results/1000G 
    for target in targets:
        target_region = os.path.join(regions_dir, f"{target}.bed")
        # NOTE(review): *_CPF1 targets are searched with both PAMs (NGG and TTTV),
        # all the other targets with NGG only - confirm this is intended
        pams_ = pams if target.endswith("_CPF1") else pams[:1]
        results_target = os.path.join(results_dataset, target)  # e.g. results/1000G/BCL11A
        for pam in pams_:
            # argument list (no shell): every option is passed as its own token,
            # so options can no longer be glued together by a missing space
            crisprhawk_cmd = [
                "crisprhawk",
                "search",
                "-f", genome_dir,
                "-r", target_region,
                "-v", vcfdir,
                "-p", pam,
                "-g", str(guide_len_by_pam[pam]),
                "--haplotype-table",
                "--threads", str(thread),
                "-o", results_target,
                "--verbosity", "0",
            ]
            print(f"Running search on {dataset} for target {target}")
            try:
                retcode = subprocess.call(crisprhawk_cmd)
            except OSError as e:
                # the executable could not be started (e.g. crisprhawk not installed)
                print(f"WARNING: unable to run crisprhawk: {e}")
                continue
            if retcode != 0:
                # report failures instead of silently ignoring them
                print(
                    f"WARNING: search on {dataset} for target {target} "
                    f"(PAM {pam}) exited with code {retcode}"
                )

## Results Analysis and Visualization

In this section, we analyze and visualize the impact of genetic variation on gRNA 
design and activity across a curated set of clinically and experimentally relevant 
CRISPR targets. The analysis focuses on how population-level and individual-specific 
variants affect both gRNA sequence composition and expected on-target efficiency, 
highlighting differences between reference-designed guides and their alternative, 
haplotype-derived counterparts.

For each target region, we first quantify how genetic variation alters the landscape 
of candidate gRNAs. Specifically, we classify retrieved guides into four categories 
and summarize them using pie charts:
- gRNAs matching the reference sequence
- gRNAs with variants in the spacer region
- gRNAs with variants affecting only the PAM
- gRNAs with variants in both the spacer and the PAM

This provides an immediate overview of how frequently genetic variants modify 
targetable sequences across different loci. 

Next, we assess the functional consequences of these sequence differences. Using 
dot plots, we compare:
- The predicted on-target efficiency of reference gRNAs versus their alternative 
  versions found on variant-defined haplotypes
- The residual on-target activity of reference gRNAs when applied to alternative 
  haplotypes carrying sequence mismatches

These analyses capture both gain and loss of activity induced by genetic variation 
and enable a fine-grained comparison between reference and variant-aware gRNA designs.

All analyses are performed independently for the following target regions:
- BCL11A +58 Erythroid enhancer
- EMX1
- CCR5 (two independent target sites)
- TRBC1
- TRBC2
- FANCF
- HBB (two independent target sites)
- HBG1 and HBG2 (Cas9)
- HBG1 (Cpf1/Cas12a)

For Cpf1-based targets, residual on-target activity is not evaluated, as the 
analysis is specific to Cas9-mediated spacer–PAM interactions.

The analyses integrate variation from 1000 Genomes, HGDP, and gnomAD datasets. 
In particular, for the sg1617 guide targeting the BCL11A erythroid enhancer, we 
perform an in-depth follow-up analysis: for each alternative gRNA sequence generated 
by gnomAD variants, we run CRISPRme (using 1000G + HGDP genetic variants) to 
evaluate guide specificity genome-wide. This allows us to compare how genetic 
variation simultaneously affects on-target activity and off-target risk, 
providing a comprehensive assessment of guide performance in a population-aware 
context.

### Assigning Sample Support to Candidate gRNAs

As a first step in the results analysis, we quantify how widely each candidate 
gRNA is supported across individuals in each variant dataset. For every gRNA 
retrieved in the previous step, we compute the number of samples carrying the 
exact gRNA sequence in their reconstructed haplotypes.

These sample support counts form the basis for all downstream analyses in this 
section, including the evaluation of efficiency differences between reference 
and alternative guides, and reference on-target residual activity on alternative
haplotypes. 

For datasets providing individual-level genotypes (e.g., 1000 Genomes and HGDP), 
sample support is directly derived from haplotype reconstruction.

In [None]:
# total number of samples available in each variant dataset
SAMPLESNUM = {
    "1000G": 2504,
    "HGDP": 929,
    "gnomAD": 76215,
}

def _compute_id(chrom, start, stop, strand):
    return f"{chrom}_{start}_{stop}_{strand}"

def compute_samples_num(df, samplesnum):
    assert "samples" in df.columns.tolist()
    # count samples carrying alternative grnas
    df["n_samples"] = df["samples"].apply(
        lambda x: len(str(x).split(",")) if pd.notna(x) and x!= "REF" else None
    )  
    # compute guide id for each grna
    df["guide_id"] = df.apply(
        lambda x: _compute_id(x["chr"], x["start"], x["stop"], x["strand"]), axis=1
    )
    # sum samples carrying alternative guides
    sum_alternative = df[df["samples"] != "REF"].groupby("guide_id")["n_samples"].sum()
    # retrieve number of samples for reference grnas
    refmask = df["samples"] == "REF"
    df.loc[refmask, "n_samples"] = samplesnum - df.loc[refmask, "guide_id"].map(sum_alternative).fillna(0)
    return df

For gnomAD-based analyses, individual genotypes are not directly available. In 
this case, CRISPR-HAWK reports the number of populations carrying each gRNA rather
than explicit sample counts. To enable downstream analyses requiring sample-level 
support, we post-process CRISPR-HAWK reports by annotating them with gnomAD 
carrier counts using pysam. This procedure estimates the number of carriers for
each variant allele underlying a gRNA and propagates this information to the 
guide level.

These sample support estimates are then used consistently with the other datasets 
to classify gRNAs and to perform comparative analyses across reference and 
variant-derived guides.

In [None]:
# gnomad-only post-processing (NB: may require hours to run)
def _retrieve_position(variant_id):
    return int(variant_id.split("-")[1])

def extract_variant_positions(df):
    # retrieve position of each variant
    variant_ids = df["variant_id"].dropna().tolist()
    return {_retrieve_position(v) for vs in variant_ids for v in vs.split(",")}

def get_relevant_variants(vcf_fname, chrom, positions):
    print(f"Scanning VCF for {len(positions)} positions on {chrom}")
    vcf = pysam.VariantFile(vcf_fname)  # load vcf
    variants, matches = [], set()

    # requires bgzipped vcf with tbi index
    for r in tqdm(vcf.fetch(chrom), desc="Scanning VCF", unit="variants"):
        if r.pos not in positions:
            continue
    
        ac = r.info.get("AC")  # read allele count
        if ac is None:
            continue

        nhomalt = r.info.get("nhomalt", 0)  # count number of homozygous for alternative

        # iterate on each alternative allele
        for i, alt in enumerate(r.alts):
            # scalar vs per-allele fields for AC and nhomalt
            if isinstance(ac, (list, tuple)):
                ac_ = ac[i] if i < len(ac) else None
            else:
                ac_ = ac
            if ac_ is None:
                continue

            if isinstance(nhomalt, (list, tuple)):
                nhomalt_ = nhomalt[i] if i < len(nhomalt) else 0
            else:
                nhomalt_ = nhomalt

            # compute number of heterozygous samples
            n_het = ac_ - 2 * nhomalt_
            n_total = nhomalt_ + n_het

            # add variants and matched position
            variant_id = f"{r.chrom}-{r.pos}-{r.ref}/{alt}"
            variants.append(
                {
                    "variant_id": variant_id,
                    "chrom": r.chrom,
                    "pos": r.pos,
                    "ref": r.ref,
                    "alt": alt,
                    "AC": ac_,
                    "n_hom": nhomalt_,
                    "n_het": n_het,
                    "n_samples": n_total
                }
            )
            matches.add(r.pos)
    print(f"Matched {len(matches)} of {len(positions)} positions")
    return pd.DataFrame(variants)


    