In [1]:
from __future__ import annotations

import os
import pandas as pd
import subprocess
import numpy as np
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt

## Prepare Data Input

You will need the ABE8e and evoCDA endogenous sequencing data from [Zenodo](https://doi.org/10.5281/zenodo.13737880):
- CRISPR-CLEAR-data/data/raw_FASTQs/endogenous_sequencing/ABE8e_pooled.zip
- CRISPR-CLEAR-data/data/raw_FASTQs/endogenous_sequencing/EvoCDA_pooled.zip

You will also need the guide information and the mismatch-corrected count table from [Zenodo](https://doi.org/10.5281/zenodo.13737880):
- CRISPR-CLEAR-data/data/surf_inputs/CD19_guide_info.tsv
- CRISPR-CLEAR-data/data/surf_inputs/20230908_CD19_count_matrix.tsv

# Filepath Definitions

In [None]:
# FASTQ inputs for each screen and sort gate. Fill these in before running;
# each list is joined with spaces into the SURF_count command line.
ABE8E_PRESORT_FASTQ_PATHS = []
ABE8E_LOW_FASTQ_PATHS = []
ABE8E_HIGH_FASTQ_PATHS = []

EVOCDA_PRESORT_FASTQ_PATHS = []
EVOCDA_LOW_FASTQ_PATHS = []
EVOCDA_HIGH_FASTQ_PATHS = []

# WORKING_DIR is passed to `singularity run -B` in the SURF commands;
# RESULTS_DIR is the root under which every output directory is created.
WORKING_DIR = ""
RESULTS_DIR = ""

GUIDE_INFO_PATH = ""
CORRECTED_COUNT_MATRIX_PATH = ""

# With an empty RESULTS_DIR the f-strings below would resolve to
# "/Uncorrected_ABE8e" etc., i.e. directories at the filesystem root.
# Fail early with a clear message instead.
if not RESULTS_DIR:
    raise ValueError(
        "RESULTS_DIR is empty; set it to the output directory before running this notebook."
    )

UNCORRECTED_ABE8E_DIR = f"{RESULTS_DIR}/Uncorrected_ABE8e"
CORRECTED_ABE8E_DIR = f"{RESULTS_DIR}/Corrected_ABE8e"
UNCORRECTED_EVOCDA_DIR = f"{RESULTS_DIR}/Uncorrected_evoCDA"
CORRECTED_EVOCDA_DIR = f"{RESULTS_DIR}/Corrected_evoCDA"

for _screen_dir in (
    UNCORRECTED_ABE8E_DIR,
    CORRECTED_ABE8E_DIR,
    UNCORRECTED_EVOCDA_DIR,
    CORRECTED_EVOCDA_DIR,
):
    os.makedirs(_screen_dir, exist_ok=True)

In [6]:
def _load_significant_regions(bed_path: str, sign: int) -> pd.DataFrame:
    """Read one significant-region BED file and expand it to nine columns.

    Args:
        bed_path: Tab-separated, header-less file whose first four columns are
            used as chromosome, start, end and a value that is log10-transformed
            (presumably the region p-value -- confirm against the SURF output).
        sign: Multiplier applied to ``log10(column 3)``; ``-1`` yields
            ``-log10`` and ``+1`` yields ``log10``.

    Returns:
        A frame with integer column labels 0-8. Column 8 holds the signed
        log10 value. An empty file yields an empty frame with the same labels.
    """
    try:
        regions = pd.read_csv(bed_path, header=None, sep="\t")
    except pd.errors.EmptyDataError:
        return pd.DataFrame(columns=[0, 1, 2, 3, 4, 5, 6, 7, 8])

    regions[2] += 1
    # Keep one row per (chrom, start, end): the one with the smallest column-3 value.
    regions = regions.loc[regions.groupby([0, 1, 2])[3].idxmin()]
    regions[4] = 0
    regions[5] = "."
    regions[6] = regions[1]
    regions[7] = regions[2]
    regions.insert(8, 8, sign * np.log10(regions[3]))
    return regions


def _copy_track_with_shifted_end(in_path: str, out_path: str) -> None:
    """Copy a header-less tab-separated track, adding 1 to column 2 (the end)."""
    track = pd.read_csv(in_path, header=None, sep="\t")
    track[2] += 1
    track.to_csv(out_path, sep="\t", header=False, index=False)


def adjust_tracks_for_plotting(
    results_path: str,
    output_path: str = "",
) -> None:
    """Rewrite the track files found under ``results_path`` for plotting.

    Reads ``positive_significant_regions.bed``, ``negative_significant_regions.bed``,
    ``raw_scores.bedgraph``, ``deconvolved_scores.bedgraph`` and
    ``neglog10_pvals.bedgraph`` and writes adjusted copies to ``output_path``.
    Every track gets its end coordinate (column 2) incremented by one.

    Args:
        results_path: Directory prefix of the input files. It is concatenated
            directly with the file names, so it must end with a path separator.
        output_path: Directory prefix for the outputs (same trailing-separator
            rule). Defaults to ``<results_path>tracks_for_plotting/``.
    """
    if output_path == "":
        output_path = f"{results_path}tracks_for_plotting/"
    os.makedirs(output_path, exist_ok=True)

    # Positive regions carry -log10(value) and negative regions +log10(value),
    # so the two sets land on opposite ends of the diverging colormap.
    sig_regions = pd.concat(
        [
            _load_significant_regions(
                f"{results_path}positive_significant_regions.bed", sign=-1
            ),
            _load_significant_regions(
                f"{results_path}negative_significant_regions.bed", sign=1
            ),
        ],
        ignore_index=True,
    )

    # Map the signed log10 value to an "R,G,B" string; the colour scale is
    # symmetric and spans log10(0.0001) .. -log10(0.0001).
    cnorm = mpl.colors.Normalize(vmin=np.log10(0.0001), vmax=-np.log10(0.0001))
    scalar_map = cm.ScalarMappable(norm=cnorm, cmap=plt.get_cmap("coolwarm"))
    sig_regions[8] = sig_regions[8].apply(
        lambda x: ",".join([str(val) for val in scalar_map.to_rgba(x, bytes=True)[:3]])
    )
    sig_regions.to_csv(
        f"{output_path}significant_regions.bed", sep="\t", header=False, index=False
    )

    raw_scores = pd.read_csv(
        f"{results_path}raw_scores.bedgraph", header=None, sep="\t"
    )
    raw_scores[2] += 1
    # Clamp scores to [-3, 3].
    raw_scores[3] = raw_scores[3].clip(lower=-3, upper=3)
    # Rows sharing the same interval are numbered in file order and written to
    # one file per number (presumably one per replicate); only the first three
    # are kept.
    raw_scores["rep"] = raw_scores.groupby([0, 1, 2]).cumcount() + 1
    raw_scores = raw_scores[raw_scores["rep"] <= 3]

    for rep in raw_scores["rep"].unique():
        rep_scores = raw_scores[raw_scores["rep"] == rep].drop(columns="rep")
        rep_scores.to_csv(
            f"{output_path}rep{rep}_raw_scores.bedgraph",
            sep="\t",
            header=False,
            index=False,
        )

    for track_name in ("deconvolved_scores.bedgraph", "neglog10_pvals.bedgraph"):
        _copy_track_with_shifted_end(
            f"{results_path}{track_name}", f"{output_path}{track_name}"
        )

In [8]:
# Guide annotation table (tab-separated); the first column becomes the index.
guide_info = pd.read_csv(
    GUIDE_INFO_PATH,
    index_col=0,
    sep="\t",
)
display(guide_info.head(5))

Unnamed: 0_level_0,Chr,Start,Stop,sgRNA_Sequence.1,Strand,sgRNA_Type_ABE,sgRNA_Type_CBE,Cutsite,amplicon_position,pam,is_ngn,is_ngg,is_nnn,is_ABE,is_CBE,is_ABE_editable,is_CBE_editable,ABE_explainable,CBE_explainable
sgRNA_Sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
GGGGAATGACATGCTCTAGT,chr16,28930678.0,28930701.0,GGGGAATGACATGCTCTAGT,+,observation,observation,28930692.0,42.0,GAA,False,False,True,True,False,True,False,True,False
GAATGACATGCTCTAGTGAA,chr16,28930681.0,28930704.0,GAATGACATGCTCTAGTGAA,+,observation,observation,28930695.0,45.0,AGC,True,False,True,True,True,True,False,True,False
TGACATGCTCTAGTGAAAGC,chr16,28930684.0,28930707.0,TGACATGCTCTAGTGAAAGC,+,observation,observation,28930698.0,48.0,CAG,False,False,True,True,True,True,False,True,False
CATGCTCTAGTGAAAGCCAG,chr16,28930687.0,28930710.0,CATGCTCTAGTGAAAGCCAG,+,observation,observation,28930701.0,51.0,TCT,False,False,True,True,True,True,False,True,False
GCTCTAGTGAAAGCCAGTCT,chr16,28930690.0,28930713.0,GCTCTAGTGAAAGCCAGTCT,+,observation,observation,28930704.0,54.0,GGG,True,True,True,True,True,True,True,True,True


# Uncorrected ABE8e SURF Count

In [None]:
UNCORRECTED_ABE8E_SURF_COLUMNS = [
    "Chr",
    "Start",
    "Stop",
    "Strand",
    "sgRNA_Type_ABE",
    "sgRNA_Sequence.1",
]
# Prepare FastQs (old) -- kept for reference only; the FASTQ lists now come
# from the ABE8E_*_FASTQ_PATHS constants.
# presort_fastqs = [f"{ABE8E_FASTQ_DIR}/AAGTAGAG_AAGTAGAG_R1-CD19_presort_1_ABE8.fastq",
#                 f"{ABE8E_FASTQ_DIR}/AAGTAGAG_CATGATCG_R1-CD19_presort_2_ABE8.fastq",
#                 f"{ABE8E_FASTQ_DIR}/AAGTAGAG_AACGCATT_R1-CD19_presort_3_ABE8.fastq",]

# low_fastqs = [f"{ABE8E_FASTQ_DIR}/AAGTAGAG_ACACGATC_R1-CD19_low_1_ABE8.fastq",
#             f"{ABE8E_FASTQ_DIR}/AAGTAGAG_CGTTACCA_R1-CD19_low_2_ABE8.fastq",
#             f"{ABE8E_FASTQ_DIR}/AAGTAGAG_AGGTAAGG_R1-CD19_low_3_ABE8.fastq",]

# high_fastqs = [f"{ABE8E_FASTQ_DIR}/AAGTAGAG_CGCGCGGT_R1-CD19_high_1_ABE8.fastq",
#             f"{ABE8E_FASTQ_DIR}/AAGTAGAG_TCCTTGGT_R1-CD19_high_2_ABE8.fastq",
#             f"{ABE8E_FASTQ_DIR}/AAGTAGAG_AACAATGG_R1-CD19_high_3_ABE8.fastq",]

# Set non-explainable guides as negative controls: a guide with no "A" in
# sequence positions 2-8 (0-based slice [2:9]) is relabelled.
# .copy() makes the relabelling act on an independent frame rather than on a
# slice of guide_info (avoids SettingWithCopyWarning / ambiguous write-through).
library_file = guide_info[UNCORRECTED_ABE8E_SURF_COLUMNS].copy()
lacks_a_in_window = ~library_file["sgRNA_Sequence.1"].str[2:9].str.contains("A")
library_file.loc[lacks_a_in_window, "sgRNA_Type_ABE"] = "negative_control"

library_file.rename(
    columns={
        "sgRNA_Type_ABE": "sgRNA_Type",
        "sgRNA_Sequence.1": "sgRNA_Sequence",
    },
).to_csv(f"{UNCORRECTED_ABE8E_DIR}/library_file.csv", index=False)

# (comparison name, control FASTQs, sample FASTQs), processed in this order.
abe8e_comparisons = [
    ("presort_vs_low", ABE8E_PRESORT_FASTQ_PATHS, ABE8E_LOW_FASTQ_PATHS),
    ("low_vs_high", ABE8E_LOW_FASTQ_PATHS, ABE8E_HIGH_FASTQ_PATHS),
]

for comparison, control_fastqs, sample_fastqs in abe8e_comparisons:
    comparison_dir = f"{UNCORRECTED_ABE8E_DIR}/{comparison}"
    os.makedirs(comparison_dir, exist_ok=True)

    # Generate counts for this comparison inside the SURF container.
    subprocess.run(
        "module load singularity\n"
        f"singularity run -B {WORKING_DIR} docker://jdd056/crisprsurf:vectorization SURF_count"
        f" -f {UNCORRECTED_ABE8E_DIR}/library_file.csv"
        f" -control_fastqs {' '.join(control_fastqs)}"
        f" -sample_fastqs {' '.join(sample_fastqs)}"
        f" -pert be -out_dir {comparison_dir}",
        shell=True,
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    # Keep only guides with more than 500 reads in every "*_Count*" column and
    # overwrite the summary table in place.
    summary_path = f"{comparison_dir}/sgRNAs_summary_table.csv"
    summary_table = pd.read_csv(summary_path)
    enough_reads = (summary_table.filter(like="_Count") > 500).all(axis=1)
    summary_table.loc[enough_reads, :].to_csv(summary_path, index=False)

# Uncorrected ABE8e SURF Deconvolution

In [9]:
# Genomic window covered by the guides on the target chromosome.
chrom = "chr16"
guides_on_chrom = guide_info.loc[guide_info["Chr"] == chrom]
start = guides_on_chrom["Start"].min()
stop = guides_on_chrom["Stop"].max()
region = "{}:{}-{}".format(chrom, int(start), int(stop))

# SURF_deconvolution settings for this screen.
genome = "hg38"
null_dist = "negative_control"
pert_range = "6"
sim_n = 2000

for comparison in ("presort_vs_low", "low_vs_high"):
    comparison_dir = f"{UNCORRECTED_ABE8E_DIR}/{comparison}"
    deconvolution_cmd = (
        "module load singularity\n"
        f"singularity run -B {WORKING_DIR} docker://jdd056/crisprsurf:vectorization SURF_deconvolution"
        f" -f {comparison_dir}/sgRNAs_summary_table.csv"
        f" -pert be -range {pert_range} -null_dist {null_dist} -genome {genome}"
        f" -lambda_val 1 -sim_n {sim_n} -out_dir {comparison_dir}/"
    )
    subprocess.run(
        deconvolution_cmd,
        shell=True,
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Post-process the deconvolution outputs into plot-ready tracks.
    adjust_tracks_for_plotting(results_path=f"{comparison_dir}/")


# Uncorrected evoCDA SURF Count

In [None]:

UNCORRECTED_EVOCDA_SURF_COLUMNS = [
    "Chr",
    "Start",
    "Stop",
    "Strand",
    "is_ngg",
    "CBE_explainable",
    "sgRNA_Sequence.1",
    "sgRNA_Type_CBE",
]

# Prepare FastQs (old) -- kept for reference only; the FASTQ lists now come
# from the EVOCDA_*_FASTQ_PATHS constants.
# FASTQ_BASENAME = "BSF_1264_HKGCWDRX2_1"

# presort_indices = [20, 23, 26]
# low_indices = [22, 25, 28]
# high_indices = [21, 24, 27]

# presort_fastqs = [f"{EVOCDA_FASTQ_DIR}/{FASTQ_BASENAME}#p7{EVOCDA_FASTQ_BARCODE_SHEET.iloc[index]['barcode P7 rev']}_p5{EVOCDA_FASTQ_BARCODE_SHEET.iloc[index]['barcode P5 fwd']}.R1.fastq" for index in presort_indices]
# low_fastqs = [f"{EVOCDA_FASTQ_DIR}/{FASTQ_BASENAME}#p7{EVOCDA_FASTQ_BARCODE_SHEET.iloc[index]['barcode P7 rev']}_p5{EVOCDA_FASTQ_BARCODE_SHEET.iloc[index]['barcode P5 fwd']}.R1.fastq" for index in low_indices]
# high_fastqs = [f"{EVOCDA_FASTQ_DIR}/{FASTQ_BASENAME}#p7{EVOCDA_FASTQ_BARCODE_SHEET.iloc[index]['barcode P7 rev']}_p5{EVOCDA_FASTQ_BARCODE_SHEET.iloc[index]['barcode P5 fwd']}.R1.fastq" for index in high_indices]

# Set non-explainable guides as negative controls.
# Keep guides that are CBE-explainable or non-NGG. .copy() makes the
# relabelling below act on an independent frame, not a slice of guide_info.
library_file = guide_info.loc[
    guide_info["CBE_explainable"] | ~guide_info["is_ngg"],
    UNCORRECTED_EVOCDA_SURF_COLUMNS,
].copy()
# Non-NGG guides, and guides with no "C" in sequence positions 0-13
# (slice [0:14]), are relabelled as negative controls.
library_file.loc[~library_file["is_ngg"], "sgRNA_Type_CBE"] = "negative_control"
lacks_c_in_window = ~library_file["sgRNA_Sequence.1"].str[0:14].str.contains("C")
library_file.loc[lacks_c_in_window, "sgRNA_Type_CBE"] = "negative_control"

# The helper columns were only needed for the relabelling above.
library_file = library_file.drop(columns=["is_ngg", "CBE_explainable"]).rename(
    columns={
        "sgRNA_Type_CBE": "sgRNA_Type",
        "sgRNA_Sequence.1": "sgRNA_Sequence",
    }
)
library_file.to_csv(f"{UNCORRECTED_EVOCDA_DIR}/library_file.csv", index=False)

# (comparison name, control FASTQs, sample FASTQs), processed in this order.
evocda_comparisons = [
    ("presort_vs_low", EVOCDA_PRESORT_FASTQ_PATHS, EVOCDA_LOW_FASTQ_PATHS),
    ("low_vs_high", EVOCDA_LOW_FASTQ_PATHS, EVOCDA_HIGH_FASTQ_PATHS),
]

for comparison, control_fastqs, sample_fastqs in evocda_comparisons:
    comparison_dir = f"{UNCORRECTED_EVOCDA_DIR}/{comparison}"
    os.makedirs(comparison_dir, exist_ok=True)

    # Generate counts for this comparison inside the SURF container.
    subprocess.run(
        "module load singularity\n"
        f"singularity run -B {WORKING_DIR} docker://jdd056/crisprsurf:vectorization SURF_count"
        f" -f {UNCORRECTED_EVOCDA_DIR}/library_file.csv"
        f" -control_fastqs {' '.join(control_fastqs)}"
        f" -sample_fastqs {' '.join(sample_fastqs)}"
        f" -pert be -out_dir {comparison_dir}",
        shell=True,
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    # Keep only guides with more than 500 reads in every "*_Count*" column and
    # overwrite the summary table in place.
    summary_path = f"{comparison_dir}/sgRNAs_summary_table.csv"
    summary_table = pd.read_csv(summary_path)
    enough_reads = (summary_table.filter(like="_Count") > 500).all(axis=1)
    summary_table.loc[enough_reads, :].to_csv(summary_path, index=False)

# Uncorrected evoCDA SURF Deconvolution

In [10]:
# Genomic window covered by the guides on the target chromosome.
chrom = "chr16"
guides_on_chrom = guide_info.loc[guide_info["Chr"] == chrom]
start = guides_on_chrom["Start"].min()
stop = guides_on_chrom["Stop"].max()
region = "{}:{}-{}".format(chrom, int(start), int(stop))

# SURF_deconvolution settings for this screen. `lambdas` is not referenced in
# this cell; the command below passes a fixed -lambda_val of 1.
lambdas = [0.2, 0.5, 1, 10, 30, 50]
genome = "hg38"
null_dist = "negative_control"
pert_range = 13
sim_n = 2000

for comparison in ("presort_vs_low", "low_vs_high"):
    comparison_dir = f"{UNCORRECTED_EVOCDA_DIR}/{comparison}"
    deconvolution_cmd = (
        "module load singularity\n"
        f"singularity run -B {WORKING_DIR} docker://jdd056/crisprsurf:vectorization SURF_deconvolution"
        f" -f {comparison_dir}/sgRNAs_summary_table.csv"
        f" -pert be -range {pert_range} -null_dist {null_dist} -genome {genome}"
        f" -lambda_val 1 -sim_n {sim_n} -out_dir {comparison_dir}/"
    )
    subprocess.run(
        deconvolution_cmd,
        shell=True,
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Post-process the deconvolution outputs into plot-ready tracks.
    adjust_tracks_for_plotting(results_path=f"{comparison_dir}/")


# Mismatch Corrected Data Cleaning

In [8]:
# Load the mismatch-corrected count table and transpose it; in the preview
# below, rows are guide sequences and columns are samples.
corrected_count_matrix = pd.read_csv(
    CORRECTED_COUNT_MATRIX_PATH,
    index_col=0,
    sep="\t",
).transpose()

display(corrected_count_matrix.head(5))


Unnamed: 0,ABE8e_presort_1,ABE8e_presort_2,ABE8e_presort_3,ABE8e_low_1,ABE8e_low_2,ABE8e_low_3,ABE8e_high_1,ABE8e_high_2,ABE8e_high_3,evoCDA_presort_1,evoCDA_presort_2,evoCDA_presort_3,evoCDA_low_1,evoCDA_low_2,evoCDA_low_3,evoCDA_high_1,evoCDA_high_2,evoCDA_high_3,None_plasmid_None
GGGGAATGACATGCTCTAGT,24716.0,26162.0,42719.0,44645.0,27974.0,29535.0,19093.0,26987.0,29450.0,21567.0,21371.0,19518.0,21613.0,18859.0,10665.0,15123.0,21071.0,9354.0,31089.0
GAATGACATGCTCTAGTGAA,11750.0,28955.0,15677.0,30217.0,23626.0,12471.0,34673.0,21088.0,29720.0,9854.0,12133.0,12043.0,20517.0,24400.0,13395.0,13876.0,8678.0,10417.0,20674.0
TGACATGCTCTAGTGAAAGC,7392.0,19317.0,20414.0,62691.0,31555.0,27629.0,9114.0,28375.0,42553.0,21156.0,18809.0,17363.0,14987.0,20133.0,4850.0,11327.0,23534.0,14495.0,30476.0
CATGCTCTAGTGAAAGCCAG,44819.0,71795.0,16880.0,50966.0,38807.0,40022.0,43136.0,57876.0,49008.0,35697.0,33642.0,27713.0,32942.0,20695.0,18093.0,20641.0,28744.0,13410.0,39160.0
GCTCTAGTGAAAGCCAGTCT,100774.0,49923.0,46733.0,98625.0,48256.0,81950.0,73324.0,56437.0,71415.0,30653.0,27542.0,28107.0,32660.0,30404.0,33348.0,17810.0,23773.0,15340.0,38905.0


## Mismatch Corrected ABE8e SURF Count

In [9]:
CORRECTED_ABE8E_SURF_COLUMNS = [
    "Chr",
    "Start",
    "Stop",
    "Strand",
    "sgRNA_Type_ABE",
    "sgRNA_Sequence.1",
]

# Set non-explainable guides as negative controls: keep ABE-explainable guides
# and relabel those with no "A" in sequence positions 2-8 (slice [2:9]).
# .copy() makes the relabelling act on an independent frame, not a slice of
# guide_info.
corrected_ABE8e_guides = guide_info.loc[
    guide_info["ABE_explainable"], CORRECTED_ABE8E_SURF_COLUMNS
].copy()
lacks_a_in_window = ~corrected_ABE8e_guides["sgRNA_Sequence.1"].str[2:9].str.contains("A")
corrected_ABE8e_guides.loc[lacks_a_in_window, "sgRNA_Type_ABE"] = "negative_control"


def _corrected_abe8e_count_table(control: str, sample: str) -> pd.DataFrame:
    """Build the SURF input table for one ABE8e comparison.

    Joins the guide annotations with the three ``ABE8e_<control>_<n>`` and three
    ``ABE8e_<sample>_<n>`` count columns on the guide index (inner join), renames
    the columns to the ``Replicate<n>_Control_Count`` / ``Replicate<n>_Sample_Count``
    scheme, and keeps only guides with more than 500 reads in every count column.

    Args:
        control: Sort-gate name used for the control columns (e.g. ``"presort"``).
        sample: Sort-gate name used for the sample columns (e.g. ``"low"``).
    """
    # Insertion order fixes the output column order: control 1-3, then sample 1-3.
    count_columns = {
        f"ABE8e_{gate}_{rep}": f"Replicate{rep}_{role}_Count"
        for gate, role in ((control, "Control"), (sample, "Sample"))
        for rep in (1, 2, 3)
    }
    table = (
        pd.merge(
            corrected_ABE8e_guides,
            corrected_count_matrix[list(count_columns)],
            left_index=True,
            right_index=True,
        )
        .reset_index(drop=True)
        .rename(
            columns={
                "sgRNA_Type_ABE": "sgRNA_Type",
                "sgRNA_Sequence.1": "sgRNA_Sequence",
                **count_columns,
            }
        )
    )
    enough_reads = (table.filter(like="_Count") > 500).all(axis=1)
    return table.loc[enough_reads, :]


corrected_ABE8e_presort_vs_low = _corrected_abe8e_count_table("presort", "low")
os.makedirs(f"{CORRECTED_ABE8E_DIR}/presort_vs_low", exist_ok=True)
corrected_ABE8e_presort_vs_low.to_csv(
    f"{CORRECTED_ABE8E_DIR}/presort_vs_low/presort_vs_low.csv", index=False
)

corrected_ABE8e_low_vs_high = _corrected_abe8e_count_table("low", "high")
os.makedirs(f"{CORRECTED_ABE8E_DIR}/low_vs_high", exist_ok=True)
corrected_ABE8e_low_vs_high.to_csv(
    f"{CORRECTED_ABE8E_DIR}/low_vs_high/low_vs_high.csv", index=False
)

In [10]:
# Run SURF_count on the pre-built count tables (no FASTQ arguments are passed).
for comparison in ("presort_vs_low", "low_vs_high"):
    comparison_dir = f"{CORRECTED_ABE8E_DIR}/{comparison}"
    count_cmd = (
        "module load singularity\n"
        f"singularity run -B {WORKING_DIR} docker://jdd056/crisprsurf:vectorization SURF_count"
        f" -f {comparison_dir}/{comparison}.csv -pert be -out_dir {comparison_dir}"
    )
    subprocess.run(
        count_cmd,
        shell=True,
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

# Mismatch Corrected ABE8e SURF Deconvolution

In [11]:
# Genomic window covered by the guides on the target chromosome.
chrom = "chr16"
guides_on_chrom = guide_info.loc[guide_info["Chr"] == chrom]
start = guides_on_chrom["Start"].min()
stop = guides_on_chrom["Stop"].max()
region = "{}:{}-{}".format(chrom, int(start), int(stop))

# SURF_deconvolution settings for this screen.
genome = "hg38"
null_dist = "negative_control"
pert_range = 6
sim_n = 2000

for comparison in ("presort_vs_low", "low_vs_high"):
    comparison_dir = f"{CORRECTED_ABE8E_DIR}/{comparison}"
    deconvolution_cmd = (
        "module load singularity\n"
        f"singularity run -B {WORKING_DIR} docker://jdd056/crisprsurf:vectorization SURF_deconvolution"
        f" -f {comparison_dir}/sgRNAs_summary_table.csv"
        f" -pert be -range {pert_range} -null_dist {null_dist} -genome {genome}"
        f" -lambda_val 1 -sim_n {sim_n} -out_dir {comparison_dir}/"
    )
    subprocess.run(
        deconvolution_cmd,
        shell=True,
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Post-process the deconvolution outputs into plot-ready tracks.
    adjust_tracks_for_plotting(results_path=f"{comparison_dir}/")

# Mismatch Corrected evoCDA SURF Count

In [12]:
CORRECTED_EVOCDA_SURF_COLUMNS = [
    "Chr",
    "Start",
    "Stop",
    "Strand",
    "sgRNA_Type_CBE",
    "sgRNA_Sequence.1",
]

# Set non-explainable guides as negative controls.
# Keep guides that are CBE-explainable or non-NGG. .copy() makes the
# relabelling below act on an independent frame, not a slice of guide_info.
evoCDA_guide_selection = guide_info["CBE_explainable"] | ~guide_info["is_ngg"]
corrected_evoCDA_guides = guide_info.loc[
    evoCDA_guide_selection, CORRECTED_EVOCDA_SURF_COLUMNS
].copy()
# Build the non-NGG mask over the same selected rows so it matches the filtered
# frame row-for-row, instead of indexing it with a mask defined on all of
# guide_info and relying on implicit realignment.
not_ngg = ~guide_info.loc[evoCDA_guide_selection, "is_ngg"]
corrected_evoCDA_guides.loc[not_ngg, "sgRNA_Type_CBE"] = "negative_control"
# Guides with no "C" in sequence positions 0-13 (slice [0:14]) are also relabelled.
lacks_c_in_window = ~corrected_evoCDA_guides["sgRNA_Sequence.1"].str[0:14].str.contains("C")
corrected_evoCDA_guides.loc[lacks_c_in_window, "sgRNA_Type_CBE"] = "negative_control"


def _corrected_evocda_count_table(control: str, sample: str) -> pd.DataFrame:
    """Build the SURF input table for one evoCDA comparison.

    Joins the guide annotations with the three ``evoCDA_<control>_<n>`` and three
    ``evoCDA_<sample>_<n>`` count columns on the guide index (inner join), renames
    the columns to the ``Replicate<n>_Control_Count`` / ``Replicate<n>_Sample_Count``
    scheme, and keeps only guides with more than 500 reads in every count column.

    Args:
        control: Sort-gate name used for the control columns (e.g. ``"presort"``).
        sample: Sort-gate name used for the sample columns (e.g. ``"low"``).
    """
    # Insertion order fixes the output column order: control 1-3, then sample 1-3.
    count_columns = {
        f"evoCDA_{gate}_{rep}": f"Replicate{rep}_{role}_Count"
        for gate, role in ((control, "Control"), (sample, "Sample"))
        for rep in (1, 2, 3)
    }
    table = (
        pd.merge(
            corrected_evoCDA_guides,
            corrected_count_matrix[list(count_columns)],
            left_index=True,
            right_index=True,
        )
        .reset_index(drop=True)
        .rename(
            columns={
                "sgRNA_Type_CBE": "sgRNA_Type",
                "sgRNA_Sequence.1": "sgRNA_Sequence",
                **count_columns,
            }
        )
    )
    enough_reads = (table.filter(like="_Count") > 500).all(axis=1)
    return table.loc[enough_reads, :]


corrected_evoCDA_presort_vs_low = _corrected_evocda_count_table("presort", "low")
os.makedirs(f"{CORRECTED_EVOCDA_DIR}/presort_vs_low", exist_ok=True)
corrected_evoCDA_presort_vs_low.to_csv(
    f"{CORRECTED_EVOCDA_DIR}/presort_vs_low/presort_vs_low.csv", index=False
)

corrected_evoCDA_low_vs_high = _corrected_evocda_count_table("low", "high")
os.makedirs(f"{CORRECTED_EVOCDA_DIR}/low_vs_high", exist_ok=True)
corrected_evoCDA_low_vs_high.to_csv(
    f"{CORRECTED_EVOCDA_DIR}/low_vs_high/low_vs_high.csv", index=False
)

In [13]:
# Run SURF_count on the pre-built count tables (no FASTQ arguments are passed).
for comparison in ("presort_vs_low", "low_vs_high"):
    comparison_dir = f"{CORRECTED_EVOCDA_DIR}/{comparison}"
    count_cmd = (
        "module load singularity\n"
        f"singularity run -B {WORKING_DIR} docker://jdd056/crisprsurf:vectorization SURF_count"
        f" -f {comparison_dir}/{comparison}.csv -pert be -out_dir {comparison_dir}"
    )
    subprocess.run(
        count_cmd,
        shell=True,
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

# Mismatch Corrected evoCDA SURF Deconvolution

In [12]:
# Genomic window covered by the guides on the target chromosome.
chrom = "chr16"
guides_on_chrom = guide_info.loc[guide_info["Chr"] == chrom]
start = guides_on_chrom["Start"].min()
stop = guides_on_chrom["Stop"].max()
region = "{}:{}-{}".format(chrom, int(start), int(stop))

# SURF_deconvolution settings for this screen.
genome = "hg38"
null_dist = "negative_control"
pert_range = 13
sim_n = 2000

for comparison in ("presort_vs_low", "low_vs_high"):
    comparison_dir = f"{CORRECTED_EVOCDA_DIR}/{comparison}"
    deconvolution_cmd = (
        "module load singularity\n"
        f"singularity run -B {WORKING_DIR} docker://jdd056/crisprsurf:vectorization SURF_deconvolution"
        f" -f {comparison_dir}/sgRNAs_summary_table.csv"
        f" -pert be -range {pert_range} -null_dist {null_dist} -genome {genome}"
        f" -lambda_val 1  -sim_n {sim_n} -out_dir {comparison_dir}/"
    )
    subprocess.run(
        deconvolution_cmd,
        shell=True,
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Post-process the deconvolution outputs into plot-ready tracks.
    adjust_tracks_for_plotting(results_path=f"{comparison_dir}/")

In [5]:
# Add Negative Control Points
# For every screen, write the per-replicate Log2FC of the chr16 negative-control
# guides as one stacked track (replicate 1 rows, then 2, then 3), positioned at
# each guide's Perturbation_Index.
for screen in ["Uncorrected_evoCDA", "Uncorrected_ABE8e", "Corrected_evoCDA", "Corrected_ABE8e"]:
    comparison_dir = f"{RESULTS_DIR}/{screen}/low_vs_high"
    guides_file = pd.read_csv(f"{comparison_dir}/sgRNAs_summary_table_updated.csv")
    guides_file = guides_file[guides_file["Chr"] == "chr16"]
    neg_controls = guides_file[guides_file["sgRNA_Type"] == "negative_control"]

    # Set Start/Stop on the negative-control rows *before* stacking. Doing it
    # afterwards relied on pandas aligning against a triplicated index, and cast
    # Perturbation_Index to int for every chr16 guide rather than only the
    # negative controls being written.
    neg_controls = neg_controls.assign(
        Start=neg_controls["Perturbation_Index"].astype(int),
        Stop=(neg_controls["Perturbation_Index"] + 1).astype(int),
    )

    neg_controls = pd.concat(
        [
            neg_controls[
                ["Chr", "Start", "Stop", f"Log2FC_Replicate{rep}", "sgRNA_Sequence"]
            ].rename(columns={f"Log2FC_Replicate{rep}": "Score"})
            for rep in (1, 2, 3)
        ]
    )
    neg_controls.to_csv(
        f"{comparison_dir}/tracks_for_plotting/neg_raw_scores.bedgraph",
        index=False,
        sep="\t",
        header=False,
    )

# Optional: sgRNAs per Significant Region

In [40]:
import glob
import shutil

# NOTE: this rebinds `guide_info` to a copy without the "sgRNA_Sequence.1"
# column; earlier cells that read that column must not be re-run after this
# one without reloading guide_info first.
guide_info = pd.read_csv(GUIDE_INFO_PATH, sep="\t", index_col=0).drop(columns="sgRNA_Sequence.1")

for sig_region_path in glob.glob(f"{RESULTS_DIR}/*/*/significant_regions.csv"):
    comparison_dir = os.path.dirname(sig_region_path)
    region_dir = os.path.join(comparison_dir, "significant_regions")

    # Start from an empty output directory so files from a previous run do not linger.
    if os.path.exists(region_dir):
        shutil.rmtree(region_dir)
    os.makedirs(region_dir, exist_ok=True)

    library_file = pd.read_csv(
        os.path.join(comparison_dir, "sgRNAs_summary_table_updated.csv"),
        index_col="sgRNA_Sequence",
    )
    if "Unnamed: 0" in library_file.columns:
        # DataFrame.drop returns a new frame; the result must be assigned.
        # Previously it was discarded, so the stray column stayed in place and
        # shifted the positional `iloc[:, 6:]` selection below by one.
        library_file = library_file.drop(columns="Unnamed: 0")

    # Skip the header line; every remaining line describes one significant region.
    with open(sig_region_path, "r") as sig_region_file:
        sig_regions = sig_region_file.readlines()[1:]

    for idx, sig_region in enumerate(sig_regions):
        fields = sig_region.strip().split(",")
        # Fields from position 9 onward are used as guide-sequence index labels;
        # field 4 is embedded in the output file name.
        sig_guides = fields[9:]
        sig_guide_df = pd.concat(
            [guide_info.loc[sig_guides], library_file.loc[sig_guides].iloc[:, 6:]],
            axis=1,
        )
        sig_guide_df.to_csv(
            os.path.join(region_dir, f"sig_region_{idx}_{fields[4]}.csv")
        )