# Run CRISPResso2 on EvoCDA Pooled Data

### Import Packages

In [None]:
import pandas as pd
import os
from Bio import SeqIO
from Bio.Seq import Seq

### Download Data

As an example, download the EvoCDA endogenous data from GEO that was used in the CRISPR-CLEAR manuscript: [GSE278581](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE278581)



- GSM8549757	evoCDA, replicate 1, presort, pooled, endogenous
- GSM8549758	evoCDA, replicate 1, CD19 positive, pooled, endogenous
- GSM8549759	evoCDA, replicate 1, CD19 negative, pooled, endogenous
- GSM8549760	evoCDA, replicate 2, presort, pooled, endogenous
- GSM8549761	evoCDA, replicate 2, CD19 positive, pooled, endogenous
- GSM8549762	evoCDA, replicate 2, CD19 negative, pooled, endogenous
- GSM8549763	evoCDA, replicate 3, presort, pooled, endogenous
- GSM8549764	evoCDA, replicate 3, CD19 positive, pooled, endogenous
- GSM8549765	evoCDA, replicate 3, CD19 negative, pooled, endogenous
- GSM8549766	evoCDA, unedited, pooled, endogenous

Alternatively, you can download the data from [Zenodo](https://doi.org/10.5281/zenodo.13737880), which may be easier: after downloading, unzip the file `CRISPR-CLEAR-data/data/raw_FASTQs/endogenous_sequencing/EvoCDA_pooled.zip`.



### Prepare sample sheet for EvoCDA data

In [None]:
# Set the directory of the downloaded data, replace with your directory.
# The cells below list this directory, optionally rename files inside it, and
# pick the EvoCDA R1/R2 FASTQ files from it.
unmerged_all_dir = "/data/pinello/PROJECTS/2021_11_BB_Shared_Tiling_Screen_Analysis/20220504_Davide_pilot_BE_analysis/Base_editors_screen_data/"

In [None]:
# OPTIONAL: In all filenames, replace the '#' character with '_' to allow correct parsing of the filename in CRISPResso2
def replicate_hashtag(directory_path):
    """Rename every entry of ``directory_path`` whose name contains '#', swapping '#' for '_'.

    Prints the directory being processed and one line per entry handled. An
    entry is skipped (with a printed notice) when the sanitized name already
    exists, so the rename never overwrites an existing file.

    Args:
        directory_path: Directory whose immediate entries are inspected.
    """
    print(directory_path)
    for filename in os.listdir(directory_path):
        if '#' not in filename:
            continue

        new_filename = filename.replace('#', '_')
        old_file_path = os.path.join(directory_path, filename)
        new_file_path = os.path.join(directory_path, new_filename)

        # os.rename silently replaces an existing destination on POSIX;
        # refuse the rename instead of losing the existing file.
        if os.path.exists(new_file_path):
            print('Skipped: {} because {} already exists'.format(filename, new_filename))
            continue

        os.rename(old_file_path, new_file_path)
        print('Renamed: {} to {}'.format(filename, new_filename))
            
# Sanitize the download directory in place before its filenames are listed in the next cell
replicate_hashtag(unmerged_all_dir)

In [None]:
# Get filenames from directory.
# os.listdir returns entries in arbitrary order; the R1/R2 lists derived from
# this are sorted in a later cell.
unmerged_all_fns = os.listdir(unmerged_all_dir)

In [None]:
unmerged_all_fns

In [None]:
# Split the directory listing into R1 and R2 files.
# A filename qualifies when it contains "EvoCDA" and "gz" and does not contain "m7".
r1_unmerged_fastq = []
r2_unmerged_fastq = []
for fn in unmerged_all_fns:
    if "EvoCDA" not in fn or "gz" not in fn or "m7" in fn:
        continue
    if "R1" in fn:
        r1_unmerged_fastq.append(fn)
    if "R2" in fn:
        r2_unmerged_fastq.append(fn)

In [None]:
r1_unmerged_fastq

In [None]:
r2_unmerged_fastq 

In [None]:
# Order both lists alphabetically so that the same list index refers to the
# R1 and R2 file of one sample.
# NOTE(review): assumes paired filenames differ only in the R1/R2 tag — confirm
r1_unmerged_fastq, r2_unmerged_fastq = sorted(r1_unmerged_fastq), sorted(r2_unmerged_fastq)

In [None]:
# Add the WT presort pair by hand: its filenames do not contain "EvoCDA", so
# the R1/R2 filter does not pick them up. It is appended last, matching the
# final entry of the metadata lists.
# Guarded so that re-running this cell does not append the pair a second time,
# which would make the FASTQ lists longer than the metadata lists.
wt_r1_fastq = 'DS_AK_NALM6_BEscreen_WT_1_presort_S1_L001_R1_001.fastq.gz'
wt_r2_fastq = 'DS_AK_NALM6_BEscreen_WT_1_presort_S1_L001_R2_001.fastq.gz'
if wt_r1_fastq not in r1_unmerged_fastq:
    r1_unmerged_fastq.append(wt_r1_fastq)
if wt_r2_fastq not in r2_unmerged_fastq:
    r2_unmerged_fastq.append(wt_r2_fastq)

In [None]:
# Prepare the sample metadata values: nine evoCDA entries (three populations
# for each of three replicates) followed by one no-editor presort entry.
# NOTE(review): assumes the sorted FASTQ lists order each replicate as
# low, high, presort — confirm against the displayed sample sheet.
populations_per_replicate = ["low", "high", "presort"]
replicate_ids = (1, 2, 3)
n_edited_samples = len(populations_per_replicate) * len(replicate_ids)

editors_list = ["evoCDA" for _ in range(n_edited_samples)] + ["no_editor"]
population_list = populations_per_replicate * len(replicate_ids) + ["presort"]
replicate_list = [rep for rep in replicate_ids for _ in populations_per_replicate] + [1]
experiment_list = ["evoCDA_pooled" for _ in range(n_edited_samples)] + ["mock"]

In [None]:
# Assemble the sample sheet (one row per sample) from the metadata and FASTQ
# lists, fix the column order, and write it out as a tab-separated file.
sample_sheet_column_order = ["editors", "population", "replicate", "experiment", "R1_FASTQ", "R2_FASTQ"]
sample_sheet_column_values = [
    editors_list,
    population_list,
    replicate_list,
    experiment_list,
    r1_unmerged_fastq,
    r2_unmerged_fastq,
]

sample_sheet_df = pd.DataFrame(dict(zip(sample_sheet_column_order, sample_sheet_column_values)))
sample_sheet_df = sample_sheet_df.loc[:, sample_sheet_column_order]

sample_sheet_df.to_csv(
    "/data/pinello/PROJECTS/2024_01_EC_Crispresso_Millipede_Reporting/CD19/Metadata/20230922_evoCDA_sample_sheet.tsv",
    sep="\t",
)

In [None]:
sample_sheet_df

### Read in guide sequences

In [None]:
# Read in the guide library file (contained in the repository); replace with your filepath.
guide_library_path = "/data/pinello/PROJECTS/2023_08_CD19_manuscript/Core_Data/CD19_sgRNA_count_libraries_DS.txt"
pooled_guide_sequence_df = pd.read_table(guide_library_path)

# Column headers are stripped of surrounding whitespace before the "sgRNA" lookup
pooled_guide_sequence_df.columns = [header.strip() for header in pooled_guide_sequence_df.columns]

# Guide sequences as plain Python strings
pooled_guide_sequences = list(map(str, pooled_guide_sequence_df["sgRNA"].values.tolist()))

In [None]:
pooled_guide_sequences

## Run CRISPRessoBatch

Prepare amplicon sequence and guide list

In [None]:
# Set the amplicon sequence and keep plain-string copies of both strands
amplicon_record = Seq("ACATGCTCTAGTGAAAGCCAGTCTGGGCAGCTGGGTAGCTAATGAGGGGATTAGAGAGATTTTGTTGAATGAAAGGCAGATTGAGTCCTGCTACTCGCCCCCTTCATTCCCCTTCATTCATGCCTCATTCTTCCGCCTCCCAGCCGCCTCAACTGGCCAAAGGGAAGTGGAGGCCCTGCCACCTGTAGGGAGGGTCCCCTGGGGCTTGCCCACAGCAAACAGGAAGTCACAGCCTGGTGAGATGGGCCTGGGAATCAGCCACTGAGAAAGTGGGTCTCTTGGGTCCCTGAATTCTTTTTCTGAGTCCCTGCAGCAGTGAAAAAGACACAGAGGCACATAGAGAGTG")
pooled_amplicon_sequence = str(amplicon_record)
pooled_amplicon_sequence_revcomp = str(amplicon_record.reverse_complement())

# Keep only guides that occur in full on either strand of the amplicon
# (guides that overhang the amplicon edges are dropped)
guide_sequences_present = []
for guide in pooled_guide_sequences:
    if guide in pooled_amplicon_sequence or guide in pooled_amplicon_sequence_revcomp:
        guide_sequences_present.append(guide)

# For visualization purposes, one guide roughly in the middle of each reference (does not need to be exact)
pooled_middle_guide = "GAGGCCCTGCCACCTGTAGG"  # For the merged read
r1_middle_guide = "ATTCATGCCTCATTCTTCCG"  # For the R1 read
r2_middle_guide = "ACAGCAAACAGGAAGTCACA"  # For the R2 read

In [None]:
guide_sequences_present

In [None]:
print("There are {} guides present in amplicon sequence out of {}".format(len(guide_sequences_present), len(pooled_guide_sequences)))

Prepare filenames

In [None]:
# Prepare R1 and R2 FASTQ paths for the CRISPRessoBatch settings files.
# The directory is prepended by plain string concatenation, so it must end with "/".
filenames_single_end_dir = "/data/pinello/PROJECTS/2021_11_BB_Shared_Tiling_Screen_Analysis/20220504_Davide_pilot_BE_analysis/Base_editors_screen_data/"

# Sample name = experiment, population, and replicate joined by "_"
name_suffix_columns = sample_sheet_df[["population", "replicate"]].astype(str)
names = sample_sheet_df["experiment"].str.cat(name_suffix_columns, sep="_")
sample_sheet_df["names"] = names

filenames_R1, filenames_R2 = (
    filenames_single_end_dir + sample_sheet_df[fastq_column]
    for fastq_column in ("R1_FASTQ", "R2_FASTQ")
)

In [None]:
for fn in filenames_R1:
    print(fn)

Create settings files for pooled screen

In [None]:
# Select the samples to batch: (position, name) pairs whose name contains
# "pooled" or "mock_presort_1"
batch_name_tags = ("pooled", "mock_presort_1")
pooled_names_enumeration = [
    (position, sample_name)
    for position, sample_name in enumerate(names)
    if any(tag in sample_name for tag in batch_name_tags)
]

In [None]:
pooled_names_enumeration

In [None]:
# Create directory for all the different settings files.
# The settings files are opened for writing inside this directory, so it must
# exist first; exist_ok=True keeps the cell safe to re-run.
settings_output_dir = "./20240122_CRISPResso2Batch_Settings"
os.makedirs(settings_output_dir, exist_ok=True)

In [None]:
# Create and write the settings files
evoCDA_pooled_unmerged_PE_settings_fn = 'Davide_CD19_enhancer_BE_screen_evoCDA_pooled_unmerged_PE.settings'
evoCDA_pooled_unmerged_PE_middle_settings_fn = 'Davide_CD19_enhancer_BE_screen_evoCDA_pooled_unmerged_PE_middle.settings'

evoCDA_pooled_unmerged_R1_settings_fn = 'Davide_CD19_enhancer_BE_screen_evoCDA_pooled_unmerged_R1.settings'
evoCDA_pooled_unmerged_R1_middle_settings_fn = 'Davide_CD19_enhancer_BE_screen_evoCDA_pooled_unmerged_R1_middle.settings'

evoCDA_pooled_unmerged_R2_settings_fn = 'Davide_CD19_enhancer_BE_screen_evoCDA_pooled_unmerged_R2.settings'
evoCDA_pooled_unmerged_R2_middle_settings_fn = 'Davide_CD19_enhancer_BE_screen_evoCDA_pooled_unmerged_R2_middle.settings'


def _write_batch_settings(settings_fn, fastq_columns, guide_seq):
    """Write one tab-separated batch settings file into ``settings_output_dir``.

    The header is ``name``, then ``fastq_r1`` (and ``fastq_r2`` when two FASTQ
    columns are given), then ``guide_seq``. One row is written per sample in
    ``pooled_names_enumeration``, with the sample name prefixed by ``sample_``.

    Args:
        settings_fn: File name of the settings file to create.
        fastq_columns: List of one or two path collections indexed by sample position.
        guide_seq: Value written to the ``guide_seq`` column of every row.
    """
    header_fields = ['name']
    header_fields += ['fastq_r%d' % (column_number + 1) for column_number in range(len(fastq_columns))]
    header_fields.append('guide_seq')

    with open(settings_output_dir + "/" + settings_fn, 'w') as out_handle:
        out_handle.write('\t'.join(header_fields) + '\n')
        for sample_index, _ in pooled_names_enumeration:
            row_fields = ['sample_%s' % names[sample_index]]
            row_fields += ['%s' % fastq_column[sample_index] for fastq_column in fastq_columns]
            row_fields.append(guide_seq)
            out_handle.write('\t'.join(row_fields) + '\n')


# Comma-separated list of every guide found in the amplicon
all_present_guides = ",".join(guide_sequences_present)

# Un-merged settings (Provide both R1 and R2, CRISPResso2 will do merging)
_write_batch_settings(evoCDA_pooled_unmerged_PE_settings_fn, [filenames_R1, filenames_R2], all_present_guides)
_write_batch_settings(evoCDA_pooled_unmerged_PE_middle_settings_fn, [filenames_R1, filenames_R2], pooled_middle_guide)

# R1 settings (only provide R1, no merging)
_write_batch_settings(evoCDA_pooled_unmerged_R1_settings_fn, [filenames_R1], all_present_guides)
_write_batch_settings(evoCDA_pooled_unmerged_R1_middle_settings_fn, [filenames_R1], r1_middle_guide)

# R2 settings (only provide R2, no merging); the R2 file goes in the fastq_r1 column
_write_batch_settings(evoCDA_pooled_unmerged_R2_settings_fn, [filenames_R2], all_present_guides)
_write_batch_settings(evoCDA_pooled_unmerged_R2_middle_settings_fn, [filenames_R2], r2_middle_guide)


Prepare CRISPRessoBatch output directories and run names

In [None]:
# Set the directory of all CRISPResso2 outputs.
# exist_ok=True lets this cell be re-run without raising FileExistsError once
# the directory already exists.
root_results_output_dir = "./20240122_CRISPResso2Batch_Results"
os.makedirs(root_results_output_dir, exist_ok=True)

In [None]:
# Set the directory of CRISPResso2 sub-run outputs
results_output_dir = root_results_output_dir + "/evoCDA_pooled"

evoCDA_pooled_unmerged_PE_results_dir = results_output_dir + "/unmerged_PE"
evoCDA_pooled_unmerged_PE_middle_results_dir = results_output_dir + "/unmerged_PE_middle"

evoCDA_pooled_unmerged_R1_results_dir = results_output_dir + "/unmerged_R1"
evoCDA_pooled_unmerged_R1_middle_results_dir = results_output_dir + "/unmerged_R1_middle"

evoCDA_pooled_unmerged_R2_results_dir = results_output_dir + "/unmerged_R2"
evoCDA_pooled_unmerged_R2_middle_results_dir = results_output_dir + "/unmerged_R2_middle"

# Create one output directory per run. exist_ok=True keeps the cell re-runnable:
# without it the second execution raises FileExistsError.
for run_results_dir in (
    evoCDA_pooled_unmerged_PE_results_dir,
    evoCDA_pooled_unmerged_PE_middle_results_dir,
    evoCDA_pooled_unmerged_R1_results_dir,
    evoCDA_pooled_unmerged_R1_middle_results_dir,
    evoCDA_pooled_unmerged_R2_results_dir,
    evoCDA_pooled_unmerged_R2_middle_results_dir,
):
    os.makedirs(run_results_dir, exist_ok=True)

In [None]:
# Set the name of each CRISPResso2 run: a shared prefix plus the read mode
# (PE, R1, R2), with a "_middle" variant for each mode
run_name_prefix = "evoCDA_pooled_unmerged_"
(
    evoCDA_pooled_unmerged_PE_name,
    evoCDA_pooled_unmerged_PE_middle_name,
    evoCDA_pooled_unmerged_R1_name,
    evoCDA_pooled_unmerged_R1_middle_name,
    evoCDA_pooled_unmerged_R2_name,
    evoCDA_pooled_unmerged_R2_middle_name,
) = (
    run_name_prefix + mode
    for mode in ("PE", "PE_middle", "R1", "R1_middle", "R2", "R2_middle")
)

Run CRISPRessoBatch

In [None]:
%%time

# evoCDA_pooled_unmerged_PE: Run on CRISPResso2 merged R1+R2 for each guide individually.
# Inputs: the PE settings file (R1 and R2 paths plus every guide found in the amplicon)
# and the full amplicon as reference. Output goes to the unmerged_PE results directory.
# A small plot window (--plot_window_size 4) is used for the per-guide plots.
# NOTE(review): -p 64 presumably requests 64 parallel processes — lower it on smaller machines.
!CRISPRessoBatch -bs {settings_output_dir + "/" + evoCDA_pooled_unmerged_PE_settings_fn} -a {pooled_amplicon_sequence} \
-an cd19 -q 30 \
--exclude_bp_from_left 3 \
--exclude_bp_from_right 3 --no_rerun -n {evoCDA_pooled_unmerged_PE_name} \
--min_frequency_alleles_around_cut_to_plot 0.001 --max_rows_alleles_around_cut_to_plot 500 -p 64  \
--plot_window_size 4 --base_editor_output -w 0 -bo {evoCDA_pooled_unmerged_PE_results_dir}

In [None]:
%%time

# evoCDA_pooled_unmerged_PE_middle: Run on CRISPResso2 merged R1+R2 for middle guide to view entire amplicon.
# Inputs: the PE "middle" settings file (R1 and R2 paths plus the single pooled_middle_guide)
# and the full amplicon as reference. Output goes to the unmerged_PE_middle results directory.
# The plot window is widened (--plot_window_size 85) compared with the per-guide run (4).
# NOTE(review): -p 64 presumably requests 64 parallel processes — lower it on smaller machines.
!CRISPRessoBatch -bs {settings_output_dir + "/" + evoCDA_pooled_unmerged_PE_middle_settings_fn} -a {pooled_amplicon_sequence} \
-an cd19 -q 30 \
--exclude_bp_from_left 3 \
--exclude_bp_from_right 3 --no_rerun -n {evoCDA_pooled_unmerged_PE_middle_name} \
--min_frequency_alleles_around_cut_to_plot 0.001 --max_rows_alleles_around_cut_to_plot 500 -p 64  \
--plot_window_size 85 --base_editor_output -w 0 -bo {evoCDA_pooled_unmerged_PE_middle_results_dir}

In [None]:
%%time

# evoCDA_pooled_unmerged_R1: Run on R1-only for each guide individually.
# The reference is the first 250 bases of the amplicon (pooled_amplicon_sequence[:250]).
# NOTE(review): the guide list in the settings file was filtered against the full amplicon,
# not this truncated reference — confirm CRISPResso2 accepts guides that fall outside it.
# NOTE(review): -p 64 presumably requests 64 parallel processes — lower it on smaller machines.
!CRISPRessoBatch -bs {settings_output_dir + "/" + evoCDA_pooled_unmerged_R1_settings_fn} -a {pooled_amplicon_sequence[:250]} \
-an cd19 -q 30 \
--exclude_bp_from_left 3 \
--exclude_bp_from_right 3 --no_rerun -n {evoCDA_pooled_unmerged_R1_name} \
--min_frequency_alleles_around_cut_to_plot 0.001 --max_rows_alleles_around_cut_to_plot 500 -p 64  \
--plot_window_size 4 --base_editor_output -w 0 -bo {evoCDA_pooled_unmerged_R1_results_dir}

In [None]:
%%time

# evoCDA_pooled_unmerged_R1_middle: Run on R1-only for middle guide to view entire amplicon.
# The reference is the first 250 bases of the amplicon (pooled_amplicon_sequence[:250]) and the
# settings file carries the single r1_middle_guide.
# The plot window is widened (--plot_window_size 42) compared with the per-guide R1 run (4).
# NOTE(review): -p 64 presumably requests 64 parallel processes — lower it on smaller machines.
!CRISPRessoBatch -bs {settings_output_dir + "/" + evoCDA_pooled_unmerged_R1_middle_settings_fn} -a {pooled_amplicon_sequence[:250]} \
-an cd19 -q 30 \
--exclude_bp_from_left 3 \
--exclude_bp_from_right 3 --no_rerun -n {evoCDA_pooled_unmerged_R1_middle_name} \
--min_frequency_alleles_around_cut_to_plot 0.001 --max_rows_alleles_around_cut_to_plot 500 -p 64  \
--plot_window_size 42 --base_editor_output -w 0 -bo {evoCDA_pooled_unmerged_R1_middle_results_dir}

In [None]:
%%time

# evoCDA_pooled_unmerged_R2: Run on R2-only for each guide individually.
# The reference is the last 250 bases of the amplicon (pooled_amplicon_sequence[-250:]); the
# settings file lists the R2 FASTQ under the fastq_r1 column so it is treated as single-end input.
# NOTE(review): the guide list in the settings file was filtered against the full amplicon,
# not this truncated reference — confirm CRISPResso2 accepts guides that fall outside it.
# NOTE(review): -p 64 presumably requests 64 parallel processes — lower it on smaller machines.
!CRISPRessoBatch -bs {settings_output_dir + "/" + evoCDA_pooled_unmerged_R2_settings_fn} -a {pooled_amplicon_sequence[-250:]} \
-an cd19 -q 30 \
--exclude_bp_from_left 3 \
--exclude_bp_from_right 3 --no_rerun -n {evoCDA_pooled_unmerged_R2_name} \
--min_frequency_alleles_around_cut_to_plot 0.001 --max_rows_alleles_around_cut_to_plot 500 -p 64  \
--plot_window_size 4 --base_editor_output -w 0 -bo {evoCDA_pooled_unmerged_R2_results_dir}

In [None]:
%%time

# evoCDA_pooled_unmerged_R2_middle: Run on R2-only for middle guide to view entire amplicon.
# The reference is the last 250 bases of the amplicon (pooled_amplicon_sequence[-250:]) and the
# settings file carries the single r2_middle_guide, with the R2 FASTQ under the fastq_r1 column.
# The plot window is widened (--plot_window_size 42) compared with the per-guide R2 run (4).
# NOTE(review): -p 64 presumably requests 64 parallel processes — lower it on smaller machines.
!CRISPRessoBatch -bs {settings_output_dir + "/" + evoCDA_pooled_unmerged_R2_middle_settings_fn} -a {pooled_amplicon_sequence[-250:]} \
-an cd19 -q 30 \
--exclude_bp_from_left 3 \
--exclude_bp_from_right 3 --no_rerun -n {evoCDA_pooled_unmerged_R2_middle_name} \
--min_frequency_alleles_around_cut_to_plot 0.001 --max_rows_alleles_around_cut_to_plot 500 -p 64  \
--plot_window_size 42 --base_editor_output -w 0 -bo {evoCDA_pooled_unmerged_R2_middle_results_dir}