# Run CRISPResso2 on ABE8e sg218 Data

### Import Packages

In [2]:
import pandas as pd
import os

In [3]:
# Install BioPython "conda install conda-forge::biopython" 
from Bio import SeqIO
from Bio.Seq import Seq

### Download Data

As an example, download the sg219 endogenous data from GEO that was used in the CRISPR-CLEAR manuscript: [GSE278581](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE278581)



- GSM8549747	ABE8e, unedited, pooled, endogenous
- GSM8549748	ABE8e, replicate 1, presort, sg218
- GSM8549749	ABE8e, replicate 1, CD19 positive, sg218
- GSM8549750	ABE8e, replicate 1, CD19 negative, sg218
- GSM8549751	ABE8e, replicate 2, presort, sg218
- GSM8549752	ABE8e, replicate 2, CD19 positive, sg218
- GSM8549753	ABE8e, replicate 2, CD19 negative, sg218
- GSM8549754	ABE8e, replicate 3, presort, sg218
- GSM8549755	ABE8e, replicate 3, CD19 positive, sg218
- GSM8549756	ABE8e, replicate 3, CD19 negative, sg218

Alternatively, you can download the data from [Zenodo](https://doi.org/10.5281/zenodo.13737880), which may be easier: unzip the file `CRISPR-CLEAR-data/data/raw_FASTQs/endogenous_sequencing/sg219_fastqs.zip`.



### Prepare sample sheet

In [4]:
# Directory holding the downloaded FASTQ files; replace with your own directory.
# Every later cell that lists or renames FASTQ files reads from this path.
unmerged_all_dir = "/data/pinello/PROJECTS/2023_08_CD19_manuscript/Core_Data/ABE8e_Pooled_Endogenous_Redo_AND_ABE8e_sg219_Endogenous_AND_EvoCDA_Hits_Endogenous/ProcessedData/BSF_1488_000000000-KJJ4P_L1"

In [6]:
# OPTIONAL: In all filenames, need to replace '#' character with '_' to allow correct parsing of filename in CRISPResso2
def replicate_hashtag(directory_path):
    """Rename every entry of ``directory_path`` whose name contains '#'.

    Each '#' in the entry name is replaced with '_'. The directory path is
    printed first, then one 'Renamed: ...' line per renamed entry.
    Entries without '#' are left untouched. Returns None.
    """
    print(directory_path)
    for entry_name in os.listdir(directory_path):
        if '#' not in entry_name:
            continue  # nothing to sanitize for this entry

        sanitized_name = entry_name.replace('#', '_')
        os.rename(
            os.path.join(directory_path, entry_name),
            os.path.join(directory_path, sanitized_name),
        )
        print('Renamed: {} to {}'.format(entry_name, sanitized_name))
            
# Rename the files in the data directory in place (prints the directory, then one line per rename)
replicate_hashtag(unmerged_all_dir)

/data/pinello/PROJECTS/2023_08_CD19_manuscript/Core_Data/ABE8e_Pooled_Endogenous_Redo_AND_ABE8e_sg219_Endogenous_AND_EvoCDA_Hits_Endogenous/ProcessedData/BSF_1488_000000000-KJJ4P_L1


In [7]:
# List the entry names in the data directory (bare names, without the directory prefix)
unmerged_all_fns = os.listdir(unmerged_all_dir)

In [26]:
# Keep only the PAX5sg files, split into the R1 list and the R2 list
r1_unmerged_fastq = [fastq_fn for fastq_fn in unmerged_all_fns if "PAX5sg" in fastq_fn and "R1.fastq" in fastq_fn]
r2_unmerged_fastq = [fastq_fn for fastq_fn in unmerged_all_fns if "PAX5sg" in fastq_fn and "R2.fastq" in fastq_fn]


In [27]:
# Sort the R1 and R2 files so that corresponding files are in same list index
r1_unmerged_fastq = sorted(r1_unmerged_fastq)
r2_unmerged_fastq = sorted(r2_unmerged_fastq)

# Sorting the two lists independently only lines up mates when every R1 file has
# its R2 counterpart, so fail fast instead of silently mispairing samples.
# Assumes mate filenames differ only by "R1.fastq" vs "R2.fastq" - adjust if yours differ.
assert [fn.replace("R1.fastq", "R2.fastq") for fn in r1_unmerged_fastq] == r2_unmerged_fastq, (
    "R1/R2 FASTQ lists do not pair up by index; check for missing or extra files"
)

In [28]:
r1_unmerged_fastq

['BSF_1488_000000000-KJJ4P_1_CD19_high_PAX5sg_1_S139294.R1.fastq',
 'BSF_1488_000000000-KJJ4P_1_CD19_high_PAX5sg_2_S139299.R1.fastq',
 'BSF_1488_000000000-KJJ4P_1_CD19_high_PAX5sg_3_S139300.R1.fastq',
 'BSF_1488_000000000-KJJ4P_1_CD19_low_PAX5sg_1_S139295.R1.fastq',
 'BSF_1488_000000000-KJJ4P_1_CD19_low_PAX5sg_2_S139296.R1.fastq',
 'BSF_1488_000000000-KJJ4P_1_CD19_low_PAX5sg_3_S139301.R1.fastq',
 'BSF_1488_000000000-KJJ4P_1_CD19_presort_PAX5sg_1_S139292.R1.fastq',
 'BSF_1488_000000000-KJJ4P_1_CD19_presort_PAX5sg_2_S139297.R1.fastq',
 'BSF_1488_000000000-KJJ4P_1_CD19_presort_PAX5sg_3_S139298.R1.fastq']

In [29]:
# Per-sample metadata, in the same order as the sorted FASTQ lists:
# 3 populations (high, low, presort) x 3 replicates = 9 samples
editors_list = ["ABE8e" for _ in range(9)]
population_list = [population for population in ("high", "low", "presort") for _ in range(3)]
replicate_list = [replicate for _ in range(3) for replicate in (1, 2, 3)]
experiment_list = ["ABE8e_PAX5sg" for _ in range(9)]

In [33]:
# Assemble the sample sheet (one row per sample) from the metadata lists and the
# paired FASTQ filename lists, then write it out as a tab-separated file
sample_sheet_df = pd.DataFrame(
    data={
        "editors": editors_list,
        "population": population_list,
        "replicate": replicate_list,
        "experiment": experiment_list,
        "R1_FASTQ": r1_unmerged_fastq,
        "R2_FASTQ": r2_unmerged_fastq,
    },
    # Explicit column order for the saved sheet
    columns=["editors", "population", "replicate", "experiment", "R1_FASTQ", "R2_FASTQ"],
)

sample_sheet_df.to_csv("/data/pinello/PROJECTS/2023_08_CD19_manuscript/Core_Data/ABE8e_Pooled_Endogenous_Redo_AND_ABE8e_sg219_Endogenous_AND_EvoCDA_Hits_Endogenous/MetaData/20230922_sample_sheet_2.tsv", sep="\t")

In [34]:
sample_sheet_df

Unnamed: 0,editors,population,replicate,experiment,R1_FASTQ,R2_FASTQ
0,ABE8e,high,1,ABE8e_PAX5sg,BSF_1488_000000000-KJJ4P_1_CD19_high_PAX5sg_1_...,BSF_1488_000000000-KJJ4P_1_CD19_high_PAX5sg_1_...
1,ABE8e,high,2,ABE8e_PAX5sg,BSF_1488_000000000-KJJ4P_1_CD19_high_PAX5sg_2_...,BSF_1488_000000000-KJJ4P_1_CD19_high_PAX5sg_2_...
2,ABE8e,high,3,ABE8e_PAX5sg,BSF_1488_000000000-KJJ4P_1_CD19_high_PAX5sg_3_...,BSF_1488_000000000-KJJ4P_1_CD19_high_PAX5sg_3_...
3,ABE8e,low,1,ABE8e_PAX5sg,BSF_1488_000000000-KJJ4P_1_CD19_low_PAX5sg_1_S...,BSF_1488_000000000-KJJ4P_1_CD19_low_PAX5sg_1_S...
4,ABE8e,low,2,ABE8e_PAX5sg,BSF_1488_000000000-KJJ4P_1_CD19_low_PAX5sg_2_S...,BSF_1488_000000000-KJJ4P_1_CD19_low_PAX5sg_2_S...
5,ABE8e,low,3,ABE8e_PAX5sg,BSF_1488_000000000-KJJ4P_1_CD19_low_PAX5sg_3_S...,BSF_1488_000000000-KJJ4P_1_CD19_low_PAX5sg_3_S...
6,ABE8e,presort,1,ABE8e_PAX5sg,BSF_1488_000000000-KJJ4P_1_CD19_presort_PAX5sg...,BSF_1488_000000000-KJJ4P_1_CD19_presort_PAX5sg...
7,ABE8e,presort,2,ABE8e_PAX5sg,BSF_1488_000000000-KJJ4P_1_CD19_presort_PAX5sg...,BSF_1488_000000000-KJJ4P_1_CD19_presort_PAX5sg...
8,ABE8e,presort,3,ABE8e_PAX5sg,BSF_1488_000000000-KJJ4P_1_CD19_presort_PAX5sg...,BSF_1488_000000000-KJJ4P_1_CD19_presort_PAX5sg...


### Read in guide sequences

In [35]:
# Load the guide library table (contained in the repository); replace with your own filepath
guide_library_fn = "/data/pinello/PROJECTS/2023_08_CD19_manuscript/Core_Data/CD19_sgRNA_count_libraries_DS.txt"
pooled_guide_sequence_df = pd.read_table(guide_library_fn)
# Strip surrounding whitespace from the column headers before looking up "sgRNA" by name
pooled_guide_sequence_df.columns = list(map(str.strip, pooled_guide_sequence_df.columns))
# Guide sequences as a plain list of Python strings
pooled_guide_sequences = list(map(str, pooled_guide_sequence_df["sgRNA"].values.tolist()))

  This is separate from the ipykernel package so we can avoid doing imports until


In [38]:
# Metadata for the validation guide analysed in this notebook (one entry per list = one table row)
editor = ["ABE8e"]
sgID = ["219"]
guide_sequence = ["ACAGGAAGTCACAGCCTGGT"]

validation_origin = ["ABE8e_v1_endogenous"]


CD19_guide_validation_sequences_df = pd.DataFrame({
    "editor": editor,
    "sgID": sgID,
    "screen_origin": validation_origin,
    "guide_sequence": guide_sequence
})

CD19_guide_validation_sequences_df = CD19_guide_validation_sequences_df.loc[:, ["editor", "sgID", "screen_origin", "guide_sequence"]]

# Add the field corresponding to the CRISPRessoBatch sample name:
CD19_guide_validation_sequences_df["crispresso_name"] = CD19_guide_validation_sequences_df["editor"] + "_" + CD19_guide_validation_sequences_df["sgID"]
# Override row 0 so it matches the "ABE8e_PAX5sg" experiment label used in the sample names.
# Use a single .loc assignment: chained indexing (df["col"][0] = ...) triggers
# SettingWithCopyWarning and does not modify the frame under pandas Copy-on-Write.
CD19_guide_validation_sequences_df.loc[0, "crispresso_name"] = "ABE8e_PAX5sg"
# Save the validation-guide table as a tab-separated file; replace with your own output path
CD19_guide_validation_sequences_df.to_csv("/data/pinello/PROJECTS/2023_08_CD19_manuscript/Core_Data/20230922_CD19_guide_validation_sequences_2.tsv", sep="\t")

In [39]:
CD19_guide_validation_sequences_df

Unnamed: 0,editor,sgID,screen_origin,guide_sequence,crispresso_name
0,ABE8e,219,ABE8e_v1_endogenous,ACAGGAAGTCACAGCCTGGT,ABE8e_PAX5sg


## Run CRISPRessoBatch

Prepare amplicon sequence and guide list

In [41]:
pooled_amplicon_sequence = Seq("ACATGCTCTAGTGAAAGCCAGTCTGGGCAGCTGGGTAGCTAATGAGGGGATTAGAGAGATTTTGTTGAATGAAAGGCAGATTGAGTCCTGCTACTCGCCCCCTTCATTCCCCTTCATTCATGCCTCATTCTTCCGCCTCCCAGCCGCCTCAACTGGCCAAAGGGAAGTGGAGGCCCTGCCACCTGTAGGGAGGGTCCCCTGGGGCTTGCCCACAGCAAACAGGAAGTCACAGCCTGGTGAGATGGGCCTGGGAATCAGCCACTGAGAAAGTGGGTCTCTTGGGTCCCTGAATTCTTTTTCTGAGTCCCTGCAGCAGTGAAAAAGACACAGAGGCACATAGAGAGTG")

# Convert the amplicon and its reverse complement to plain strings for substring matching
pooled_amplicon_sequence_revcomp = str(pooled_amplicon_sequence.reverse_complement())
pooled_amplicon_sequence = str(pooled_amplicon_sequence)

# Keep the library guides that occur in the amplicon or in its reverse complement
guide_sequences_present = [
    guide
    for guide in pooled_guide_sequences
    if guide in pooled_amplicon_sequence or guide in pooled_amplicon_sequence_revcomp
]

pooled_middle_guide = "GAGGCCCTGCCACCTGTAGG"  # Set guide in middle of amplicon for visualization
r1_middle_guide = "ATTCATGCCTCATTCTTCCG"
r2_middle_guide = "ACAGCAAACAGGAAGTCACA"
# TODO (20230925): Might need another middle guide for R1 and R2 individually

In [42]:
guide_sequences_present

['CATGCTCTAGTGAAAGCCAG',
 'GCTCTAGTGAAAGCCAGTCT',
 'CTAGTGAAAGCCAGTCTGGG',
 'GTGAAAGCCAGTCTGGGCAG',
 'AAAGCCAGTCTGGGCAGCTG',
 'GCCAGTCTGGGCAGCTGGGT',
 'AGTCTGGGCAGCTGGGTAGC',
 'CTGGGCAGCTGGGTAGCTAA',
 'GGCAGCTGGGTAGCTAATGA',
 'AGCTGGGTAGCTAATGAGGG',
 'TGGGTAGCTAATGAGGGGAT',
 'GCTAATGAGGGGATTAGAGA',
 'AATGAGGGGATTAGAGAGAT',
 'GAGGGGATTAGAGAGATTTT',
 'GGGATTAGAGAGATTTTGTT',
 'ATTAGAGAGATTTTGTTGAA',
 'GAGATTTTGTTGAATGAAAG',
 'ATTTTGTTGAATGAAAGGCA',
 'TTGTTGAATGAAAGGCAGAT',
 'TTGAATGAAAGGCAGATTGA',
 'AATGAAAGGCAGATTGAGTC',
 'GAAAGGCAGATTGAGTCCTG',
 'AGGCAGATTGAGTCCTGCTA',
 'CAGATTGAGTCCTGCTACTC',
 'ATTGAGTCCTGCTACTCGCC',
 'GAGTCCTGCTACTCGCCCCC',
 'TCCTGCTACTCGCCCCCTTC',
 'TGCTACTCGCCCCCTTCATT',
 'TACTCGCCCCCTTCATTCCC',
 'TCGCCCCCTTCATTCCCCTT',
 'CCCCCTTCATTCCCCTTCAT',
 'CCTTCATTCCCCTTCATTCA',
 'TCATTCCCCTTCATTCATGC',
 'TTCCCCTTCATTCATGCCTC',
 'CCCTTCATTCATGCCTCATT',
 'TTCATTCATGCCTCATTCTT',
 'ATTCATGCCTCATTCTTCCG',
 'CATGCCTCATTCTTCCGCCT',
 'GCCTCATTCTTCCGCCTCCC',
 'TCATTCTTCCGCCTCCCAGC',


In [43]:
# Report how many library guides were found in the amplicon
n_present, n_total = len(guide_sequences_present), len(pooled_guide_sequences)
print(f"There are {n_present} guides present in amplicon sequence out of {n_total}")

There are 192 guides present in amplicon sequence out of 206


Prepare filenames

In [45]:
# Prepare R1 and R2 FASTQ files for CRISPResso2Batch settings file
# Directory prefix for the FASTQ paths; the trailing slash matters because it is
# string-concatenated with the bare filenames below
filenames_single_end_dir = "/data/pinello/PROJECTS/2023_08_CD19_manuscript/Core_Data/ABE8e_Pooled_Endogenous_Redo_AND_ABE8e_sg219_Endogenous_AND_EvoCDA_Hits_Endogenous/ProcessedData/BSF_1488_000000000-KJJ4P_L1/"

# Sample name = <experiment>_<population>_<replicate>, e.g. "ABE8e_PAX5sg_high_1"
names = sample_sheet_df["experiment"].str.cat(sample_sheet_df[["population", "replicate"]].astype(str), sep="_")
# Note: this adds a "names" column to sample_sheet_df in place
sample_sheet_df["names"] = names
filenames_R1 = filenames_single_end_dir + sample_sheet_df["R1_FASTQ"] # directory prefix + R1 filename
filenames_R2 = filenames_single_end_dir + sample_sheet_df["R2_FASTQ"] # directory prefix + R2 filename

In [46]:
# Create directory for all the different settings file
settings_output_dir = "./20241004_CRISPResso2Batch_Settings"
# exist_ok=True keeps this cell re-runnable (Restart & Run All) instead of
# raising FileExistsError once the directory has been created
os.makedirs(settings_output_dir, exist_ok=True)

Create settings files for pooled screen

In [47]:
# Collect (position, sample name) pairs for the samples that belong to the pooled screen,
# i.e. names containing "pooled" or "mock_presort_1"
pooled_names_enumeration = [
    (position, sample_name)
    for position, sample_name in enumerate(names)
    if any(tag in sample_name for tag in ("pooled", "mock_presort_1"))
]

In [48]:
pooled_names_enumeration

[]

Create settings files for validation guides

In [50]:
# Module-level accumulators appended to by generate_validation_guide_crispresso_settings:
# guides with no matching samples go to skipped_ids, all others to available_ids
skipped_ids, available_ids = [], []
def _write_crispresso_batch_settings(settings_path, header_columns, sample_rows):
    """Write one tab-separated settings file: a header line, then one line per sample row."""
    with open(settings_path, 'w') as out_handle:
        out_handle.write('\t'.join(header_columns) + '\n')
        for sample_row in sample_rows:
            out_handle.write('\t'.join(str(field) for field in sample_row) + '\n')


def generate_validation_guide_crispresso_settings(validation_guide_row):
    """Write the CRISPRessoBatch settings files for one validation guide.

    Samples are selected by testing whether the row's "crispresso_name" is a
    substring of each entry of the module-level ``names``. If nothing matches,
    the guide name is appended to ``skipped_ids`` and no file is written.
    Otherwise it is appended to ``available_ids`` and three tab-separated
    settings files are written into ``settings_output_dir``:

    * ``..._unmerged_PE.settings``: columns name, fastq_r1, fastq_r2, guide_seq
    * ``..._unmerged_R1.settings``: columns name, fastq_r1, guide_seq (R1 file)
    * ``..._unmerged_R2.settings``: columns name, fastq_r1, guide_seq (R2 file)

    Parameters
    ----------
    validation_guide_row : mapping (e.g. a DataFrame row)
        Must provide "editor" and "sgID" (strings, concatenated into the guide
        name), "crispresso_name" and "guide_sequence".

    Returns
    -------
    None. The guide name and the matched (index, name) pairs are printed.
    """
    validation_name = validation_guide_row["editor"] + "_sg" + validation_guide_row["sgID"]
    crispresso_name = validation_guide_row["crispresso_name"]
    print(validation_name)

    unmerged_PE_settings_fn = 'Davide_CD19_enhancer_BE_screen_{}_unmerged_PE.settings'.format(validation_name)
    unmerged_R1_settings_fn = 'Davide_CD19_enhancer_BE_screen_{}_unmerged_R1.settings'.format(validation_name)
    unmerged_R2_settings_fn = 'Davide_CD19_enhancer_BE_screen_{}_unmerged_R2.settings'.format(validation_name)

    validation_names_enumeration = [(index, name) for index, name in enumerate(names) if crispresso_name in name]
    print(validation_names_enumeration)

    if len(validation_names_enumeration) == 0:
        # No sample belongs to this guide: record it and write nothing
        skipped_ids.append(validation_name)
        return

    available_ids.append(validation_name)
    guide_seq = validation_guide_row["guide_sequence"]
    sample_indices = [sample_index for sample_index, _ in validation_names_enumeration]

    # Un-merged settings (both mates per sample)
    _write_crispresso_batch_settings(
        settings_output_dir + "/" + unmerged_PE_settings_fn,
        ('name', 'fastq_r1', 'fastq_r2', 'guide_seq'),
        [('sample_{}'.format(names[i]), filenames_R1[i], filenames_R2[i], guide_seq) for i in sample_indices],
    )

    # R1 settings
    _write_crispresso_batch_settings(
        settings_output_dir + "/" + unmerged_R1_settings_fn,
        ('name', 'fastq_r1', 'guide_seq'),
        [('sample_{}'.format(names[i]), filenames_R1[i], guide_seq) for i in sample_indices],
    )

    # R2 settings: the R2 file is written under the 'fastq_r1' column
    # (presumably because R2 is analysed on its own as a single-end run - confirm)
    _write_crispresso_batch_settings(
        settings_output_dir + "/" + unmerged_R2_settings_fn,
        ('name', 'fastq_r1', 'guide_seq'),
        [('sample_{}'.format(names[i]), filenames_R2[i], guide_seq) for i in sample_indices],
    )

# Write the settings files for every validation guide (one table row per guide);
# the function prints the guide name and its matched samples as it goes
for _, row in CD19_guide_validation_sequences_df.iterrows():
    generate_validation_guide_crispresso_settings(row)

ABE8e_sg219
[(0, 'ABE8e_PAX5sg_high_1'), (1, 'ABE8e_PAX5sg_high_2'), (2, 'ABE8e_PAX5sg_high_3'), (3, 'ABE8e_PAX5sg_low_1'), (4, 'ABE8e_PAX5sg_low_2'), (5, 'ABE8e_PAX5sg_low_3'), (6, 'ABE8e_PAX5sg_presort_1'), (7, 'ABE8e_PAX5sg_presort_2'), (8, 'ABE8e_PAX5sg_presort_3')]


In [51]:
skipped_ids

[]

In [52]:
available_ids

['ABE8e_sg219']

Run CRISPResso2Batch

In [75]:
# Root directory for CRISPRessoBatch results.
# NOTE(review): the run cell below writes under "./20231005_v1_CRISPResso2Batch_Results_q30/"
# rather than this directory - confirm which one is intended.
root_results_output_dir = "./20231002_v3_CRISPResso2Batch_Results"
# exist_ok=True keeps this cell re-runnable instead of raising FileExistsError
os.makedirs(root_results_output_dir, exist_ok=True)

In [101]:
%%time
for _, validation_guide_row in CD19_guide_validation_sequences_df.iterrows():
    validation_name = validation_guide_row["editor"] + "_sg" + validation_guide_row["sgID"]
    crispresso_name = validation_guide_row["crispresso_name"]
    
    validation_names_enumeration = [(index, name) for index, name in enumerate(names) if crispresso_name in name]
    if len(validation_names_enumeration) > 0:
        print("Processing " + validation_name)
        validate_results_output_dir = "./20231005_v1_CRISPResso2Batch_Results_q30/" + validation_name

        premerged_results_dir = validate_results_output_dir + "/premerged"
        unmerged_PE_results_dir = validate_results_output_dir + "/unmerged_PE"
        unmerged_R1_results_dir = validate_results_output_dir + "/unmerged_R1"
        unmerged_R2_results_dir = validate_results_output_dir + "/unmerged_R2"


        os.makedirs(premerged_results_dir)
        os.makedirs(unmerged_PE_results_dir)

        os.makedirs(unmerged_R1_results_dir)
        os.makedirs(unmerged_R2_results_dir)


        premerged_settings_fn = "Davide_CD19_enhancer_BE_screen_{}_premerged.settings".format(validation_name)
        unmerged_PE_settings_fn = 'Davide_CD19_enhancer_BE_screen_{}_unmerged_PE.settings'.format(validation_name)
        unmerged_R1_settings_fn = 'Davide_CD19_enhancer_BE_screen_{}_unmerged_R1.settings'.format(validation_name)
        unmerged_R2_settings_fn = 'Davide_CD19_enhancer_BE_screen_{}_unmerged_R2.settings'.format(validation_name)


        #premerged
        #ABE8e_pooled_premerged_middle
        !CRISPRessoBatch -bs {settings_output_dir + "/" + premerged_settings_fn} -a {pooled_amplicon_sequence} \
        -an cd19 -q 30 \
        --exclude_bp_from_left 3 \
        --exclude_bp_from_right 3 --no_rerun -n {validation_name + "_premerged"} \
        --min_frequency_alleles_around_cut_to_plot 0.001 --max_rows_alleles_around_cut_to_plot 500 -p 20  \
        --plot_window_size 20 --base_editor_output -w 0 -bo {premerged_results_dir}


        #ABE8e_pooled_unmerged_PE_middle
        !CRISPRessoBatch -bs {settings_output_dir + "/" + unmerged_PE_settings_fn} -a {pooled_amplicon_sequence} \
        -an cd19 -q 30 \
        --exclude_bp_from_left 3 \
        --exclude_bp_from_right 3 --no_rerun -n {validation_name + "_unmerged_PE"} \
        --min_frequency_alleles_around_cut_to_plot 0.001 --max_rows_alleles_around_cut_to_plot 500 -p 64  \
        --plot_window_size 20 --base_editor_output -w 0 -bo {unmerged_PE_results_dir}


        #ABE8e_pooled_unmerged_R1_middle
        !CRISPRessoBatch -bs {settings_output_dir + "/" + unmerged_R1_settings_fn} -a {pooled_amplicon_sequence[:250]} \
        -an cd19 -q 30 \
        --exclude_bp_from_left 3 \
        --exclude_bp_from_right 3 --no_rerun -n {validation_name + "_unmerged_R1"} \
        --min_frequency_alleles_around_cut_to_plot 0.001 --max_rows_alleles_around_cut_to_plot 500 -p 64  \
        --plot_window_size 20 --base_editor_output -w 0 -bo {unmerged_R1_results_dir}

        #ABE8e_pooled_unmerged_R2_middle
        !CRISPRessoBatch -bs {settings_output_dir + "/" + unmerged_R2_settings_fn} -a {pooled_amplicon_sequence[-250:]} \
        -an cd19 -q 30 \
        --exclude_bp_from_left 3 \
        --exclude_bp_from_right 3 --no_rerun -n {validation_name + "_unmerged_R2"} \
        --min_frequency_alleles_around_cut_to_plot 0.001 --max_rows_alleles_around_cut_to_plot 500 -p 64  \
        --plot_window_size 20 --base_editor_output -w 0 -bo {unmerged_R2_results_dir}

Processing ABE8e_sg219

                             ~~~CRISPRessoBatch~~~                              
       -Analysis of CRISPR/Cas9 outcomes from batch deep sequencing data-       
                                                                                
                 _                                             _                
                '  )                                          '  )              
                .-'            _________________              .-'               
               (____          | __    ___ __    |            (____              
            C)|     \         ||__) /\ | /  |__||         C)|     \             
              \     /         ||__)/--\| \__|  ||           \     /             
               \___/          |_________________|            \___/              

                           [CRISPResso version 2.1.3]                           
[Note that starting in version 2.1.0 insertion quantification has been changed
to on