# Encode sg218 Crispresso2 Alleles

See the notebook `STEP4_ABE8e_EvoCDA_Encoding_Demo.ipynb` for a full description of the instructions.

### Install manuscript version of crispr-millipede

In [None]:
# Install CRISPR-Millipede
reinstall = True
if reinstall:    
    version = "0.1.84"
    !pip install --upgrade --no-cache-dir crispr-millipede==$version

### Download example data:

You can use the output of the previous step `STEP3_sg219_Crispresso2_Demo.ipynb`.

Alternatively, use the pre-generated output from [Zenodo](https://doi.org/10.5281/zenodo.13737880) at the file path `CRISPR-CLEAR-data/data/crispresso_output/sg218_CRISPResso2_output.zip`, and unzip the file before running the cells below.

### Imports

In [1]:
%load_ext autoreload
%autoreload 2

from crispr_millipede import encoding as cme

In [2]:
import pandas as pd
import numpy as np

from Bio import SeqIO
from Bio.Seq import Seq

### Perform encoding

In [3]:
# Set the amplicon for encoding - UPDATE THIS WITH YOUR AMPLICON
# This sequence is passed to the encoder below as the complete amplicon sequence
# (and, through the encoding parameters, as the reference sequence). The guide
# position is located inside it by exact string match, so the guide sequence
# must occur in this string exactly as written.
amplicon_sequence = "ACATGCTCTAGTGAAAGCCAGTCTGGGCAGCTGGGTAGCTAATGAGGGGATTAGAGAGATTTTGTTGAATGAAAGGCAGATTGAGTCCTGCTACTCGCCCCCTTCATTCCCCTTCATTCATGCCTCATTCTTCCGCCTCCCAGCCGCCTCAACTGGCCAAAGGGAAGTGGAGGCCCTGCCACCTGTAGGGAGGGTCCCCTGGGGCTTGCCCACAGCAAACAGGAAGTCACAGCCTGGTGAGATGGGCCTGGGAATCAGCCACTGAGAAAGTGGGTCTCTTGGGTCCCTGAATTCTTTTTCTGAGTCCCTGCAGCAGTGAAAAAGACACAGAGGCACATAGAGAGTG"

### Prepare encoding parameters

In [8]:
# Base substitutions handed to the encoder as `variant_types`.
# Each pair is presumably (reference base, edited base) - TODO confirm the
# tuple order against the crispr-millipede documentation.
ABE_SNPs = [
    ("A", "G"),
    ("T", "C"),
]

In [9]:
# Guide sequence; it must occur verbatim in `amplicon_sequence`
# (`str.index` raises ValueError otherwise).
sg219_sequence = "ACAGGAAGTCACAGCCTGGT"

# 0-based index of the first base of the guide match within the amplicon.
sg219_start = amplicon_sequence.index(sg219_sequence)

# Single-element list: the position 5 bases past the start of the guide match.
# NOTE(review): presumably the centre of the editing window - confirm the
# intended offset before reusing this for another guide.
sg219_position = [sg219_start + 5]

### Prepare sg218 encoding parameters (the variables below use the `sg219` prefix)

In [24]:
# Encoding configuration shared by all sg219 samples.
# NOTE(review): the exact meaning of each option is defined by crispr-millipede's
# `EncodingParameters`; check the package documentation before changing values.
sg219_encoding_parameters_fulldenoised_removed = cme.EncodingParameters(
    complete_amplicon_sequence=amplicon_sequence,
    # Suffixes given for each population; the same labels appear in the
    # "#Reads_presort" / "#Reads_wt" sort columns and output file names used when saving.
    population_baseline_suffix="_CD19minus",
    population_target_suffix="_CD19plus",
    population_presort_suffix="_presort",
    wt_suffix="_wt",
    # Substitution types and guide position defined in the cells above.
    variant_types=ABE_SNPs,
    guide_edit_positions=sg219_position,
    guide_window_halfsize=5,
    # Trimming and denoising options.
    trim_left=20,
    trim_right=20,
    remove_denoised=True,
)


In [25]:
# Directory holding one CRISPResso2 output folder per sorted/presort sample.
# NOTE(review): absolute path on the original analysis machine - point this at
# your own CRISPResso2 batch output before running.
root_dir = "/data/pinello/PROJECTS/2023_08_CD19_manuscript/Core_Processing_Analysis/Analysis/ABE8e_Pooled_Endogenous_Redo_AND_ABE8e_sg219_Endogenous_AND_EvoCDA_Hits_Endogenous_CRISPREsso2_Analysis/20231005_v1_CRISPResso2Batch_Results_q30/ABE8e_sg219/unmerged_PE/CRISPRessoBatch_on_ABE8e_sg219_unmerged_PE_middle/"

# Allele table of the mock presort sample, supplied as the single wild-type input.
wt_allele_table = "/data/pinello/PROJECTS/2023_08_CD19_manuscript/Core_Processing_Analysis/Analysis/ABE8e_Pooled_Endogenous_Redo_AND_ABE8e_sg219_Endogenous_AND_EvoCDA_Hits_Endogenous_CRISPREsso2_Analysis/20231002_v3_CRISPResso2Batch_Results/ABE8e_pooled/premerged_middle/CRISPRessoBatch_on_ABE8e_pooled_premerged_middle/CRISPResso_on_sample_mock_presort_1/Alleles_frequency_table.zip"


def _replicate_allele_tables(population, replicates=(1, 2, 3)):
    """Return the allele-frequency-table paths of one population, one per replicate.

    `population` is the label used in the sample folder name
    ("low", "high" or "presort"); paths are returned in replicate order.
    """
    return [
        root_dir + f"CRISPResso_on_sample_ABE8e_PAX5sg_{population}_{replicate}/Alleles_frequency_table.zip"
        for replicate in replicates
    ]


# Baseline = "low" samples, target = "high" samples, presort = "presort" samples;
# three replicates each, plus one wild-type table.
sg219_encoding_dataframes_fulldenoised_removed = cme.EncodingDataFrames(
    encoding_parameters=sg219_encoding_parameters_fulldenoised_removed,
    reference_sequence=sg219_encoding_parameters_fulldenoised_removed.complete_amplicon_sequence,
    population_baseline_filepaths=_replicate_allele_tables("low"),
    population_target_filepaths=_replicate_allele_tables("high"),
    population_presort_filepaths=_replicate_allele_tables("presort"),
    wt_filepaths=[wt_allele_table],
)

In [None]:
import os
import pickle

# Tag and base name shared by every output file written by this cell.
# NOTE(review): the tag mentions v0_1_89 while the install cell pins
# crispr-millipede 0.1.84, and the base name says "variantdenoised" while the
# object is named "fulldenoised" - confirm both before relying on the file names.
version = "20240905_v0_1_89"
name = "sg219_encoding_dataframes_variantdenoised_removed_CRISPResso2Merged_trimmed"

# Cap the `cores` argument at the number of CPUs this machine reports, so the
# notebook does not request 50 on a smaller machine. On machines with at least
# 50 CPUs this is the original value.
n_cores = min(50, os.cpu_count() or 1)

# RUN ENCODING
print("Reading tables")
sg219_encoding_dataframes_fulldenoised_removed.read_crispresso_allele_tables()
print("Encoding tables")
sg219_encoding_dataframes_fulldenoised_removed.encode_crispresso_allele_table(progress_bar=True, cores=n_cores)
print("Postprocessing tables")
sg219_encoding_dataframes_fulldenoised_removed.postprocess_encoding()


# SAVE PICKLE
# The whole encoding object is pickled to the current working directory.
with open(f"{version}_{name}.pkl", 'wb') as pickle_file:
    pickle.dump(sg219_encoding_dataframes_fulldenoised_removed, pickle_file)

# SAVE ENCODINGS
# The doubled braces leave a literal "{}" placeholder in each file name template;
# presumably the save helpers fill it with the replicate index - TODO confirm.
cme.save_encodings(sg219_encoding_dataframes_fulldenoised_removed.encodings_collapsed_merged, sort_column="#Reads_presort", filename=f"./{version}_{name}_ABE8e_encodings_rep{{}}.tsv")
cme.save_encodings(sg219_encoding_dataframes_fulldenoised_removed.population_wt_encoding_processed, sort_column="#Reads_wt", filename=f"./{version}_{name}_wt_encodings_rep{{}}.tsv")
cme.save_encodings_df(sg219_encoding_dataframes_fulldenoised_removed.population_baseline_encoding_processed, filename=f"./{version}_{name}_CD19minus_ABE8e_encodings_rep{{}}.pkl")
cme.save_encodings_df(sg219_encoding_dataframes_fulldenoised_removed.population_target_encoding_processed, filename=f"./{version}_{name}_CD19plus_ABE8e_encodings_rep{{}}.pkl")
cme.save_encodings_df(sg219_encoding_dataframes_fulldenoised_removed.population_presort_encoding_processed, filename=f"./{version}_{name}_presort_ABE8e_encodings_rep{{}}.pkl")
cme.save_encodings_df(sg219_encoding_dataframes_fulldenoised_removed.population_wt_encoding_processed, filename=f"./{version}_{name}_wt_encodings_rep{{}}.pkl")