# Encode sg218 Crispresso2 Alleles

See the notebook `STEP2_ABE8e_EvoCDA_Encoding_Demo.ipynb` for a full description of the instructions.

### Install manuscript version of crispr-millipede

### Download example data:
You can use the output of the previous step, `STEP1_sg218_Crispresso2_Demo.ipynb`.

Alternatively, you can use the pre-generated output from [Zenodo](https://doi.org/10.5281/zenodo.13737880) at the file path `CRISPR-CLEAR-data/data/crispresso_output/sg218_CRISPResso2_output.zip`. Unzip the file before use.

You will also need the WT file from Zenodo, which was generated in the STEP1_ABE8e_Crispresso2 notebook:
`CRISPR-CLEAR-data/data/crispresso_output/wt_premerged_sample/Alleles_frequency_table.zip`

### Imports

In [17]:

from crispr_millipede import encoding as cme

In [18]:
import pandas as pd
import numpy as np

from Bio import SeqIO
from Bio.Seq import Seq

### Perform encoding

In [19]:
# Set the amplicon for encoding - UPDATE THIS WITH YOUR AMPLICON
# This string is handed to cme.EncodingParameters as `complete_amplicon_sequence`
# and reused as the `reference_sequence` of cme.EncodingDataFrames further down.
# The sgRNA protospacer is located inside it by exact string match, so the guide
# sequence must occur in this amplicon exactly as written.
amplicon_sequence = "ACATGCTCTAGTGAAAGCCAGTCTGGGCAGCTGGGTAGCTAATGAGGGGATTAGAGAGATTTTGTTGAATGAAAGGCAGATTGAGTCCTGCTACTCGCCCCCTTCATTCCCCTTCATTCATGCCTCATTCTTCCGCCTCCCAGCCGCCTCAACTGGCCAAAGGGAAGTGGAGGCCCTGCCACCTGTAGGGAGGGTCCCCTGGGGCTTGCCCACAGCAAACAGGAAGTCACAGCCTGGTGAGATGGGCCTGGGAATCAGCCACTGAGAAAGTGGGTCTCTTGGGTCCCTGAATTCTTTTTCTGAGTCCCTGCAGCAGTGAAAAAGACACAGAGGCACATAGAGAGTG"

### Prepare encoding parameters

In [20]:
# (reference base, alternate base) pairs passed to cme.EncodingParameters as `variant_types`.
# NOTE(review): A>G is presumably the ABE edit itself and T>C the same edit seen on the
# opposite strand - confirm against the crispr-millipede documentation.
ABE_SNPs = [("A", "G"), ("T", "C")]

In [21]:
# sg218 protospacer; it is located inside amplicon_sequence by exact string match.
sg218_sequence = "ACAGGAAGTCACAGCCTGGT"

# Number of bases added to the protospacer start to obtain the recorded guide position.
# NOTE(review): presumably the centre of the editing window - confirm.
SG218_POSITION_OFFSET = 5

# Use find() so a missing guide (e.g. wrong strand or wrong amplicon) produces an
# explicit error message instead of the bare "substring not found" from str.index().
sg218_start = amplicon_sequence.find(sg218_sequence)
if sg218_start < 0:
    raise ValueError(
        "sg218_sequence was not found in amplicon_sequence; "
        "check the guide orientation and the amplicon sequence."
    )

# NOTE(review): sg218_position is not passed to cme.EncodingParameters in this notebook
# (the run log reports "Denoising with positions []") - confirm this is intentional.
sg218_position = [sg218_start + SG218_POSITION_OFFSET]

### Prepare sg218 encoding parameters

In [22]:

# Encoding settings for the sg218 screen, one keyword per line for easy editing.
sg218_encoding_parameters_variantdenoised_removed = cme.EncodingParameters(
    complete_amplicon_sequence=amplicon_sequence,
    # Suffixes labelling each population (presumably appended to the per-population
    # columns, e.g. the "#Reads_presort" / "#Reads_wt" sort columns used when saving).
    population_baseline_suffix="_CD19minus",
    population_target_suffix="_CD19plus",
    population_presort_suffix="_presort",
    wt_suffix="_wt",
    # Variant types considered during denoising/filtering (see ABE_SNPs).
    variant_types=ABE_SNPs,
    # Trim applied to both ends of the encodings (logged during postprocessing).
    trim_left=20,
    trim_right=20,
    # NOTE(review): presumably drops, rather than keeps, the variants removed by
    # denoising - confirm against the crispr-millipede documentation.
    remove_denoised=True,
)



In [33]:
# Directory containing the CRISPResso2 batch output - point this at your own copy.
root_dir = "/20231005_v1_CRISPResso2Batch_Results_q30/ABE8e_sg218/unmerged_PE/CRISPRessoBatch_on_ABE8e_sg218_unmerged_PE/"


def _allele_table_paths(population, replicates=(1, 2, 3)):
    """Return the allele frequency table path of every replicate of `population` under root_dir."""
    return [
        f"{root_dir}CRISPResso_on_sample_ABE8e_PAX5sg_{population}_{replicate}/Alleles_frequency_table.zip"
        for replicate in replicates
    ]


# Baseline = "low" samples, target = "high" samples, plus presort and a single WT table.
sg218_encoding_dataframes_variantdenoised_removed = cme.EncodingDataFrames(
    encoding_parameters=sg218_encoding_parameters_variantdenoised_removed,
    reference_sequence=sg218_encoding_parameters_variantdenoised_removed.complete_amplicon_sequence,
    population_baseline_filepaths=_allele_table_paths("low"),
    population_target_filepaths=_allele_table_paths("high"),
    population_presort_filepaths=_allele_table_paths("presort"),
    # The WT table is read from a path relative to the working directory, not from root_dir.
    wt_filepaths=["CRISPR-CLEAR-data/data/crispresso_output/wt_premerged_sample/Alleles_frequency_table.zip"],
)


In [34]:
import os
import pickle

# Prefix components shared by every output file written by this cell.
version = "20240905_v0_1_89"
name = "sg218_encoding_dataframes_variantdenoised_removed_CRISPResso2Merged_trimmed"

# Use at most 50 worker processes, but never more than the CPUs actually available,
# so the notebook does not oversubscribe smaller machines. os.cpu_count() may return
# None, in which case a single worker is used.
MAX_ENCODING_CORES = 50
encoding_cores = min(MAX_ENCODING_CORES, os.cpu_count() or 1)

# Short alias for the long variable name; both names refer to the same object.
encoding_dataframes = sg218_encoding_dataframes_variantdenoised_removed
output_prefix = f"./{version}_{name}"

# RUN ENCODING
print("Reading tables")
encoding_dataframes.read_crispresso_allele_tables()
print("Encoding tables")
encoding_dataframes.encode_crispresso_allele_table(progress_bar=True, cores=encoding_cores)
print("Postprocessing tables")
encoding_dataframes.postprocess_encoding()


# SAVE PICKLE
# Only unpickle this file from a trusted source: pickle.load can execute arbitrary code.
with open(f"{version}_{name}.pkl", 'wb') as pickle_file:
    pickle.dump(encoding_dataframes, pickle_file)

# SAVE ENCODINGS
# The literal "{}" left in each filename is presumably filled in with the replicate
# index by the save helpers - confirm against the crispr-millipede documentation.
cme.save_encodings(encoding_dataframes.encodings_collapsed_merged, sort_column="#Reads_presort", filename=f"{output_prefix}_ABE8e_encodings_rep{{}}.tsv")
cme.save_encodings(encoding_dataframes.population_wt_encoding_processed, sort_column="#Reads_wt", filename=f"{output_prefix}_wt_encodings_rep{{}}.tsv")
cme.save_encodings_df(encoding_dataframes.population_baseline_encoding_processed, filename=f"{output_prefix}_CD19minus_ABE8e_encodings_rep{{}}.pkl")
cme.save_encodings_df(encoding_dataframes.population_target_encoding_processed, filename=f"{output_prefix}_CD19plus_ABE8e_encodings_rep{{}}.pkl")
cme.save_encodings_df(encoding_dataframes.population_presort_encoding_processed, filename=f"{output_prefix}_presort_ABE8e_encodings_rep{{}}.pkl")
cme.save_encodings_df(encoding_dataframes.population_wt_encoding_processed, filename=f"{output_prefix}_wt_encodings_rep{{}}.pkl")

Reading tables
Encoding tables
INFO: Pandarallel will run on 50 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
Encoding population_baseline_df


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=433), Label(value='0 / 433'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=326), Label(value='0 / 326'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=343), Label(value='0 / 343'))), HB…

Encoding population_target_df


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=450), Label(value='0 / 450'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=419), Label(value='0 / 419'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=392), Label(value='0 / 392'))), HB…

Encoding population_presort_df


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=399), Label(value='0 / 399'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=443), Label(value='0 / 443'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=421), Label(value='0 / 421'))), HB…

Encoding population_wt_df


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=551), Label(value='0 / 551'))), HB…

Postprocessing tables
Trimming encodings with trim_left=20 and trim_right=20
Processing encoding columns
Adding read column
Performing denoising
Denoising with positions [] and variant types [('A', 'G'), ('T', 'C')]
Filtering by variant types
1393 variant types
['20G>A', '20G>C', '20G>T', '20G>N', '20G>-', '21T>A', '21T>G', '21T>N', '21T>-', '22C>A', '22C>T', '22C>G', '22C>N', '22C>-', '23T>A', '23T>G', '23T>N', '23T>-', '24G>A', '24G>C', '24G>T', '24G>N', '24G>-', '25G>A', '25G>C', '25G>T', '25G>N', '25G>-', '26G>A', '26G>C', '26G>T', '26G>N', '26G>-', '27C>A', '27C>T', '27C>G', '27C>N', '27C>-', '28A>C', '28A>T', '28A>N', '28A>-', '29G>A', '29G>C', '29G>T', '29G>N', '29G>-', '30C>A', '30C>T', '30C>G', '30C>N', '30C>-', '31T>A', '31T>G', '31T>N', '31T>-', '32G>A', '32G>C', '32G>T', '32G>N', '32G>-', '33G>A', '33G>C', '33G>T', '33G>N', '33G>-', '34G>A', '34G>C', '34G>T', '34G>N', '34G>-', '35T>A', '35T>G', '35T>N', '35T>-', '36A>C', '36A>T', '36A>N', '36A>-', '37G>A', '37G>C', '37G>T',

  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.appe