# Encode ABE8e and EvoCDA Crispresso2 Alleles

### Install manuscript version of crispr-millipede

See [PyPI](https://pypi.org/project/crispr-millipede/) for the latest version.

In [105]:
# Install CRISPR-Millipede
reinstall = True
if reinstall:    
    version = "0.1.89"
    !pip install --upgrade --no-cache-dir crispr-millipede==$version

Collecting crispr-millipede==0.1.89
  Downloading crispr_millipede-0.1.89-py3-none-any.whl.metadata (1.0 kB)
Downloading crispr_millipede-0.1.89-py3-none-any.whl (39 kB)
Installing collected packages: crispr-millipede
  Attempting uninstall: crispr-millipede
    Found existing installation: crispr-millipede 0.1.88
    Uninstalling crispr-millipede-0.1.88:
      Successfully uninstalled crispr-millipede-0.1.88
Successfully installed crispr-millipede-0.1.89


### Download example data:

You can use the output of the previous steps, `notebooks/STEP1_ABE8e_CRISPResso2_Demo.ipynb` and `notebooks/STEP2_EvoCDA_CRISPResso2_Demo.ipynb`.

Alternatively, you can use the pre-generated output from [Zenodo](https://doi.org/10.5281/zenodo.13737880), found at the file paths `CRISPR-CLEAR-data/data/crispresso_output/ABE8e_CRISPResso2_output.zip` and `CRISPR-CLEAR-data/data/crispresso_output/EvoCDA_CRISPResso2_output.zip`. Unzip both files.

### Import packages

In [1]:
%load_ext autoreload
%autoreload 2

from crispr_millipede import encoding as cme

In [2]:
import pandas as pd
import numpy as np

from Bio import SeqIO
from Bio.Seq import Seq

### Perform encoding

In [3]:
# Set the amplicon for encoding - UPDATE THIS WITH YOUR AMPLICON
amplicon = "ACATGCTCTAGTGAAAGCCAGTCTGGGCAGCTGGGTAGCTAATGAGGGGATTAGAGAGATTTTGTTGAATGAAAGGCAGATTGAGTCCTGCTACTCGCCCCCTTCATTCCCCTTCATTCATGCCTCATTCTTCCGCCTCCCAGCCGCCTCAACTGGCCAAAGGGAAGTGGAGGCCCTGCCACCTGTAGGGAGGGTCCCCTGGGGCTTGCCCACAGCAAACAGGAAGTCACAGCCTGGTGAGATGGGCCTGGGAATCAGCCACTGAGAAAGTGGGTCTCTTGGGTCCCTGAATTCTTTTTCTGAGTCCCTGCAGCAGTGAAAAAGACACAGAGGCACATAGAGAGTG"

# The guides are located in the amplicon further down with a case-sensitive substring
# search, so a lowercase or otherwise unexpected character would silently drop guides.
assert set(amplicon) <= set("ACGT"), "amplicon must contain only uppercase A/C/G/T characters"
print(f"Length of amplicon {len(amplicon)}")

Length of amplicon 346


### Prepare encoding parameters

**Prepare guide edit positions for denoising**

We can set an option to denoise only edits within the expected editing window

In [4]:
# Location of the guide library table
core_data_dir = "/data/pinello/PROJECTS/2023_08_CD19_manuscript/Core_Data/"
CD19_guide_library_fn = f"{core_data_dir}CD19_sgRNA_count_libraries_DS.txt"

# Read the CD19 library and strip stray whitespace from the column headers
CD19_guide_library_raw_df = pd.read_table(CD19_guide_library_fn, encoding='utf-8').rename(columns=str.strip)

In [5]:
# Prepare dataframe for CRISPR-SURF input.
# Guides without a start coordinate are labelled "negative_control", all others "observation".
guide_lacks_start = CD19_guide_library_raw_df["start"].isna()
surf_input_columns = {
    "Chr": CD19_guide_library_raw_df["chromosome_#"],
    "Start": CD19_guide_library_raw_df["start"],
    "Stop": CD19_guide_library_raw_df["end"],
    "sgRNA_Sequence": CD19_guide_library_raw_df["sgRNA"],
    "Strand": CD19_guide_library_raw_df["strand"],
    "sgRNA_Type_ABE": np.where(guide_lacks_start, "negative_control", "observation"),
    "sgRNA_Type_CBE": np.where(guide_lacks_start, "negative_control", "observation"),
}
CD19_guide_library_SURFinput_df = pd.DataFrame(surf_input_columns)

# Swap commas in the chromosome label for underscores, then write the table as CSV
CD19_guide_library_SURFinput_df["Chr"] = CD19_guide_library_SURFinput_df["Chr"].str.replace(",","_")
CD19_guide_library_SURFinput_df.to_csv("./CD19_guide_library_SURFinput.csv", index=False)

In [6]:
# Add a "Cutsite" column to a copy of the SURF input table:
# Start + 6 for "+" strand guides, Start + 14 for every other strand value.
guide_on_plus_strand = CD19_guide_library_SURFinput_df["Strand"] == "+"
guide_genomic_start = CD19_guide_library_SURFinput_df["Start"]
CD19_guide_library_SURFinput_df_cutsite = CD19_guide_library_SURFinput_df.assign(
    Cutsite=np.where(guide_on_plus_strand, guide_genomic_start + 6, guide_genomic_start + 14)
)

In [7]:
# Get relative positions of all edit sites
complete_amplicon_sequence = Seq(amplicon)

def get_start_coordinate(row, edit_offset=6, pam_length=3):
    """Locate a guide's protospacer in the amplicon and derive its expected edit site.

    The guide (``row["sgRNA_Sequence"]``) is first searched on the amplicon as given;
    if it is absent, its reverse complement is searched instead.

    Parameters
    ----------
    row : pandas.Series
        One row of the guide table; must provide ``"sgRNA_Sequence"`` and ``"Strand"``.
    edit_offset : int, default 6
        Distance of the expected edit site from the protospacer start, counted in the
        guide's own 5'->3' direction.
    pam_length : int, default 3
        Number of bases directly following the protospacer that are reported as the PAM.

    Returns
    -------
    pandas.Series
        ``start_coordinate_amplicon``, ``end_coordinate_amplicon``,
        ``editsite_coordinate_amplicon`` and ``PAM``. All four are ``None`` when the
        guide is not found in the amplicon (or is not a string). For a reverse-strand
        guide the "start" is the larger coordinate and the "end" is the smaller one.
        ``PAM`` is ``None`` when the PAM would extend beyond the amplicon boundaries.

    Raises
    ------
    AssertionError
        If the strand on which the guide was found disagrees with ``row["Strand"]``.
    """
    result_index = ["start_coordinate_amplicon", "end_coordinate_amplicon", "editsite_coordinate_amplicon", "PAM"]
    not_found = pd.Series([None, None, None, None], index=result_index)

    guide = row["sgRNA_Sequence"]
    # Missing guide sequences (e.g. NaN) cannot be searched for; report them as not found.
    if not isinstance(guide, str) or not guide:
        return not_found
    # Use the real protospacer length rather than assuming 20 nt.
    guide_length = len(guide)

    forward_index = complete_amplicon_sequence.find(guide)
    if forward_index != -1:
        assert row["Strand"].strip() == "+", f"Guide {guide} matches the amplicon as given but is annotated as strand {row['Strand']!r}"
        start_coordinate = forward_index
        end_coordinate = forward_index + guide_length
        editsite_coordinate = start_coordinate + edit_offset
        pam_end = end_coordinate + pam_length
        # Only report a PAM that lies completely inside the amplicon.
        if pam_end <= len(complete_amplicon_sequence):
            PAM = str(complete_amplicon_sequence[end_coordinate:pam_end])
        else:
            PAM = None
        return pd.Series([start_coordinate, end_coordinate, editsite_coordinate, PAM], index=result_index)

    reverse_index = complete_amplicon_sequence.find(Seq(guide).reverse_complement())
    if reverse_index == -1:
        return not_found

    assert row["Strand"].strip() == "-", f"Guide {guide} matches the amplicon reverse complement but is annotated as strand {row['Strand']!r}"
    # On the reverse strand the protospacer runs right-to-left along the amplicon.
    end_coordinate = reverse_index
    start_coordinate = reverse_index + guide_length
    editsite_coordinate = start_coordinate - edit_offset
    pam_start = end_coordinate - pam_length
    # A negative slice start would wrap around to the amplicon end and give a wrong PAM.
    if pam_start >= 0:
        PAM = str(complete_amplicon_sequence[pam_start:end_coordinate].reverse_complement())
    else:
        PAM = None
    return pd.Series([start_coordinate, end_coordinate, editsite_coordinate, PAM], index=result_index)
# Locate every guide in the amplicon (one row of coordinates per guide)
amplicon_coordinates_df = CD19_guide_library_SURFinput_df_cutsite.apply(get_start_coordinate, axis=1)

# Attach the coordinates to the guide table, then keep only guides found in the amplicon
CD19_guide_library_SURFinput_df_cutsite_with_amplicon_coordinates = pd.concat([CD19_guide_library_SURFinput_df_cutsite, amplicon_coordinates_df], axis=1)
guide_found_in_amplicon = CD19_guide_library_SURFinput_df_cutsite_with_amplicon_coordinates["start_coordinate_amplicon"].notna()
CD19_guide_library_SURFinput_df_cutsite_with_amplicon_coordinates = CD19_guide_library_SURFinput_df_cutsite_with_amplicon_coordinates.loc[guide_found_in_amplicon]

# ABE8e: edit sites of all located guides; EvoCDA: only guides whose PAM ends in "GG"
guides_in_amplicon = CD19_guide_library_SURFinput_df_cutsite_with_amplicon_coordinates
guide_has_GG_PAM = guides_in_amplicon["PAM"].str[1:] == "GG"
ABE8e_final_editsites = list(np.sort(guides_in_amplicon["editsite_coordinate_amplicon"].astype(int).unique()))
EvoCDA_final_editsites = list(np.sort(guides_in_amplicon.loc[guide_has_GG_PAM, "editsite_coordinate_amplicon"].astype(int).unique()))
print(ABE8e_final_editsites)
print(EvoCDA_final_editsites)

[7, 10, 13, 15, 16, 18, 19, 21, 22, 24, 25, 28, 30, 31, 34, 37, 39, 42, 43, 45, 46, 48, 49, 51, 52, 54, 55, 57, 60, 61, 64, 66, 67, 69, 70, 72, 73, 75, 76, 78, 79, 81, 82, 84, 85, 88, 90, 91, 93, 94, 96, 97, 100, 102, 103, 106, 108, 109, 111, 112, 114, 115, 117, 118, 120, 121, 123, 124, 126, 127, 129, 130, 132, 133, 135, 136, 138, 139, 141, 142, 144, 145, 147, 148, 150, 153, 154, 156, 157, 160, 162, 163, 165, 166, 168, 169, 171, 172, 174, 175, 177, 180, 181, 183, 184, 186, 187, 189, 190, 192, 193, 195, 196, 198, 199, 201, 202, 204, 205, 208, 210, 213, 214, 216, 217, 219, 222, 223, 225, 226, 228, 229, 231, 232, 234, 235, 237, 238, 240, 241, 243, 246, 247, 249, 250, 252, 253, 255, 259, 261, 262, 265, 267, 268, 270, 271, 273, 276, 277, 279, 280, 282, 283, 285, 286, 288, 289, 291, 292, 294, 295, 297, 298, 300, 303, 307, 309, 310, 312, 313, 315, 318, 319, 321, 322, 324, 325, 327, 328, 330, 331, 333]
[10, 31, 114, 117, 126, 139, 156, 172, 187, 195, 198, 213, 225, 229, 235, 249, 265]


### Prepare ABE8e encoding parameters

- Set the suffixes to match your sample populations.
- Trimming the amplicon boundaries is recommended (we trimmed 20 nt on each side because sequencing background is high in the unmerged amplicon regions). The trim sizes can also be chosen based on FastQC profiles, or the arguments can be removed completely for no trimming.
- Optionally set `guide_edit_positions` to all positions where editing is predicted to take place, and set the editing window half-size (editing is expected at `position ± halfsize`). Only variants inside these windows will be considered for modelling.
- Optionally set the expected editing substitutions in `variant_types`. Only these substitution types will be considered for modelling.
- Optionally (and suggested) remove any variants or positions that were filtered out based on the specifications above.

In [8]:
# Set directory of ABE8e CRISPResso2 output directory
root_dir = "/data/pinello/PROJECTS/2023_08_CD19_manuscript/Core_Processing_Analysis/Analysis/ABE8e_Pooled_Endogenous_Redo_SecondAttempt_No_Contaminant_CRISPResso2_Analysis/20240606_CRISPResso2Batch_Results/ABE8e_pooled/unmerged_PE_middle/CRISPRessoBatch_on_ABE8e_pooled_unmerged_PE_middle/"

# Replicate numbers of the sorted/presort samples - UPDATE IF YOUR REPLICATE COUNT DIFFERS
replicate_ids = [1, 2, 3]

# Set the encoding specification - UPDATE BASED ON YOUR OWN SPECIFICATIONS
# NOTE: `root_dir`, `encoding_parameters_denoised_removed` and
# `encoding_dataframes_denoised_removed` are assigned again in the EvoCDA section below,
# so run the ABE8e save cells before running the EvoCDA cells.
encoding_parameters_denoised_removed = cme.EncodingParameters(complete_amplicon_sequence=amplicon,
                            population_baseline_suffix="_CD19minus",
                            population_target_suffix="_CD19plus",
                            population_presort_suffix="_presort",
                            wt_suffix="_wt",
                            trim_left=20,
                            trim_right=20,
                            guide_edit_positions=ABE8e_final_editsites,
                            guide_window_halfsize=3,
                            variant_types=[("A", "G"), ("T", "C")],
                            remove_denoised=True)

# Load in the CRISPResso2 outputs - UPDATE FILEPATHS
# One allele frequency table per replicate: "low" samples are the baseline population,
# "high" samples the target population; a single mock presort sample serves as wild type.
encoding_dataframes_denoised_removed = cme.EncodingDataFrames(
    encoding_parameters=encoding_parameters_denoised_removed,
    reference_sequence=encoding_parameters_denoised_removed.complete_amplicon_sequence,
    population_baseline_filepaths=[f"{root_dir}CRISPResso_on_sample_ABE8e_pooled_low_{rep}/Alleles_frequency_table.zip" for rep in replicate_ids],
    population_target_filepaths=[f"{root_dir}CRISPResso_on_sample_ABE8e_pooled_high_{rep}/Alleles_frequency_table.zip" for rep in replicate_ids],
    population_presort_filepaths=[f"{root_dir}CRISPResso_on_sample_ABE8e_pooled_presort_{rep}/Alleles_frequency_table.zip" for rep in replicate_ids],
    wt_filepaths=[root_dir + "CRISPResso_on_sample_mock_presort_1/Alleles_frequency_table.zip"])


In [9]:
%%time
# Run the encoding! Highly recommended to parallelize by setting the `cores` argument
# The recorded run below took about 1 h of wall time with 60 workers
# The three steps must run in this order: read -> encode -> postprocess
print("Reading tables")
encoding_dataframes_denoised_removed.read_crispresso_allele_tables()
print("Encoding tables")
encoding_dataframes_denoised_removed.encode_crispresso_allele_table(progress_bar=True, cores=60)
print("Postprocessing tables")
encoding_dataframes_denoised_removed.postprocess_encoding()

Reading tables
Encoding tables
INFO: Pandarallel will run on 60 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Encoding population_baseline_df


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=938), Label(value='0 / 938'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=924), Label(value='0 / 924'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=730), Label(value='0 / 730'))), HB…

Encoding population_target_df


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=744), Label(value='0 / 744'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=847), Label(value='0 / 847'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=801), Label(value='0 / 801'))), HB…

Encoding population_presort_df


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=678), Label(value='0 / 678'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=727), Label(value='0 / 727'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=631), Label(value='0 / 631'))), HB…

Encoding population_wt_df


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=459), Label(value='0 / 459'))), HB…

Postprocessing tables
Trimming encodings with trim_left=20 and trim_right=20
Processing encoding columns
Adding read column
Performing denoising
Denoising with positions [7, 10, 13, 15, 16, 18, 19, 21, 22, 24, 25, 28, 30, 31, 34, 37, 39, 42, 43, 45, 46, 48, 49, 51, 52, 54, 55, 57, 60, 61, 64, 66, 67, 69, 70, 72, 73, 75, 76, 78, 79, 81, 82, 84, 85, 88, 90, 91, 93, 94, 96, 97, 100, 102, 103, 106, 108, 109, 111, 112, 114, 115, 117, 118, 120, 121, 123, 124, 126, 127, 129, 130, 132, 133, 135, 136, 138, 139, 141, 142, 144, 145, 147, 148, 150, 153, 154, 156, 157, 160, 162, 163, 165, 166, 168, 169, 171, 172, 174, 175, 177, 180, 181, 183, 184, 186, 187, 189, 190, 192, 193, 195, 196, 198, 199, 201, 202, 204, 205, 208, 210, 213, 214, 216, 217, 219, 222, 223, 225, 226, 228, 229, 231, 232, 234, 235, 237, 238, 240, 241, 243, 246, 247, 249, 250, 252, 253, 255, 259, 261, 262, 265, 267, 268, 270, 271, 273, 276, 277, 279, 280, 282, 283, 285, 286, 288, 289, 291, 292, 294, 295, 297, 298, 300, 303, 307, 30

  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.appe

CPU times: user 3min 53s, sys: 24min 17s, total: 28min 11s
Wall time: 1h 1min 19s


In [115]:
import pickle

# Cache the ABE8e encoding result so the long encoding step does not have to be repeated.
#   rerun = True  -> save the object that was just computed
#   rerun = False -> load the object saved by a previous run (from the same file)
# NOTE: only unpickle files you created yourself - `pickle.load` can execute arbitrary code.
rerun = True
ABE8e_encoding_pickle_fn = '20240816_v0_1_84_encoding_dataframes_denoised_removed.pkl'
if rerun:
    with open(ABE8e_encoding_pickle_fn, 'wb') as pickle_handle:
        pickle.dump(encoding_dataframes_denoised_removed, pickle_handle)
else:
    with open(ABE8e_encoding_pickle_fn, 'rb') as pickle_handle:
        encoding_dataframes_denoised_removed = pickle.load(pickle_handle)

In [17]:
# Save the encoding tables for loading into the modelling step.
# TSV tables: the merged encodings and the wild-type encodings, one file per replicate
# (`{}` in the filename is the replicate placeholder).
cme.save_encodings(
    encoding_dataframes_denoised_removed.encodings_collapsed_merged,
    sort_column="#Reads_presort",
    filename="./20240807_v0_1_84_encoding_dataframes_denoised_removed_ABE8e_encodings_rep{}.tsv",
)
cme.save_encodings(
    encoding_dataframes_denoised_removed.population_wt_encoding_processed,
    sort_column="#Reads_wt",
    filename="./20240807_v0_1_84_encoding_dataframes_denoised_removed_wt_encodings_rep{}.tsv",
)

# Pickled per-population encodings, written in the order baseline, target, presort, wild type
for population_encodings, pickle_filename in [
    (encoding_dataframes_denoised_removed.population_baseline_encoding_processed, "./20240807_v0_1_84_encoding_dataframes_denoised_removed_CD19minus_ABE8e_encodings_rep{}.pkl"),
    (encoding_dataframes_denoised_removed.population_target_encoding_processed, "./20240807_v0_1_84_encoding_dataframes_denoised_removed_CD19plus_ABE8e_encodings_rep{}.pkl"),
    (encoding_dataframes_denoised_removed.population_presort_encoding_processed, "./20240807_v0_1_84_encoding_dataframes_denoised_removed_presort_ABE8e_encodings_rep{}.pkl"),
    (encoding_dataframes_denoised_removed.population_wt_encoding_processed, "./20240807_v0_1_84_encoding_dataframes_denoised_removed_wt_encodings_rep{}.pkl"),
]:
    cme.save_encodings_df(population_encodings, filename=pickle_filename)

### Prepare EvoCDA encoding parameters

In [9]:
# Set directory of EvoCDA CRISPResso2 output directory
# NOTE: from here on `root_dir`, `encoding_parameters_denoised_removed` and
# `encoding_dataframes_denoised_removed` refer to the EvoCDA screen, replacing the
# ABE8e objects of the same name defined earlier in this notebook.
root_dir = "/data/pinello/PROJECTS/2023_08_CD19_manuscript/Core_Processing_Analysis/Analysis/2024_01_EC_Crispresso_Millipede_Reporting_copy/20240122_CRISPResso2Batch_Results/evoCDA_pooled/unmerged_PE_middle/CRISPRessoBatch_on_evoCDA_pooled_unmerged_PE_middle/"

# Replicate numbers of the sorted/presort samples - UPDATE IF YOUR REPLICATE COUNT DIFFERS
replicate_ids = [1, 2, 3]

# Set the encoding specification - UPDATE BASED ON YOUR OWN SPECIFICATIONS
# `amplicon` is the same sequence the EvoCDA edit sites were computed against, so it is
# reused here instead of repeating the sequence literal.
encoding_parameters_denoised_removed = cme.EncodingParameters(complete_amplicon_sequence=amplicon,
                            population_baseline_suffix="_CD19minus",
                            population_target_suffix="_CD19plus",
                            population_presort_suffix="_presort",
                            wt_suffix="_wt",
                            trim_left=20,
                            trim_right=20,
                            guide_edit_positions=EvoCDA_final_editsites,
                            guide_window_halfsize=7,
                            variant_types=[("C", "T"), ("G", "A")],
                            remove_denoised=True)

# Load in the CRISPResso2 outputs - UPDATE FILEPATHS
# One allele frequency table per replicate: "low" samples are the baseline population,
# "high" samples the target population; a single mock presort sample serves as wild type.
encoding_dataframes_denoised_removed = cme.EncodingDataFrames(
    encoding_parameters=encoding_parameters_denoised_removed,
    reference_sequence=encoding_parameters_denoised_removed.complete_amplicon_sequence,
    population_baseline_filepaths=[f"{root_dir}CRISPResso_on_sample_evoCDA_pooled_low_{rep}/Alleles_frequency_table.zip" for rep in replicate_ids],
    population_target_filepaths=[f"{root_dir}CRISPResso_on_sample_evoCDA_pooled_high_{rep}/Alleles_frequency_table.zip" for rep in replicate_ids],
    population_presort_filepaths=[f"{root_dir}CRISPResso_on_sample_evoCDA_pooled_presort_{rep}/Alleles_frequency_table.zip" for rep in replicate_ids],
    wt_filepaths=[root_dir + "CRISPResso_on_sample_mock_presort_1/Alleles_frequency_table.zip"])


In [168]:
# EvoCDA encoding parameters without trimming and without guide-position filtering:
# only the substitution types are restricted.
# NOTE(review): this object is not referenced by any other cell visible in this notebook;
# presumably kept for an untrimmed comparison - confirm or remove.
encoding_parameters_denoised_nonposition_nontrimmed_removed = cme.EncodingParameters(complete_amplicon_sequence=amplicon,
                            population_baseline_suffix="_CD19minus",
                            population_target_suffix="_CD19plus",
                            population_presort_suffix="_presort",
                            wt_suffix="_wt",
                            variant_types=[("C", "T"), ("G", "A")],
                            remove_denoised=True)

In [10]:
%%time
# Run the EvoCDA encoding; parallelized through the `cores` argument
# The recorded run below took about 8 min of wall time with 60 workers
# The three steps must run in this order: read -> encode -> postprocess
print("Reading tables")
encoding_dataframes_denoised_removed.read_crispresso_allele_tables()
print("Encoding tables")
encoding_dataframes_denoised_removed.encode_crispresso_allele_table(progress_bar=True, cores=60)
print("Postprocessing tables")
encoding_dataframes_denoised_removed.postprocess_encoding()

Reading tables
Encoding tables
INFO: Pandarallel will run on 60 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Encoding population_baseline_df


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=166), Label(value='0 / 166'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=137), Label(value='0 / 137'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=145), Label(value='0 / 145'))), HB…

Encoding population_target_df


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=88), Label(value='0 / 88'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=123), Label(value='0 / 123'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=120), Label(value='0 / 120'))), HB…

Encoding population_presort_df


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=109), Label(value='0 / 109'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=82), Label(value='0 / 82'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

Encoding population_wt_df


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=56), Label(value='0 / 56'))), HBox…

Postprocessing tables
Trimming encodings with trim_left=20 and trim_right=20
Processing encoding columns
Adding read column
Performing denoising
Denoising with positions [10, 31, 114, 117, 126, 139, 156, 172, 187, 195, 198, 213, 225, 229, 235, 249, 265] and variant types [('C', 'T'), ('G', 'A')]
Filtering by editable positions
645 non-editable positions
['20G>A', '20G>C', '20G>T', '20G>N', '20G>-', '21T>A', '21T>C', '21T>G', '21T>N', '21T>-', '22C>A', '22C>T', '22C>G', '22C>N', '22C>-', '23T>A', '23T>C', '23T>G', '23T>N', '23T>-', '39T>A', '39T>C', '39T>G', '39T>N', '39T>-', '40A>C', '40A>T', '40A>G', '40A>N', '40A>-', '41A>C', '41A>T', '41A>G', '41A>N', '41A>-', '42T>A', '42T>C', '42T>G', '42T>N', '42T>-', '43G>A', '43G>C', '43G>T', '43G>N', '43G>-', '44A>C', '44A>T', '44A>G', '44A>N', '44A>-', '45G>A', '45G>C', '45G>T', '45G>N', '45G>-', '46G>A', '46G>C', '46G>T', '46G>N', '46G>-', '47G>A', '47G>C', '47G>T', '47G>N', '47G>-', '48G>A', '48G>C', '48G>T', '48G>N', '48G>-', '49A>C', '49A

  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.append(encoded_df_rep.groupby(feature_colnames, as_index=True).sum().reset_index())
  encoded_dfs_collapsed.appe

CPU times: user 46.5 s, sys: 4min 35s, total: 5min 22s
Wall time: 8min 19s


In [15]:
import pickle

# rerun = True  -> pickle the EvoCDA encoding object currently in memory
# rerun = False -> restore it from the pickle written by an earlier run
rerun = True
EvoCDA_encoding_pickle_fn = '20240816_v0_1_84_EvoCDA_encoding_dataframes_denoised_removed.pkl'
if not rerun:
    with open(EvoCDA_encoding_pickle_fn, 'rb') as pickle_handle:
        encoding_dataframes_denoised_removed = pickle.load(pickle_handle)
else:
    with open(EvoCDA_encoding_pickle_fn, 'wb') as pickle_handle:
        pickle.dump(encoding_dataframes_denoised_removed, pickle_handle)

In [None]:
# Save the EvoCDA encoding tables for loading into the modelling step.
# NOTE(review): several filenames below contain "ABE8e" although this section encodes the
# EvoCDA screen - looks like a copy-paste leftover. The names are kept unchanged because
# later steps may load exactly these paths; confirm before renaming.
# TSV tables: the merged encodings and the wild-type encodings, one file per replicate
# (`{}` in the filename is the replicate placeholder).
cme.save_encodings(
    encoding_dataframes_denoised_removed.encodings_collapsed_merged,
    sort_column="#Reads_presort",
    filename="./20240816_v0_1_84_EvoCDA_encoding_dataframes_denoised_removed_ABE8e_encodings_rep{}.tsv",
)
cme.save_encodings(
    encoding_dataframes_denoised_removed.population_wt_encoding_processed,
    sort_column="#Reads_wt",
    filename="./20240816_v0_1_84_EvoCDA_encoding_dataframes_denoised_removed_wt_encodings_rep{}.tsv",
)

# Pickled per-population encodings, written in the order baseline, target, presort, wild type
for population_encodings, pickle_filename in [
    (encoding_dataframes_denoised_removed.population_baseline_encoding_processed, "./20240816_v0_1_84_EvoCDA_encoding_dataframes_denoised_removed_CD19minus_ABE8e_encodings_rep{}.pkl"),
    (encoding_dataframes_denoised_removed.population_target_encoding_processed, "./20240816_v0_1_84_EvoCDA_encoding_dataframes_denoised_removed_CD19plus_ABE8e_encodings_rep{}.pkl"),
    (encoding_dataframes_denoised_removed.population_presort_encoding_processed, "./20240816_v0_1_84_EvoCDA_encoding_dataframes_denoised_removed_presort_ABE8e_encodings_rep{}.pkl"),
    (encoding_dataframes_denoised_removed.population_wt_encoding_processed, "./20240816_v0_1_84_EvoCDA_encoding_dataframes_denoised_removed_wt_encodings_rep{}.pkl"),
]:
    cme.save_encodings_df(population_encodings, filename=pickle_filename)