# Run ABE8e Millipede Model

### Prepare Data Input

You only need the encoding output from the previous step. Alternatively, you can download the pre-computed encodings from [Zenodo](https://doi.org/10.5281/zenodo.13737880) at the following paths:
- CRISPR-CLEAR-data/data/encoding_output/20240807_v0_1_84_encoding_dataframes_denoised_removed_ABE8e_encodings_rep0.tsv
- CRISPR-CLEAR-data/data/encoding_output/20240807_v0_1_84_encoding_dataframes_denoised_removed_ABE8e_encodings_rep1.tsv
- CRISPR-CLEAR-data/data/encoding_output/20240807_v0_1_84_encoding_dataframes_denoised_removed_ABE8e_encodings_rep2.tsv

### Import packages

In [2]:
import os

# Choose which GPU this kernel is allowed to see - UPDATE for your machine.
# NOTE(review): "1" presumably selects the second CUDA device (indices are 0-based); on a
# single-GPU host this would likely hide the only GPU - confirm and switch to "0" if needed.
# The variable is set before the crispr_millipede imports below, presumably so that it is
# already in place when any CUDA-backed library initializes - TODO confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from crispr_millipede import encoding as cme
from crispr_millipede import modelling as cmm

import pandas as pd
import numpy as np

from Bio import SeqIO
from Bio.Seq import Seq

import matplotlib.pyplot as plt
import logomaker
import seaborn as sns
import matplotlib.patches as patches

In [3]:
# Reference amplicon sequence for this screen - UPDATE with your own amplicon.
amplicon = "ACATGCTCTAGTGAAAGCCAGTCTGGGCAGCTGGGTAGCTAATGAGGGGATTAGAGAGATTTTGTTGAATGAAAGGCAGATTGAGTCCTGCTACTCGCCCCCTTCATTCCCCTTCATTCATGCCTCATTCTTCCGCCTCCCAGCCGCCTCAACTGGCCAAAGGGAAGTGGAGGCCCTGCCACCTGTAGGGAGGGTCCCCTGGGGCTTGCCCACAGCAAACAGGAAGTCACAGCCTGGTGAGATGGGCCTGGGAATCAGCCACTGAGAAAGTGGGTCTCTTGGGTCCCTGAATTCTTTTTCTGAGTCCCTGCAGCAGTGAAAAAGACACAGAGGCACATAGAGAGTG"

# Report the length as a quick sanity check that the sequence was pasted in full.
amplicon_length = len(amplicon)
print(f"Length of amplicon {amplicon_length}")

Length of amplicon 346


### Prepare Millipede specification parameters

In [3]:
# Design-matrix processing settings; this object is handed to the model specification below.
design_matrix_spec = cmm.MillipedeDesignMatrixProcessingSpecification(
    wt_normalization=False,
    total_normalization=True,
    sigma_scale_normalized=True,
    decay_sigma_scale=True,
    K_enriched=5,
    K_baseline=5,
    a_parameter=0.0005,
    set_offset_as_presort=True,
    offset_normalized=False,
    offset_psuedocount=1,  # keyword spelling is kept exactly as the call expects it
)

# Read-count cutoff settings, built separately so the model specification stays readable.
cutoff_spec = cmm.MillipedeCutoffSpecification(
    per_replicate_each_condition_num_cutoff=0,
    per_replicate_all_condition_num_cutoff=1,
    all_replicate_num_cutoff=0,
    all_experiment_num_cutoff=0,
    baseline_pop_all_condition_each_replicate_num_cutoff=3,
    baseline_pop_all_condition_acceptable_rep_count=2,
    enriched_pop_all_condition_each_replicate_num_cutoff=3,
    enriched_pop_all_condition_acceptable_rep_count=2,
    presort_pop_all_condition_each_replicate_num_cutoff=3,
    presort_pop_all_condition_acceptable_rep_count=2,
)

# One specification covering two model types; replicates are merged with the COVARIATE
# strategy and experiments with the SEPARATE strategy.
joint_replicate_model_spec = cmm.MillipedeModelSpecification(
    model_types=[
        cmm.MillipedeModelType.NORMAL_SIGMA_SCALED,
        cmm.MillipedeModelType.NORMAL,
    ],
    replicate_merge_strategy=cmm.MillipedeReplicateMergeStrategy.COVARIATE,
    experiment_merge_strategy=cmm.MillipedeExperimentMergeStrategy.SEPARATE,
    S=5,
    tau=0.01,
    tau_intercept=0.0001,
    cutoff_specification=cutoff_spec,
    design_matrix_processing_specification=design_matrix_spec,
)

# Specifications keyed by an id string; the same key is used later to look up results.
millipede_model_specification_set = {
    "joint_replicate_per_experiment_models": joint_replicate_model_spec,
}

In [4]:
# Load in the encoding data - UPDATE WITH YOUR OWN FILEPATHS and UPDATE SUFFIXES with what was specified in the encoding step
# The same per-replicate file supplies all three populations; only the read-count column differs.
# NOTE(review): the "{}" placeholder is presumably filled with each value of `reps` - confirm.
encoding_fn_template = "20240807_v0_1_84_encoding_dataframes_denoised_removed_ABE8e_encodings_rep{}.tsv"

paired_end_experiments_inputdata_denoised = cmm.MillipedeInputDataExperimentalGroup(
    data_directory="./",
    enriched_pop_fn_experiment_list=[encoding_fn_template],
    enriched_pop_df_reads_colname="#Reads_CD19minus",
    baseline_pop_fn_experiment_list=[encoding_fn_template],
    baseline_pop_df_reads_colname="#Reads_CD19plus",
    presort_pop_fn_experiment_list=[encoding_fn_template],
    presort_pop_df_reads_colname='#Reads_presort',
    experiment_labels=["ABE8e"],
    reps=[0, 1, 2],
    millipede_model_specification_set=millipede_model_specification_set,
)

Performing initial input validation checks...
Passed validation.
Retrieving data for
	Replicate Merge Strategy: MillipedeReplicateMergeStrategy.COVARIATE 
	Experiment Merge Strategy MillipedeExperimentMergeStrategy.SEPARATE
	Cutoff: 
                    per_replicate_each_condition_num_cutoff=0, 
                    per_replicate_presort_condition_num_cutoff=0, 
                    per_replicate_all_condition_num_cutoff=1, 
                    all_replicate_num_cutoff=0, 
                    all_experiment_num_cutoff=0, 
                    baseline_pop_per_condition_acceptable_rep_count=0, 
                    baseline_pop_per_condition_acceptable_rep_count=0, 
                    baseline_pop_per_condition_acceptable_rep_count=0,
                    enriched_pop_per_condition_each_replicate_num_cutoff=0,
                    enriched_pop_per_condition_acceptable_rep_count=0,
                    presort_pop_per_condition_each_replicate_num_cutoff=0,
                    presort_pop_per_

### Run Millipede

In [27]:
%%time
from crispr_millipede import encoding as cme
from crispr_millipede import modelling as cmm

# Run Millipede on the GPU, could also run on CPU by setting device=cmm.MillipedeComputeDevice.CPU 
paired_end_experiments_models_denoised = cmm.MillipedeModelExperimentalGroup(experiments_inputdata=paired_end_experiments_inputdata_denoised, device=cmm.MillipedeComputeDevice.GPU)

Start model inference for all provided model specifications: 1 total
Starting model inference for model specification id 1/1: joint_replicate_per_experiment_models
Number of single matrices: 1
With 2 model types, the total models to inference for this model specification: 2
Running model(s) for single matrix index: 1/1
Iterating through all 2 provided models: 
Preparing data for model NORMAL_SIGMA_SCALED, 1/2
Running model NORMAL_SIGMA_SCALED


  0%|          | 0/5500 [00:00<?, ?it/s]

Preparing data for model NORMAL, 2/2
Running model NORMAL


  0%|          | 0/5500 [00:00<?, ?it/s]

CPU times: user 40.2 s, sys: 859 ms, total: 41.1 s
Wall time: 38.8 s


In [23]:
# Get the Beta and PIP coefficients
# Resolve the NORMAL_SIGMA_SCALED result object once, then read both attributes from it.
sigma_scaled_result = (
    paired_end_experiments_models_denoised
    .millipede_model_specification_set_with_results['joint_replicate_per_experiment_models']
    .millipede_model_specification_result_input[0]
    .millipede_model_specification_single_matrix_result[cmm.MillipedeModelType.NORMAL_SIGMA_SCALED]
)
beta_df = sigma_scaled_result.beta
pip_df = sigma_scaled_result.pip

# Show the 40 largest values of each, beta first and PIP second.
for coefficient_values in (beta_df, pip_df):
    display(coefficient_values.sort_values(ascending=False).head(40))

223A>G                 0.345297
151A>G                 0.231534
137T>C                 0.218683
230A>G                 0.172012
158A>G                 0.145174
311A>G                 0.102413
Intercept              0.056921
148T>C                 0.026914
252A>G                 0.024541
228A>G                 0.022735
296T>C                 0.017394
308T>C                 0.016018
176T>C                 0.015985
intercept_exp0_rep1    0.014432
intercept_exp0_rep0    0.013535
254T>C                 0.006787
220A>G                 0.004918
318A>G                 0.004797
41A>G                  0.004710
302A>G                 0.002715
218A>G                 0.002708
260A>G                 0.002103
115A>G                 0.001701
290A>G                 0.001640
119A>G                 0.001291
28A>G                  0.000964
31T>C                  0.000835
324A>G                 0.000821
287T>C                 0.000604
186A>G                 0.000554
321A>G                 0.000549
283T>C  

137T>C    0.999999
151A>G    0.999971
223A>G    0.999883
158A>G    0.962024
270T>C    0.919929
230A>G    0.871196
311A>G    0.828489
105A>G    0.673081
241A>G    0.392369
148T>C    0.261461
252A>G    0.226278
112T>C    0.225789
248T>C    0.147185
228A>G    0.128986
308T>C    0.127089
176T>C    0.120848
239A>G    0.080153
106T>C    0.077520
296T>C    0.047972
318A>G    0.039347
254T>C    0.036575
62T>C     0.034494
220A>G    0.030856
60T>C     0.025770
41A>G     0.024199
242T>C    0.024038
59T>C     0.022887
260A>G    0.021459
213A>G    0.020653
61T>C     0.019509
113T>C    0.019012
302A>G    0.017598
283T>C    0.017441
218A>G    0.017207
65T>C     0.015144
324A>G    0.013401
289A>G    0.013216
211A>G    0.013110
165A>G    0.012787
180A>G    0.012493
Name: PIP, dtype: float64

In [117]:
# Walk down to the NORMAL_SIGMA_SCALED result step by step and take its summary table.
joint_models_result = paired_end_experiments_models_denoised.millipede_model_specification_set_with_results["joint_replicate_per_experiment_models"]
single_matrix_results = joint_models_result.millipede_model_specification_result_input[0].millipede_model_specification_single_matrix_result
sigma_hit_table = single_matrix_results[cmm.MillipedeModelType.NORMAL_SIGMA_SCALED].summary

# Save the table (index included) to the working directory, then display it as the cell output.
sigma_hit_table.to_csv('CD19_ABE8e_sigma_hit_table_ZP.csv', index=True)
sigma_hit_table

Unnamed: 0,PIP,Coefficient,Coefficient StdDev,Conditional Coefficient,Conditional Coefficient StdDev
21T>C,0.002079,0.000057,0.002054,0.033305,0.036591
23T>C,0.001627,0.000081,0.002558,0.041877,0.040324
28A>G,0.009664,0.001066,0.010257,0.083130,0.037150
31T>C,0.003243,0.000404,0.009140,0.074057,0.099380
35T>C,0.001179,0.000004,0.000438,0.042920,0.000000
...,...,...,...,...,...
324A>G,0.012526,0.001565,0.013515,0.101982,0.040786
intercept_exp0_rep0,,0.013960,0.013531,0.013960,0.013531
intercept_exp0_rep1,,0.015229,0.013990,0.015229,0.013990
intercept_exp0_rep2,,-0.028532,0.014108,-0.028532,0.014108
