# Run sg218 Millipede Model

### Prepare Data Input

This step requires only the encoding output from the previous step. Alternatively, you can download the pre-computed encodings from [Zenodo](https://doi.org/10.5281/zenodo.13737880) at the following paths:
- CRISPR-CLEAR-data/data/encoding_output/20240905_v0_1_89_sg219_encoding_dataframes_variantdenoised_removed_CRISPResso2Merged_trimmed_ABE8e_encodings_rep0.tsv
- CRISPR-CLEAR-data/data/encoding_output/20240905_v0_1_89_sg219_encoding_dataframes_variantdenoised_removed_CRISPResso2Merged_trimmed_ABE8e_encodings_rep1.tsv
- CRISPR-CLEAR-data/data/encoding_output/20240905_v0_1_89_sg219_encoding_dataframes_variantdenoised_removed_CRISPResso2Merged_trimmed_ABE8e_encodings_rep2.tsv

### Import packages

In [28]:
import os
# Expose only GPU index 1 to CUDA - UPDATE for your own machine.
# NOTE(review): this is set before the crispr_millipede imports below, presumably so it
# takes effect before any CUDA initialization happens - confirm before reordering.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"


# `cme` / `cmm` are the aliases used by the remaining cells of this notebook.
from crispr_millipede import encoding as cme
from crispr_millipede import modelling as cmm


In [29]:
import pandas as pd
import numpy as np

from Bio import SeqIO
from Bio.Seq import Seq

import matplotlib.pyplot as plt

In [30]:
# Set amplicon sequence - UPDATE
# The sequence is kept as a single literal so it can be replaced by a straight copy/paste.
amplicon = (
    "ACATGCTCTAGTGAAAGCCAGTCTGGGCAGCTGGGTAGCTAATGAGGGGATTAGAGAGATTTTGTTGAATGAAAGGCAGATTGAGTCCTGCTACTCGCCCCCTTCATTCCCCTTCATTCATGCCTCATTCTTCCGCCTCCCAGCCGCCTCAACTGGCCAAAGGGAAGTGGAGGCCCTGCCACCTGTAGGGAGGGTCCCCTGGGGCTTGCCCACAGCAAACAGGAAGTCACAGCCTGGTGAGATGGGCCTGGGAATCAGCCACTGAGAAAGTGGGTCTCTTGGGTCCCTGAATTCTTTTTCTGAGTCCCTGCAGCAGTGAAAAAGACACAGAGGCACATAGAGAGTG"
)

# Quick sanity check: report how many bases were entered.
print(f"Length of amplicon {len(amplicon)}")

Length of amplicon 346


### Prepare Millipede specification parameters

In [31]:
# Design-matrix processing options; passed to the model specification below.
design_matrix_spec = cmm.MillipedeDesignMatrixProcessingSpecification(
    wt_normalization=False,
    total_normalization=True,
    sigma_scale_normalized=True,
    decay_sigma_scale=True,
    K_enriched=5,
    K_baseline=5,
    a_parameter=0.0001,
    set_offset_as_presort=True,
    offset_normalized=False,
    offset_psuedocount=1,  # (sic) keyword name kept exactly as passed in the original call
)

# Read-count cutoffs, built separately so the thresholds can be read (and tuned) in one place.
read_count_cutoff_spec = cmm.MillipedeCutoffSpecification(
    per_replicate_each_condition_num_cutoff=0,
    per_replicate_all_condition_num_cutoff=1,
    all_replicate_num_cutoff=0,
    all_experiment_num_cutoff=0,
    baseline_pop_all_condition_each_replicate_num_cutoff=3,
    baseline_pop_all_condition_acceptable_rep_count=2,
    enriched_pop_all_condition_each_replicate_num_cutoff=3,
    enriched_pop_all_condition_acceptable_rep_count=2,
    presort_pop_all_condition_each_replicate_num_cutoff=3,
    presort_pop_all_condition_acceptable_rep_count=2,
)

# One specification requesting two model types (NORMAL_SIGMA_SCALED and NORMAL), with the
# COVARIATE replicate merge strategy and the SEPARATE experiment merge strategy.
joint_replicate_model_spec = cmm.MillipedeModelSpecification(
    model_types=[
        cmm.MillipedeModelType.NORMAL_SIGMA_SCALED,
        cmm.MillipedeModelType.NORMAL,
    ],
    replicate_merge_strategy=cmm.MillipedeReplicateMergeStrategy.COVARIATE,
    experiment_merge_strategy=cmm.MillipedeExperimentMergeStrategy.SEPARATE,
    S=5,
    tau=0.01,
    tau_intercept=0.0001,
    cutoff_specification=read_count_cutoff_spec,
    design_matrix_processing_specification=design_matrix_spec,
)

# Keyed by specification name; later cells look results up under this same key.
millipede_model_specification_set = {
    "joint_replicate_per_experiment_models": joint_replicate_model_spec,
}

In [32]:
# Load in the encoding data - UPDATE WITH YOUR OWN FILEPATHS and UPDATE SUFFIXES with what was specified in the encoding step
# The enriched, baseline and presort populations all come from the same encoding file;
# only the read-count column differs, so the filename template is defined once.
# NOTE(review): the "{}" placeholder is presumably filled with each value of `reps` - confirm.
encoding_fn_template = "20240905_v0_1_89_sg219_encoding_dataframes_variantdenoised_removed_CRISPResso2Merged_trimmed_ABE8e_encodings_rep{}.tsv"

paired_end_experiments_inputdata_denoised = cmm.MillipedeInputDataExperimentalGroup(
    data_directory="./",
    enriched_pop_fn_experiment_list=[encoding_fn_template],
    enriched_pop_df_reads_colname="#Reads_CD19minus",
    baseline_pop_fn_experiment_list=[encoding_fn_template],
    baseline_pop_df_reads_colname="#Reads_CD19plus",
    presort_pop_fn_experiment_list=[encoding_fn_template],
    presort_pop_df_reads_colname='#Reads_presort',
    # NOTE(review): the label says "EvoCDA" while the encoding filenames say "ABE8e" - confirm the label is intended.
    experiment_labels=["EvoCDA"],
    reps=[0, 1, 2],
    millipede_model_specification_set=millipede_model_specification_set,
)


Performing initial input validation checks...
Passed validation.
Retrieving data for
	Replicate Merge Strategy: MillipedeReplicateMergeStrategy.COVARIATE 
	Experiment Merge Strategy MillipedeExperimentMergeStrategy.SEPARATE
	Cutoff: 
                    per_replicate_each_condition_num_cutoff=0, 
                    per_replicate_presort_condition_num_cutoff=0, 
                    per_replicate_all_condition_num_cutoff=1, 
                    all_replicate_num_cutoff=0, 
                    all_experiment_num_cutoff=0, 
                    baseline_pop_per_condition_acceptable_rep_count=0, 
                    baseline_pop_per_condition_acceptable_rep_count=0, 
                    baseline_pop_per_condition_acceptable_rep_count=0,
                    enriched_pop_per_condition_each_replicate_num_cutoff=0,
                    enriched_pop_per_condition_acceptable_rep_count=0,
                    presort_pop_per_condition_each_replicate_num_cutoff=0,
                    presort_pop_per_

## Run Millipede

In [36]:
%%time
from crispr_millipede import encoding as cme
from crispr_millipede import modelling as cmm

# Run Millipede on the GPU, could also run on CPU by setting device=cmm.MillipedeComputeDevice.CPU 
paired_end_experiments_models_denoised = cmm.MillipedeModelExperimentalGroup(experiments_inputdata=paired_end_experiments_inputdata_denoised, device=cmm.MillipedeComputeDevice.GPU)

Start model inference for all provided model specifications: 1 total
Starting model inference for model specification id 1/1: joint_replicate_per_experiment_models
Number of single matrices: 1
With 2 model types, the total models to inference for this model specification: 2
Running model(s) for single matrix index: 1/1
Iterating through all 2 provided models: 
Preparing data for model NORMAL_SIGMA_SCALED, 1/2
Running model NORMAL_SIGMA_SCALED


  0%|          | 0/5500 [00:00<?, ?it/s]

Preparing data for model NORMAL, 2/2
Running model NORMAL


  0%|          | 0/5500 [00:00<?, ?it/s]

CPU times: user 41.1 s, sys: 5.08 s, total: 46.2 s
Wall time: 44 s


In [37]:
# Resolve the NORMAL_SIGMA_SCALED result once (index 0 of the result inputs), then read both
# attributes from it.
sigma_scaled_result = (
    paired_end_experiments_models_denoised
    .millipede_model_specification_set_with_results['joint_replicate_per_experiment_models']
    .millipede_model_specification_result_input[0]
    .millipede_model_specification_single_matrix_result[cmm.MillipedeModelType.NORMAL_SIGMA_SCALED]
)
beta_df = sigma_scaled_result.beta
pip_df = sigma_scaled_result.pip

# Show the 40 largest entries of each, highest first.
for ranked_values in (beta_df, pip_df):
    display(ranked_values.sort_values(ascending=False).head(40))

224A>G                 0.209907
230A>G                 0.148859
220A>G                 0.033779
223A>G                 0.028854
intercept_exp0_rep1    0.018781
254T>C                 0.007677
127T>C                 0.003882
62T>C                  0.003430
131T>C                 0.002730
103T>C                 0.002116
205T>C                 0.002107
206T>C                 0.002033
151A>G                 0.001842
186A>G                 0.001029
190A>G                 0.001022
324A>G                 0.000917
126A>G                 0.000788
40A>G                  0.000769
107T>C                 0.000718
117T>C                 0.000708
158A>G                 0.000620
113T>C                 0.000560
54A>G                  0.000499
199T>C                 0.000494
23T>C                  0.000485
234T>C                 0.000468
237T>C                 0.000466
56A>G                  0.000398
316T>C                 0.000376
217A>G                 0.000375
176T>C                 0.000368
51T>C   

230A>G    0.999791
224A>G    0.999740
220A>G    0.198396
223A>G    0.190419
289A>G    0.041748
50T>C     0.027256
278T>C    0.023368
283T>C    0.022754
254T>C    0.018422
320A>G    0.017792
72A>G     0.014070
159A>G    0.012914
252A>G    0.012600
62T>C     0.012524
241A>G    0.012191
131T>C    0.011368
242T>C    0.010919
206T>C    0.008406
186A>G    0.008302
127T>C    0.007964
151A>G    0.007871
103T>C    0.007321
199T>C    0.007296
205T>C    0.007291
276T>C    0.006153
117T>C    0.006033
190A>G    0.005450
194T>C    0.005402
279T>C    0.005028
126A>G    0.004928
158A>G    0.004785
77A>G     0.004575
302A>G    0.004569
183T>C    0.004500
91T>C     0.004485
324A>G    0.004426
226T>C    0.004368
160A>G    0.004351
237T>C    0.004236
308T>C    0.004189
Name: PIP, dtype: float64

In [153]:
# NOTE(review): this cell's saved execution count (153) is out of sequence with the
# neighbouring cells (37, 38), so its saved output may be stale - re-run after "Restart & Run All".
# Resolve the NORMAL result once (index 0 of the result inputs), then read both attributes
# from it. This overwrites `beta_df` / `pip_df`.
normal_result = (
    paired_end_experiments_models_denoised
    .millipede_model_specification_set_with_results['joint_replicate_per_experiment_models']
    .millipede_model_specification_result_input[0]
    .millipede_model_specification_single_matrix_result[cmm.MillipedeModelType.NORMAL]
)
beta_df = normal_result.beta
pip_df = normal_result.pip

# Show the 40 largest entries of each, highest first.
for ranked_values in (beta_df, pip_df):
    display(ranked_values.sort_values(ascending=False).head(40))

intercept_exp0_rep1    0.195477
218A>G                 0.026805
224A>G                 0.025688
220A>G                 0.021099
Intercept              0.008955
223A>G                 0.007805
228A>G                 0.001056
226T>C                -0.000827
intercept_exp0_rep0   -0.063569
intercept_exp0_rep2   -0.130239
Name: Coefficient, dtype: float64

218A>G    0.162961
224A>G    0.151067
220A>G    0.130310
223A>G    0.066543
226T>C    0.032389
228A>G    0.024483
Name: PIP, dtype: float64

In [38]:
# Pull the summary attribute of the NORMAL_SIGMA_SCALED result and write it to CSV.
sigma_hit_table = (
    paired_end_experiments_models_denoised
    .millipede_model_specification_set_with_results["joint_replicate_per_experiment_models"]
    .millipede_model_specification_result_input[0]
    .millipede_model_specification_single_matrix_result[cmm.MillipedeModelType.NORMAL_SIGMA_SCALED]
    .summary
)

# Written to the current working directory, index column included.
# NOTE(review): the output name says "sg218" while the input encodings are named "sg219" - confirm which is intended.
sigma_hit_table.to_csv('CD19_sg218_sigma_hit_table_ZPV2.csv', index=True)