In [61]:
import os
import pandas as pd
from CRISPResso2 import CRISPRessoShared, CRISPRessoPlot
from collections import defaultdict
import matplotlib.pyplot as plt
import ast

In [2]:
%matplotlib inline

In [12]:
! unzip ../CRISPResso2_tests/cli_integration_tests/CRISPResso_on_params/Alleles_frequency_table.zip

Archive:  ../CRISPResso2_tests/cli_integration_tests/CRISPResso_on_params/Alleles_frequency_table.zip
  inflating: Alleles_frequency_table.txt  


In [26]:
# Read the tab-separated allele table stored in df_alleles.txt.
df_alleles = pd.read_csv('df_alleles.txt', sep='\t')

In [28]:
# Index the allele table by its Aligned_Sequence column.
# set_index moves the column into the index, so running it a second time on
# the same frame would raise KeyError; the guard makes the cell re-runnable.
if df_alleles.index.name != 'Aligned_Sequence':
    df_alleles = df_alleles.set_index('Aligned_Sequence')

In [36]:
# Print every index label of the allele table, one per line.
for idx in df_alleles.index:
    print(idx)
    

ACATCGGATGTTCCAATCAGTACGCAGAGAGTCGCCGTCTCCAAGGTGAAAGCGGAAGTAGGGCCTTCGCGCACCTCATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAGCTTCTGGCGGTCTCAAGCACTACCTACGTCAGCACCTGGGACCCCGCCACCGTGCGCCGGGCCTTGCAGTGGGCGCGCTACCTGCGCCACATCCATCGGCGCTTTGGTCGGCATGGCCCCATTCGCACGGCTCT


## Plotting

In [3]:
# Load the table to plot. The file is tab-separated despite its .csv
# extension; the first row is the header and the first column is the index.
df_to_plot = pd.read_csv('df_to_plot.csv', header=0, sep='\t', index_col=0)

In [4]:
df_to_plot.head()

Unnamed: 0_level_0,Reference_Sequence,Unedited,n_deleted,n_inserted,n_mutated,#Reads,%Reads
Aligned_Sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG,CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG,1,0,0,0,163,78.365385
CATGGAATCCCTTCTGCA---CCTGGATCGCTTTTCCGAG,CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG,1,0,0,0,8,3.846154
CATGGAATCCCTTCTG----ACCTGGATCGCTTTTCCGAG,CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG,1,0,0,0,2,0.961538
CATGGAATCCCTTCTGCAGC--CTGGATCGCTTTTCCGAG,CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG,1,0,0,0,2,0.961538
CATGGAATCCCTTCTGCAGCCACCTGGATCGCTTTTCCGA,CATGGAATCCCTTCTGCAGC-ACCTGGATCGCTTTTCCGA,1,0,0,0,2,0.961538


In [5]:
# Inputs for the "9a" amino-acid table plot. Every value is bound to a
# notebook-level name and then gathered into `plot_9a_inputs`, so the values
# are reachable both individually and as keyword arguments.
reference_seq = 'CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG'
# NOTE(review): this rebinds the notebook-level `df_alleles` to the plotting
# frame; cells that expect the full allele table must reload it first.
df_alleles = df_to_plot
fig_filename_root = './scripts/figures/9a'
custom_colors = {
    'Substitution': '#0000FF',
    'Insertion': '#008000',
    'Deletion': '#FF0000',
    'A': '#7FC97F',
    'T': '#BEAED4',
    'C': '#FDC086',
    'G': '#FFFF99',
    'N': '#C8C8C8',
    '-': '#1E1E1E',
}
MIN_FREQUENCY = 0.2
MAX_N_ROWS = 50
SAVE_ALSO_PNG = True
plot_cut_point = False
sgRNA_intervals = [(3, 22), (99, 119), (98, 112)]
sgRNA_names = ['hi', 'dear', '']
sgRNA_mismatches = [[], [0], [7]]
annotate_wildtype_allele = ''

plot_9a_inputs = {
    'reference_seq': reference_seq,
    'df_alleles': df_alleles,
    'fig_filename_root': fig_filename_root,
    'custom_colors': custom_colors,
    'MIN_FREQUENCY': MIN_FREQUENCY,
    'MAX_N_ROWS': MAX_N_ROWS,
    'SAVE_ALSO_PNG': SAVE_ALSO_PNG,
    'plot_cut_point': plot_cut_point,
    'sgRNA_intervals': sgRNA_intervals,
    'sgRNA_names': sgRNA_names,
    'sgRNA_mismatches': sgRNA_mismatches,
    'annotate_wildtype_allele': annotate_wildtype_allele,
}

In [9]:
# Unpack the seven values returned by prep_amino_acid_table. The descriptions
# below are working notes, not confirmed against the library source:
#   X                        - 2d list of ints; presumably the sequence encoded for the cmap
#   annot                    - the sequence characters shown in each cell
#   y_labels                 - the percentage and tallies displayed to the right
#   insertion_dict           - presumably aligned_seq -> position(s) of its insertion
#   per_element_annot_kws    - dict of dicts used for bolding the substitutions
#   is_reference             - TODO: document
#   ref_sequence_amino_acids - TODO: document
X, annot, y_labels, insertion_dict, per_element_annot_kws, is_reference, ref_sequence_amino_acids = \
    CRISPRessoPlot.prep_amino_acid_table(
        df_to_plot,
        plot_9a_inputs['reference_seq'],
        plot_9a_inputs['MAX_N_ROWS'],
        plot_9a_inputs['MIN_FREQUENCY'],
    )


In [10]:
X

[[7, 6, 8, 13, 16, 1, 1, 13, 6, 16, 10, 5, 15],
 [7, 6, 8, 13, 16, 1, 22, 13, 6, 16, 10, 5, 15],
 [7, 6, 8, 13, 16, 3, 10, 3, 15, 5, 16, 4, 21],
 [7, 6, 8, 13, 16, 1, 1, 19, 8, 1, 5, 13, 21],
 [7, 6, 8, 13, 16, 1, 1, 17, 19, 8, 1, 5, 13],
 [7, 6, 8, 22, 2, 6, 0, 13, 20, 20, 15, 10, 5],
 [7, 6, 8, 13, 16, 13, 6, 16, 10, 5, 15, 21, 21],
 [7, 6, 8, 13, 16, 22, 17, 19, 8, 1, 5, 13, 21],
 [7, 6, 8, 13, 16, 22, 7, 10, 3, 15, 5, 16, 4],
 [7, 6, 8, 13, 16, 1, 19, 8, 1, 5, 13, 21, 21],
 [7, 6, 8, 13, 16, 1, 17, 19, 8, 1, 5, 13, 21],
 [7, 6, 8, 13, 16, 1, 4, 6, 8, 1, 5, 13, 21],
 [7, 6, 8, 13, 16, 1, 3, 14, 15, 6, 16, 10, 14],
 [7, 6, 8, 13, 16, 1, 1, 8, 1, 5, 13, 21, 21],
 [7, 6, 8, 13, 16, 1, 1, 6, 16, 10, 5, 15, 21],
 [7, 6, 8, 13, 16, 1, 1, 17, 19, 8, 1, 5, 13],
 [7, 6, 8, 13, 16, 1, 1, 10, 3, 15, 5, 16, 4],
 [7, 6, 8, 13, 16, 1, 1, 16, 6, 16, 10, 5, 15],
 [7, 6, 8, 13, 16, 1, 1, 17, 19, 8, 1, 5, 13],
 [7, 6, 8, 13, 16, 1, 1, 16, 22, 15, 5, 16, 4]]

In [11]:
# Draw the amino-acid table, passing every entry of plot_9a_inputs as a
# keyword argument.
# NOTE(review): presumably writes figure files under 'fig_filename_root' — confirm.
CRISPRessoPlot.plot_amino_acid_table(**plot_9a_inputs)

In [11]:
len(set(CRISPRessoShared.CODON_TO_AMINO_ACID_SINGLE_CHAR.values()))

21

In [11]:
# Unique single-character amino-acid codes plus '' as a final placeholder.
# Sorting fixes the order: iterating a set of strings can give a different
# order in every kernel session (string hash randomization), which would make
# any position-based pairing with a colour list irreproducible.
# Assumes every code is a string (sortable) — TODO confirm.
amino_acids = sorted(set(CRISPRessoShared.CODON_TO_AMINO_ACID_SINGLE_CHAR.values())) + ['']

In [12]:
# 22 RGBA hex colours: each base RGB colour below gets the same '66' alpha
# suffix appended.
cmap = [
    rgb + '66'
    for rgb in (
        '#FF0000', '#90EE90', '#008000', '#006400', '#006400', '#C8A2C8',
        '#90EE90', '#00008B', '#0000FF', '#FFA500', '#0000FF', '#0000FF',
        '#006400', '#FFC0CB', '#006400', '#FFA500', '#FF5733', '#FF5733',
        '#0000FF', '#C8A2C8', '#C8A2C8', '#FFFFFF',
    )
]


In [18]:
# Load the tab-separated Alleles_frequency_table.txt (the file extracted by
# the unzip cell), indexed by its Aligned_Sequence column.
df_alleles = pd.read_csv('Alleles_frequency_table.txt', sep='\t', index_col='Aligned_Sequence')

In [22]:
len(df_alleles.iloc[0,0])

250

In [23]:
df_alleles.head()

Unnamed: 0_level_0,Reference_Sequence,Reference_Name,Read_Status,n_deleted,n_inserted,n_mutated,#Reads,%Reads,contains dsODN,contains dsODN fragment
Aligned_Sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ACATCGGATGTTCCAATCAGTACGCAGAGAGTCGCCGTCTCCAAGGTGAAAGCGGAAGTAGGGCCTTCGCGCACCTCATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAGCTTCTGGCGGTCTCAAGCACTACCTACGTCAGCACCTGGGACCCCGCCACCGTGCGCCGGGCCTTGCAGTGGGCGCGCTACCTGCGCCACATCCATCGGCGCTTTGGTCGGCATGGCCCCATTCGCACGGCTCT,----CGGATGTTCCAATCAGTACGCAGAGAGTCGCCGTCTCCAAGG...,FANC,UNMODIFIED,0,0,0,3,1.442308,False,False
AGAGCGGATGTTCCAATCAGTACGCAGAGAGTCGCCGTCTCCAAGGTGAAAGCGGAAGTAGGGCCTTCGCGCACCTCATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAGCTTCTGGCGGTCTCAAGCACTACCTACGTCAGCACCTGGGACCCCGCCACCGTGCGCCGGGCCTTGCAGTGGGCGCGCTACCTGCGCCACATCCATCGGCGCTTTGGTCGGCATGGCCCCATTCGCACGGCTCT,----CGGATGTTCCAATCAGTACGCAGAGAGTCGCCGTCTCCAAGG...,FANC,UNMODIFIED,0,0,0,3,1.442308,False,False
AGGGCGGATGTTCCAATCAGTACGCAGAGAGTCGCCGTCTCCAAGGTGAAAGCGGAAGTAGGGCCTTCGCGCACCTCATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAGCTTCTGGCGGTCTCAAGCACTACCTACGTCAGCACCTGGGACCCCGCCACCGTGCGCCGGGCCTTGCAGTGGGCGCGCTACCTGCGCCACATCCATCGGCGCTTTGGTCGGCATGGCCCCATTCGCACGGCTCT,----CGGATGTTCCAATCAGTACGCAGAGAGTCGCCGTCTCCAAGG...,FANC,UNMODIFIED,0,0,0,3,1.442308,False,False
GCAGCGGATGTTCCAATCAGTACGCAGAGAGTCGCCGTCTCCAAGGTGAAAGCGGAAGTAGGGCCTTCGCGCACCTCATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAGCTTCTGGCGGTCTCAAGCACTACCTACGTCAGCACCTGGGACCCCGCCACCGTGCGCCGGGCCTTGCAGTGGGCGCGCTACCTGCGCCACATCCATCGGCGCTTTGGTCGGCATGGCCCCATTCGCACGGCTCT,----CGGATGTTCCAATCAGTACGCAGAGAGTCGCCGTCTCCAAGG...,FANC,UNMODIFIED,0,0,0,3,1.442308,False,False
GGAGCGGATGTTCCAATCAGTACGCAGAGAGTCGCCGTCTCCAAGGTGAAAGCGGAAGTAGGGCCTTCGCGCACCTCATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAGCTTCTGGCGGTCTCAAGCACTACCTACGTCAGCACCTGGGACCCCGCCACCGTGCGCCGGGCCTTGCAGTGGGCGCGCTACCTGCGCCACATCCATCGGCGCTTTGGTCGGCATGGCCCCATTCGCACGGCTCT,----CGGATGTTCCAATCAGTACGCAGAGAGTCGCCGTCTCCAAGG...,FANC,UNMODIFIED,0,0,0,3,1.442308,False,False


## Load info dict

In [12]:
# Directory handed to load_crispresso_info below.
root = '../CRISPResso2_tests/cli_integration_tests/CRISPResso_on_params/'
# Load the run's info object; later cells read crispresso2_info['results']['refs'].
crispresso2_info = CRISPRessoShared.load_crispresso_info(root)

In [13]:
refs = crispresso2_info['results']['refs']

In [14]:
sequence = refs['FANC']['sequence']

In [15]:
refs['FANC']['contains_coding_seq']

True

In [16]:
exon_positions = refs['FANC']['exon_positions']

In [17]:
# Concatenate, in order, the FANC reference characters found at each exon position.
coding_seq = ''.join(
    refs['FANC']['sequence'][exon_pos]
    for exon_pos in refs['FANC']['exon_positions']
)

## Get DF to Plot

* `Aligned_Sequence` is the read from the FASTQ after alignment; a `-` in it marks a deletion.
* `Reference_Sequence` is the reference after alignment; a `-` in it marks an insertion in the read.

Our goal is to extract the coding sequence from each aligned sequence. We'll use `CRISPRessoShared.get_dataframe_around_cut` to do this.

In [18]:
refs['FANC']['sgRNA_cut_points']

[91, 188, 172]

In [64]:
# Reload the tab-separated allele table, using its first column as the index.
df_alleles = pd.read_csv(
    'df_alleles.txt',
    index_col=0,
    sep='\t',
)
# The ref_positions column is stored as text; turn each entry back into the
# Python literal it spells out.
ref_positions_parsed = df_alleles['ref_positions'].apply(ast.literal_eval)
df_alleles['ref_positions'] = ref_positions_parsed

In [66]:
# Keep only the rows aligned to the FANC reference.
fanc_alleles = df_alleles.loc[df_alleles['Reference_Name'] == 'FANC']
# 91 equals the first value of refs['FANC']['sgRNA_cut_points'] ([91, 188, 172]).
# NOTE(review): 20 is presumably the window size on each side of the cut
# (the resulting sequences are 40 characters long) — confirm.
df_alleles_around_cut = CRISPRessoShared.get_dataframe_around_cut(fanc_alleles, 91, 20)
# Both names refer to the same frame.
df_test = df_alleles_around_cut

In [67]:
df_test

Unnamed: 0_level_0,Reference_Sequence,Unedited,n_deleted,n_inserted,n_mutated,#Reads,%Reads
Aligned_Sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG,CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG,True,0,0,0,163,78.365385
CATGGAATCCCTTCTGCA---CCTGGATCGCTTTTCCGAG,CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG,True,0,0,0,8,3.846154
CATGGAATCCCTTCTG----ACCTGGATCGCTTTTCCGAG,CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG,True,0,0,0,2,0.961538
CATGGAATCCCTTCTGCAGC--CTGGATCGCTTTTCCGAG,CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG,True,0,0,0,2,0.961538
CATGGAATCCCTTCTGCAGCCACCTGGATCGCTTTTCCGA,CATGGAATCCCTTCTGCAGC-ACCTGGATCGCTTTTCCGA,True,0,0,0,2,0.961538
CATGGAATC---TGTGGATAACC-GTATTACCGCCTTTTC,CATGGAATCCCTTCTGCAGCACCTGGATCGC----TTTTC,True,0,0,0,1,0.480769
CATGGAATCCCTTC------ACCTGGATCGCTTTTCCGAG,CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG,True,0,0,0,1,0.480769
CATGGAATCCCTTCT-----ACCTGGATCGCTTTTCCGAG,CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG,True,0,0,0,1,0.480769
CATGGAATCCCTTCT----CACCTGGATCGCTTTTCCGAG,CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG,True,0,0,0,1,0.480769
CATGGAATCCCTTCTGC-----CTGGATCGCTTTTCCGAG,CATGGAATCCCTTCTGCAGCACCTGGATCGCTTTTCCGAG,True,0,0,0,1,0.480769
