### Tool Name: Deletion Analyzer
### Name: Michael Welford

#### Updated 10/23/2022

Required input files: <br>
                      pairwise_deletion_counts_freq_for_S_all.csv<br>
                      pairwise_deletion_lengths_counts_freq_for_S_all.csv<br>
                      multi_deletion_counts_freq_for_S_all.csv<br>
                      multi_deletion_lengths_counts_freq_for_S_all.csv<br>

In [1]:
# Import required libraries/packages.
import pandas as pd
import altair as alt

# Define the number of sequences in each dataset.
# NOTE(review): neither constant is referenced in the cells shown here; the
# frequency columns arrive precomputed in the CSV files. Presumably the
# "with Omicron" count belongs to UniqueSeqs-Dec and the "pre Omicron" count
# to UniqueSeqs-Sept -- confirm against the script that produced the CSVs.
NUM_WITH_OMICRON_SEQUENCE = 305090
NUM_PRE_OMICRON_SEQUENCE = 1128595

In [2]:
def is_frameshift(length):
    """
    Report whether a deletion of the given length is a frameshift.

    A length that is an exact multiple of 3 gives False; every other
    length gives True.
    """
    # A non-zero remainder modulo 3 is truthy, which marks a frameshift.
    return bool(length % 3)

# Import data

## UniqueSeqs-Sept

In [3]:
# Import the pairwise deletion data (UniqueSeqs-Sept).
# The path is relative, so the CSV must be in the notebook's working directory.
old_deletion_length_data = pd.read_csv('pairwise_deletion_counts_freq_for_S_all.csv')

In [4]:
# Preview the data (the notebook renders the cell's last expression).
# Each row gives a start/end nucleotide position, a deletion length, and the
# count and frequency recorded for that deletion.
old_deletion_length_data

Unnamed: 0,Starting_Nucleotide_Position,Deletion_Length,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions
0,3,1,3,1,8.861305e-07,0,0.0
1,4,6,9,1,8.861305e-07,0,0.0
2,5,3,7,3,2.658391e-06,0,0.0
3,8,1,8,2,1.772261e-06,0,0.0
4,8,3,10,1,8.861305e-07,0,0.0
...,...,...,...,...,...,...,...
819,3772,1,3772,1,8.861305e-07,0,0.0
820,3780,15,3794,1,8.861305e-07,0,0.0
821,3796,24,3819,1,8.861305e-07,0,0.0
822,3810,1,3810,1,8.861305e-07,0,0.0


In [5]:
# Import the pairwise deletion lengths data (UniqueSeqs-Sept): one row per
# deletion length with its 'Count' and 'Frequency'.
old_data_deletion_lengths = pd.read_csv('pairwise_deletion_lengths_counts_freq_for_S_all.csv')

In [6]:
# Show the top 10 deletion lengths, ranked by the 'Frequency' column in
# descending order (Metric 1: sequences with a deletion of that length).
old_data_deletion_lengths.sort_values('Frequency', ascending=False).head(10)

Unnamed: 0,Deletion Length,Count,Frequency
5,6,730614,0.647419
2,3,387304,0.343202
8,9,5797,0.005137
0,1,1322,0.001171
19,21,876,0.000776
11,12,339,0.0003
1,2,306,0.000271
3,4,172,0.000152
14,15,120,0.000106
17,18,56,5e-05


## UniqueSeqs-Dec

In [7]:
# Import the multi alignment data (UniqueSeqs-Dec).
# The path is relative, so the CSV must be in the notebook's working directory.
deletion_length_data = pd.read_csv('multi_deletion_counts_freq_for_S_all.csv')

In [8]:
# Preview the data (the notebook renders the cell's last expression).
# The columns match those of the pairwise table loaded earlier.
deletion_length_data

Unnamed: 0,Starting_Nucleotide_Position,Deletion_Length,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions
0,7,1,7,1,0.000003,216414,0.709345
1,10,3,12,1,0.000003,216414,0.709345
2,12,1,12,11,0.000036,216414,0.709345
3,13,1,13,128,0.000420,216414,0.709345
4,13,2,14,1,0.000003,216414,0.709345
...,...,...,...,...,...,...,...
2395,3794,9,3802,2,0.000007,216414,0.709345
2396,3801,1,3801,1,0.000003,216414,0.709345
2397,3805,1,3805,1,0.000003,216414,0.709345
2398,3813,1,3813,1,0.000003,216414,0.709345


In [9]:
# Import the multi alignment deletion lengths data (UniqueSeqs-Dec): one row
# per deletion length with its 'Count' and 'Frequency'.
new_deletion_lengths_data = pd.read_csv('multi_deletion_lengths_counts_freq_for_S_all.csv')

# Show the top 10 lengths by the frequency of sequences with deletions of that length.
# head(10) keeps this table the same size as the equivalent pairwise
# (UniqueSeqs-Sept) table, so the two rankings can be compared row for row.
new_deletion_lengths_data.sort_values('Frequency', ascending=False).head(10)

Unnamed: 0,Deletion Length,Count,Frequency
5,6,206590,0.677144
2,3,49215,0.161313
8,9,3244,0.010633
0,1,1615,0.005294
20,21,1599,0.005241
11,12,438,0.001436
1,2,395,0.001295
4,5,242,0.000793
3,4,217,0.000711
14,15,133,0.000436


# Add amino acid position values

In [10]:
# Add the amino acid positions as attributes.
# Nucleotide positions 1-3 map to amino acid 1, 4-6 to amino acid 2, and so
# on: (position - 1) // 3 + 1.
old_deletion_length_data['Starting_Amino_Acid_Position'] = (
    old_deletion_length_data['Starting_Nucleotide_Position'].sub(1).floordiv(3).add(1)
)
old_deletion_length_data['Ending_Amino_Acid_Position'] = (
    old_deletion_length_data['Ending_Nucleotide_Position'].sub(1).floordiv(3).add(1)
)

# Preview the updated data.
old_deletion_length_data

Unnamed: 0,Starting_Nucleotide_Position,Deletion_Length,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions,Starting_Amino_Acid_Position,Ending_Amino_Acid_Position
0,3,1,3,1,8.861305e-07,0,0.0,1,1
1,4,6,9,1,8.861305e-07,0,0.0,2,3
2,5,3,7,3,2.658391e-06,0,0.0,2,3
3,8,1,8,2,1.772261e-06,0,0.0,3,3
4,8,3,10,1,8.861305e-07,0,0.0,3,4
...,...,...,...,...,...,...,...,...,...
819,3772,1,3772,1,8.861305e-07,0,0.0,1258,1258
820,3780,15,3794,1,8.861305e-07,0,0.0,1260,1265
821,3796,24,3819,1,8.861305e-07,0,0.0,1266,1273
822,3810,1,3810,1,8.861305e-07,0,0.0,1270,1270


In [11]:
# Repeat the process for the multi alignment data.
deletion_length_data['Starting_Amino_Acid_Position'] = ((deletion_length_data.Starting_Nucleotide_Position - 1) // 3 + 1)
deletion_length_data['Ending_Amino_Acid_Position'] = ((deletion_length_data.Ending_Nucleotide_Position - 1) // 3 + 1)
deletion_length_data

Unnamed: 0,Starting_Nucleotide_Position,Deletion_Length,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions,Starting_Amino_Acid_Position,Ending_Amino_Acid_Position
0,7,1,7,1,0.000003,216414,0.709345,3,3
1,10,3,12,1,0.000003,216414,0.709345,4,4
2,12,1,12,11,0.000036,216414,0.709345,4,4
3,13,1,13,128,0.000420,216414,0.709345,5,5
4,13,2,14,1,0.000003,216414,0.709345,5,5
...,...,...,...,...,...,...,...,...,...
2395,3794,9,3802,2,0.000007,216414,0.709345,1265,1268
2396,3801,1,3801,1,0.000003,216414,0.709345,1267,1267
2397,3805,1,3805,1,0.000003,216414,0.709345,1269,1269
2398,3813,1,3813,1,0.000003,216414,0.709345,1271,1271


# Deletion Lengths

In [12]:
# Determine the min, max, and count of different deletion lengths in the pairwise data.
old_deletion_length_values = set(old_deletion_length_data['Deletion_Length'])
print("Min:",min(old_deletion_length_values))
print("Max:",max(old_deletion_length_values))
print("Count:",len(old_deletion_length_values))

Min: 1
Max: 271
Count: 52


In [13]:
# Determine the min, max, and count of different deletion lengths in the multi data.
deletion_length_values = set(deletion_length_data['Deletion_Length'])
print("Min:",min(deletion_length_values))
print("Max:",max(deletion_length_values))
print("Count:",len(deletion_length_values))

Min: 1
Max: 102
Count: 96


# List of contiguous deletions with frequency > 0.001

In [14]:
# List the high frequency contiguous deletions in the pairwise dataset:
# rows with Deletion_Freq above 0.001, most frequent first.
old_high_frequency_deletions = (
    old_deletion_length_data
    .loc[old_deletion_length_data['Deletion_Freq'] > 0.001]
    .sort_values(by='Deletion_Freq', ascending=False)
)
old_high_frequency_deletions

Unnamed: 0,Starting_Nucleotide_Position,Deletion_Length,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions,Starting_Amino_Acid_Position,Ending_Amino_Acid_Position
264,429,3,431,383463,0.339798,0,0.0,143,144
111,203,6,208,382341,0.338804,0,0.0,68,70
320,467,6,472,345538,0.306192,0,0.0,156,158
409,719,9,727,4587,0.004065,0,0.0,240,243
118,205,6,210,1522,0.001349,0,0.0,69,70
278,433,3,435,1161,0.001029,0,0.0,145,145


In [15]:
# List the high frequency contiguous deletions in the multi dataset:
# rows with Deletion_Freq above 0.001, most frequent first.
(
    deletion_length_data
    .loc[deletion_length_data['Deletion_Freq'] > 0.001]
    .sort_values(by='Deletion_Freq', ascending=False)
)

Unnamed: 0,Starting_Nucleotide_Position,Deletion_Length,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions,Starting_Amino_Acid_Position,Ending_Amino_Acid_Position
761,467,6,472,158536,0.519637,216414,0.709345,156,158
238,204,6,209,47791,0.156646,216414,0.709345,68,70
686,432,3,434,47202,0.154715,216414,0.709345,144,145
994,725,9,733,2153,0.007057,216414,0.709345,242,245
1041,737,21,757,1492,0.00489,216414,0.709345,246,253
625,422,9,430,559,0.001832,216414,0.709345,141,144
1006,727,6,732,469,0.001537,216414,0.709345,243,244
907,629,3,631,399,0.001308,216414,0.709345,210,211


# Histogram (Ratio of sequences with deletions of each length) (Metric 1)

In [16]:
# Create the histogram for the pairwise dataset.
# The log-scaled y axis is given the same explicit domain as the multi panel
# below. The two charts are shown side by side, and with an explicit domain
# on only one of them the panels would be drawn on different y ranges.
hist_sept = alt.Chart(old_data_deletion_lengths).mark_bar(color='black').encode(
    x=alt.X(
        'Deletion Length',
        axis=alt.Axis(titleFontSize=14, labelFontSize=12),
        scale=alt.Scale(nice=False),
    ),
    y=alt.Y(
        'Frequency',
        title='Log(Frequency)',
        axis=alt.Axis(titleFontSize=14, labelFontSize=12),
        scale=alt.Scale(type='log', domain=[0.0000001, 1]),
    ),
).properties(
    title={"text": 'Histogram of Deletion Lengths: UniqueSeqs-Sept', "fontSize": 14},
)

# Create the histogram for the multi dataset.
hist_dec = alt.Chart(new_deletion_lengths_data).mark_bar(color='black').encode(
    x=alt.X(
        'Deletion Length',
        axis=alt.Axis(titleFontSize=14, labelFontSize=12),
        scale=alt.Scale(nice=False),
    ),
    y=alt.Y(
        'Frequency',
        title='Log(Frequency)',
        axis=alt.Axis(titleFontSize=14, labelFontSize=12),
        scale=alt.Scale(type='log', domain=[0.0000001, 1]),
    ),
).properties(
    title={"text": 'Histogram of Deletion Lengths: UniqueSeqs-Dec', "fontSize": 14},
)

# Combine the two plots (horizontal concatenation).
hist_sept | hist_dec

In [17]:
# Restrict both length tables to rows whose 'Deletion Length' is <= 10.
old_deletion_lengths_subset = old_data_deletion_lengths.loc[
    old_data_deletion_lengths['Deletion Length'].le(10)
]
new_deletion_lengths_subset = new_deletion_lengths_data.loc[
    new_deletion_lengths_data['Deletion Length'].le(10)
]

In [18]:
# Create the subsetted histogram for the pairwise dataset (linear y axis, 0 to 1).
hist_sept_10 = (
    alt.Chart(old_deletion_lengths_subset)
    .mark_bar(color='black')
    .encode(
        x=alt.X(
            'Deletion Length',
            axis=alt.Axis(titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(nice=False),
        ),
        y=alt.Y(
            'Frequency',
            axis=alt.Axis(titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(domain=[0, 1]),
        ),
    )
    .properties(
        title={
            "text": 'Histogram of Deletion Lengths: UniqueSeqs-Sept',
            "fontSize": 14,
            "subtitle": "Lengths: 1-10",
        }
    )
)

# Create the subsetted histogram for the multi dataset (same axes as above).
hist_dec_10 = (
    alt.Chart(new_deletion_lengths_subset)
    .mark_bar(color='black')
    .encode(
        x=alt.X(
            'Deletion Length',
            axis=alt.Axis(titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(nice=False),
        ),
        y=alt.Y(
            'Frequency',
            axis=alt.Axis(titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(domain=[0, 1]),
        ),
    )
    .properties(
        title={
            "text": 'Histogram of Deletion Lengths: UniqueSeqs-Dec',
            "fontSize": 14,
            "subtitle": "Lengths: 1-10",
        }
    )
)

# Merge and show the two plots side by side.
hist_sept_10 | hist_dec_10

# Weighted overall frequency lengths of deletions. (Metric 2)

## This is the second metric used. The weighted frequency of a deletion length is the number of deletion occurrences of that length in the dataset divided by the total number of deletion occurrences.

In [19]:
# Add the weighted deletion frequency (Metric 2) to the pairwise dataset:
# each row's Deletion_Count divided by the sum of Deletion_Count over all rows.
old_deletion_length_data['Weighted Deletion Frequency'] = (
    old_deletion_length_data['Deletion_Count']
    .div(old_deletion_length_data['Deletion_Count'].sum())
)

# Display the position, length and new frequency columns.
old_deletion_length_data[
    ['Starting_Nucleotide_Position', 'Deletion_Length', 'Weighted Deletion Frequency']
]

Unnamed: 0,Starting_Nucleotide_Position,Deletion_Length,Weighted Deletion Frequency
0,3,1,8.859640e-07
1,4,6,8.859640e-07
2,5,3,2.657892e-06
3,8,1,1.771928e-06
4,8,3,8.859640e-07
...,...,...,...
819,3772,1,8.859640e-07
820,3780,15,8.859640e-07
821,3796,24,8.859640e-07
822,3810,1,8.859640e-07


In [20]:
# Show the top 5 deletion lengths in the pairwise dataset, ranked by the
# summed Deletion_Freq of all deletions sharing that length.
# NOTE(review): sum() is applied to every column, so the position columns in
# the result are sums of coordinates and carry no positional meaning; read
# only Deletion_Count, Deletion_Freq and 'Weighted Deletion Frequency'.
(
    old_deletion_length_data
    .groupby('Deletion_Length')
    .sum()
    .sort_values(by='Deletion_Freq', ascending=False)
    .head(5)
)

Unnamed: 0_level_0,Starting_Nucleotide_Position,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions,Starting_Amino_Acid_Position,Ending_Amino_Acid_Position,Weighted Deletion Frequency
Deletion_Length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
6,57076,57601,731306,0.648033,0,0.0,19058,19244,0.647911
3,169158,169500,387745,0.343593,0,0.0,56437,56569,0.343528
9,41812,42132,5803,0.005142,0,0.0,13952,14058,0.005141
1,368152,368152,1774,0.001572,0,0.0,122797,122797,0.001572
21,15461,15781,876,0.000776,0,0.0,5158,5268,0.000776


In [21]:
# Add the weighted deletion frequency (Metric 2) to the multi dataset:
# each row's Deletion_Count divided by the sum of Deletion_Count over all rows.
deletion_length_data['Weighted Deletion Frequency'] = (
    deletion_length_data['Deletion_Count']
    .div(deletion_length_data['Deletion_Count'].sum())
)

# Display the position, length and new frequency columns.
deletion_length_data[
    ['Starting_Nucleotide_Position', 'Deletion_Length', 'Weighted Deletion Frequency']
]

Unnamed: 0,Starting_Nucleotide_Position,Deletion_Length,Weighted Deletion Frequency
0,7,1,0.000004
1,10,3,0.000004
2,12,1,0.000041
3,13,1,0.000477
4,13,2,0.000004
...,...,...,...
2395,3794,9,0.000007
2396,3801,1,0.000004
2397,3805,1,0.000004
2398,3813,1,0.000004


In [22]:
# Show the top 5 lengths in the multi-alignment data, ranked by the summed
# Deletion_Freq of all deletions sharing that length.
# NOTE(review): sum() is applied to every column, so the position columns and
# the per-row constants (Num_Sequences_With_Deletions, Ratio_With_Deletions)
# are added up as well and are not meaningful in the result; read only
# Deletion_Count, Deletion_Freq and 'Weighted Deletion Frequency'.
(
    deletion_length_data
    .groupby('Deletion_Length')
    .sum()
    .sort_values(by='Deletion_Freq', ascending=False)
    .head(5)
)

Unnamed: 0_level_0,Starting_Nucleotide_Position,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions,Starting_Amino_Acid_Position,Ending_Amino_Acid_Position,Weighted Deletion Frequency
Deletion_Length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
6,202146,202881,207857,0.681297,31812858,104.273683,67442,67674,0.775195
3,289985,290457,49509,0.162277,51073704,167.405369,96748,96906,0.184642
9,97087,97751,3258,0.010679,17962362,58.875617,32396,32610,0.012151
1,1216118,1216118,3249,0.010649,160362774,525.624485,405617,405617,0.012117
21,21702,22222,1599,0.005241,5626764,18.442964,7243,7416,0.005963


In [23]:
# Create the respective histograms for the weighted deletion frequency (Metric 2).
# Rows are summed per deletion length; reset_index() turns the group key back
# into a 'Deletion_Length' column that the chart can encode.
# The log-scaled y axis of the pairwise panel is given the same explicit
# domain as the multi panel. The two charts are shown side by side, and with
# an explicit domain on only one of them the y ranges would differ.
hist_weight_sept = alt.Chart(
    old_deletion_length_data.groupby('Deletion_Length').sum().reset_index()
).mark_bar(color='black').encode(
    x=alt.X(
        'Deletion_Length',
        title='Deletion Length',
        axis=alt.Axis(titleFontSize=14, labelFontSize=12),
        scale=alt.Scale(nice=False),
    ),
    y=alt.Y(
        'Weighted Deletion Frequency',
        title='Log(Deletion Frequency)',
        axis=alt.Axis(titleFontSize=14, labelFontSize=12),
        scale=alt.Scale(type='log', domain=[0.0000001, 1]),
    ),
).properties(
    title={"text": 'Deletion Frequency: UniqueSeqs-Sept', "fontSize": 14},
)

hist_weight_dec = alt.Chart(
    deletion_length_data.groupby('Deletion_Length').sum().reset_index()
).mark_bar(color='black').encode(
    x=alt.X(
        'Deletion_Length',
        title='Deletion Length',
        axis=alt.Axis(titleFontSize=14, labelFontSize=12),
        scale=alt.Scale(nice=False),
    ),
    y=alt.Y(
        'Weighted Deletion Frequency',
        title='Log(Deletion Frequency)',
        axis=alt.Axis(titleFontSize=14, labelFontSize=12),
        scale=alt.Scale(type='log', domain=[0.0000001, 1]),
    ),
).properties(
    title={"text": 'Deletion Frequency: UniqueSeqs-Dec', "fontSize": 14},
)

# Merge the two histograms (horizontal concatenation).
hist_weight_sept | hist_weight_dec

## Metric 2 subsets. (lengths 1-10)

In [24]:
# Restrict both per-deletion tables to rows whose Deletion_Length is <= 10.
old_deletion_subset = old_deletion_length_data.loc[
    old_deletion_length_data['Deletion_Length'].le(10)
]
deletion_subset = deletion_length_data.loc[
    deletion_length_data['Deletion_Length'].le(10)
]

In [25]:
# Generate the two subsetted histograms for Metric 2 (linear y axis, 0 to 1).
# Rows are summed per deletion length before plotting.
hist_weight_sept_10 = (
    alt.Chart(old_deletion_subset.groupby('Deletion_Length').sum().reset_index())
    .mark_bar(color='black')
    .encode(
        x=alt.X(
            'Deletion_Length',
            title='Deletion Length',
            axis=alt.Axis(titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(nice=False),
        ),
        y=alt.Y(
            'Weighted Deletion Frequency',
            title='Deletion Frequency',
            axis=alt.Axis(titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(domain=[0, 1]),
        ),
    )
    .properties(
        title={
            "text": 'Deletion Frequency: UniqueSeqs-Sept',
            "fontSize": 14,
            "subtitle": "Lengths: 1-10",
        }
    )
)

hist_weight_dec_10 = (
    alt.Chart(deletion_subset.groupby('Deletion_Length').sum().reset_index())
    .mark_bar(color='black')
    .encode(
        x=alt.X(
            'Deletion_Length',
            title='Deletion Length',
            axis=alt.Axis(titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(nice=False),
        ),
        y=alt.Y(
            'Weighted Deletion Frequency',
            title='Deletion Frequency',
            axis=alt.Axis(titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(domain=[0, 1]),
        ),
    )
    .properties(
        title={
            "text": 'Deletion Frequency: UniqueSeqs-Dec',
            "fontSize": 14,
            "subtitle": "Lengths: 1-10",
        }
    )
)

# Merge the two histograms side by side.
hist_weight_sept_10 | hist_weight_dec_10

# Proportion of Unique Deletions (Unweighted frequency, Metric 3)

In [26]:
# Get the unweighted (unique deletion) data for the pairwise dataset.
# count() returns, for every column, the number of non-null rows per deletion
# length, so from here on 'Deletion_Count' holds the number of distinct
# deletion rows of each length rather than a number of occurrences.
pairwise_unweighted_data = old_deletion_length_data.groupby('Deletion_Length').count()

# Create the attribute for the unweighted frequency: rows of a given length
# divided by the total number of rows.
pairwise_unweighted_data['Unweighted Frequency'] = (
    pairwise_unweighted_data['Deletion_Count']
    / pairwise_unweighted_data['Deletion_Count'].sum()
)

# Show the top 20 lengths by unique deletion frequency (Metric 3).
pairwise_unweighted_data[['Unweighted Frequency']].sort_values('Unweighted Frequency', ascending=False).head(20)

Unnamed: 0_level_0,Unweighted Frequency
Deletion_Length,Unnamed: 1_level_1
1,0.258495
3,0.207524
6,0.127427
2,0.11165
9,0.048544
12,0.032767
15,0.027913
4,0.024272
18,0.020631
21,0.019417


In [27]:
# Show the updated multi alignment data, which now includes the amino acid
# position columns and 'Weighted Deletion Frequency'.
deletion_length_data

Unnamed: 0,Starting_Nucleotide_Position,Deletion_Length,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions,Starting_Amino_Acid_Position,Ending_Amino_Acid_Position,Weighted Deletion Frequency
0,7,1,7,1,0.000003,216414,0.709345,3,3,0.000004
1,10,3,12,1,0.000003,216414,0.709345,4,4,0.000004
2,12,1,12,11,0.000036,216414,0.709345,4,4,0.000041
3,13,1,13,128,0.000420,216414,0.709345,5,5,0.000477
4,13,2,14,1,0.000003,216414,0.709345,5,5,0.000004
...,...,...,...,...,...,...,...,...,...,...
2395,3794,9,3802,2,0.000007,216414,0.709345,1265,1268,0.000007
2396,3801,1,3801,1,0.000003,216414,0.709345,1267,1267,0.000004
2397,3805,1,3805,1,0.000003,216414,0.709345,1269,1269,0.000004
2398,3813,1,3813,1,0.000003,216414,0.709345,1271,1271,0.000004


In [28]:
# Get the counts of unique deletions in the multi-alignment dataset.
# count() returns, for every column, the number of non-null rows per deletion
# length, so from here on 'Deletion_Count' holds the number of distinct
# deletion rows of each length rather than a number of occurrences.
multi_unweighted_data = deletion_length_data.groupby('Deletion_Length').count()

# Create the unweighted frequency attribute: rows of a given length divided
# by the total number of rows.
multi_unweighted_data['Unweighted Frequency'] = (
    multi_unweighted_data['Deletion_Count']
    / multi_unweighted_data['Deletion_Count'].sum()
)

# Show the top 20 lengths by unique deletions (unweighted frequency, Metric 3).
multi_unweighted_data[['Unweighted Frequency']].sort_values('Unweighted Frequency', ascending=False).head(20)

Unnamed: 0_level_0,Unweighted Frequency
Deletion_Length,Unnamed: 1_level_1
1,0.30875
3,0.098333
2,0.09625
6,0.06125
9,0.034583
12,0.026667
4,0.025
5,0.02
15,0.018333
7,0.016667


In [29]:
# Generate the histogram for Metric 3: pairwise data.
# reset_index() exposes the 'Deletion_Length' group key as a plottable column.
hist_unweight_sept = (
    alt.Chart(pairwise_unweighted_data.reset_index())
    .mark_bar(color='black')
    .encode(
        x=alt.X(
            'Deletion_Length',
            axis=alt.Axis(title='Deletion Length', titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(nice=False),
        ),
        y=alt.Y(
            'Unweighted Frequency',
            title='Deletion Frequency',
            axis=alt.Axis(titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(domain=[0, 0.35]),
        ),
    )
    .properties(
        title={"text": 'Deletion Frequency: UniqueSeqs-Sept', "fontSize": 14}
    )
)

# Generate the histogram for Metric 3: multi data (same axes as above).
hist_unweight_dec = (
    alt.Chart(multi_unweighted_data.reset_index())
    .mark_bar(color='black')
    .encode(
        x=alt.X(
            'Deletion_Length',
            axis=alt.Axis(title='Deletion Length', titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(nice=False),
        ),
        y=alt.Y(
            'Unweighted Frequency',
            title='Deletion Frequency',
            axis=alt.Axis(titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(domain=[0, 0.35]),
        ),
    )
    .properties(
        title={"text": 'Deletion Frequency: UniqueSeqs-Dec', "fontSize": 14}
    )
)

# Merge the two plots side by side.
hist_unweight_sept | hist_unweight_dec

In [30]:
# Subset the unweighted (Metric 3) tables to lengths <= 20.
# reset_index() turns the 'Deletion_Length' group key into a column; the
# callable passed to .loc filters that flattened frame directly.
pairwise_deletion_subset = pairwise_unweighted_data.reset_index().loc[
    lambda frame: frame['Deletion_Length'] <= 20
]
multi_deletion_subset = multi_unweighted_data.reset_index().loc[
    lambda frame: frame['Deletion_Length'] <= 20
]

In [31]:
# Generate the subsetted Metric 3 histograms (x axis fixed to lengths 1-20).
hist_unweight_sept_20 = (
    alt.Chart(pairwise_deletion_subset.reset_index())
    .mark_bar(color='black')
    .encode(
        x=alt.X(
            'Deletion_Length',
            title='Deletion Length',
            axis=alt.Axis(titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(nice=False, domain=[1, 20]),
        ),
        y=alt.Y(
            'Unweighted Frequency',
            title='Deletion Frequency',
            axis=alt.Axis(titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(domain=[0, 0.35]),
        ),
    )
    .properties(
        title={"text": 'Deletion Frequency: UniqueSeqs-Sept', "fontSize": 14}
    )
)

hist_unweight_dec_20 = (
    alt.Chart(multi_deletion_subset.reset_index())
    .mark_bar(color='black')
    .encode(
        x=alt.X(
            'Deletion_Length',
            title='Deletion Length',
            axis=alt.Axis(titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(nice=False, domain=[1, 20]),
        ),
        y=alt.Y(
            'Unweighted Frequency',
            title='Deletion Frequency',
            axis=alt.Axis(titleFontSize=14, labelFontSize=12),
            scale=alt.Scale(domain=[0, 0.35]),
        ),
    )
    .properties(
        title={"text": 'Deletion Frequency: UniqueSeqs-Dec', "fontSize": 14}
    )
)

# Combine all four subplots: full-range pair on top, 1-20 pair underneath.
(hist_unweight_sept | hist_unweight_dec) & (hist_unweight_sept_20 | hist_unweight_dec_20)

In [32]:
# Combine the four plots for metric 1: `|` places charts side by side and `&`
# stacks the two rows, giving the log-scale pair above the lengths 1-10 pair.
(hist_sept|hist_dec) & (hist_sept_10 |hist_dec_10)

In [33]:
# Merge the four plots for metric 2: `|` places charts side by side and `&`
# stacks the two rows, giving the log-scale pair above the lengths 1-10 pair.
(hist_weight_sept | hist_weight_dec) & (hist_weight_sept_10 | hist_weight_dec_10)

# Top 10 frameshift deletions by frequency

In [34]:
# Flag each deletion in the old (pairwise) dataset as frameshift or not,
# based on its length.
old_deletion_length_data['Is_Frameshift'] = (
    old_deletion_length_data['Deletion_Length'].apply(is_frameshift)
)

# Show the share of deletion occurrences that are frameshift in the pairwise
# dataset: read it from the 'Weighted Deletion Frequency' column of the result.
# NOTE(review): sum() is applied to every column, so the summed position and
# length columns in this table are not meaningful.
(
    old_deletion_length_data
    .groupby('Is_Frameshift')
    .sum()
    .reset_index()
)

Unnamed: 0,Is_Frameshift,Starting_Nucleotide_Position,Deletion_Length,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions,Starting_Amino_Acid_Position,Ending_Amino_Acid_Position,Weighted Deletion Frequency
0,False,387816,5091,392465,1126377,0.998117,0,0.0,129407,130998,0.99793
1,True,598404,2331,600353,2337,0.002071,0,0.0,199604,200250,0.00207


In [35]:
# Show the share of unique deletions in the pairwise data that are frameshift.
# count() gives the number of rows in each Is_Frameshift group for every column.
data = old_deletion_length_data.groupby('Is_Frameshift').count()

# Rows in the group divided by the total number of rows.
data['Unweighted Frequency'] = data['Deletion_Count'].div(data['Deletion_Count'].sum())
data

Unnamed: 0_level_0,Starting_Nucleotide_Position,Deletion_Length,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions,Starting_Amino_Acid_Position,Ending_Amino_Acid_Position,Weighted Deletion Frequency,Unweighted Frequency
Is_Frameshift,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
False,442,442,442,442,442,442,442,442,442,442,0.536408
True,382,382,382,382,382,382,382,382,382,382,0.463592


In [36]:
# Show a list of the top 10 deletions that are frameshift by metric 1
# (Deletion_Freq, highest first).
# Is_Frameshift is already a boolean column, so it is used directly as the
# row mask instead of being compared with True.
(
    old_deletion_length_data[old_deletion_length_data['Is_Frameshift']]
    .sort_values('Deletion_Freq', ascending=False)
    .head(10)
)

Unnamed: 0,Starting_Nucleotide_Position,Deletion_Length,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions,Starting_Amino_Acid_Position,Ending_Amino_Acid_Position,Weighted Deletion Frequency,Is_Frameshift
555,1582,1,1582,709,0.000628,0,0.0,528,528,0.000628,True
754,3111,1,3111,394,0.000349,0,0.0,1037,1037,0.000349,True
791,3539,1,3539,228,0.000202,0,0.0,1180,1180,0.000202,True
257,428,2,429,122,0.000108,0,0.0,143,143,0.000108,True
104,201,4,204,77,6.8e-05,0,0.0,67,68,6.8e-05,True
266,430,2,431,73,6.5e-05,0,0.0,144,144,6.5e-05,True
234,420,4,423,68,6e-05,0,0.0,140,141,6e-05,True
68,153,1,153,62,5.5e-05,0,0.0,51,51,5.5e-05,True
256,428,1,428,55,4.9e-05,0,0.0,143,143,4.9e-05,True
568,1669,1,1669,34,3e-05,0,0.0,557,557,3e-05,True


In [37]:
# Flag each deletion in the multi-alignment dataset as frameshift or not,
# based on its length.
deletion_length_data['Is_Frameshift'] = (
    deletion_length_data['Deletion_Length'].apply(is_frameshift)
)

# Show the share of deletion occurrences that are frameshift: read it from
# the 'Weighted Deletion Frequency' column of the result.
# NOTE(review): sum() is applied to every column, so the summed positions,
# lengths, Num_Sequences_With_Deletions and Ratio_With_Deletions in this
# table are not meaningful.
(
    deletion_length_data
    .groupby('Is_Frameshift')
    .sum()
    .reset_index()
)

Unnamed: 0,Is_Frameshift,Starting_Nucleotide_Position,Deletion_Length,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions,Starting_Amino_Acid_Position,Ending_Amino_Acid_Position,Weighted Deletion Frequency
0,False,991560,15357,1006059,263338,0.863149,185683212,608.617824,330833,335652,0.98211
1,True,2355551,14590,2368599,4797,0.015723,333710388,1093.809656,785685,790045,0.01789


In [38]:
# Show the share of unique deletions in the multi-alignment data that are
# frameshift. count() gives the number of rows in each Is_Frameshift group
# for every column.
data = deletion_length_data.groupby('Is_Frameshift').count()

# Rows in the group divided by the total number of rows.
data['Unweighted Frequency'] = data['Deletion_Count'].div(data['Deletion_Count'].sum())
data

Unnamed: 0_level_0,Starting_Nucleotide_Position,Deletion_Length,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions,Starting_Amino_Acid_Position,Ending_Amino_Acid_Position,Weighted Deletion Frequency,Unweighted Frequency
Is_Frameshift,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
False,858,858,858,858,858,858,858,858,858,858,0.3575
True,1542,1542,1542,1542,1542,1542,1542,1542,1542,1542,0.6425


In [39]:
# Show the top 10 frameshift deletions in the multi-alignment dataset by
# Metric 1 (Deletion_Freq, highest first).
# Is_Frameshift is already a boolean column, so it is used directly as the
# row mask instead of being compared with True.
(
    deletion_length_data[deletion_length_data['Is_Frameshift']]
    .sort_values('Deletion_Freq', ascending=False)
    .head(10)
)

Unnamed: 0,Starting_Nucleotide_Position,Deletion_Length,Ending_Nucleotide_Position,Deletion_Count,Deletion_Freq,Num_Sequences_With_Deletions,Ratio_With_Deletions,Starting_Amino_Acid_Position,Ending_Amino_Acid_Position,Weighted Deletion Frequency,Is_Frameshift
385,258,1,258,154,0.000505,216414,0.709345,86,86,0.000574,True
237,204,5,208,150,0.000492,216414,0.709345,68,70,0.000559,True
3,13,1,13,128,0.00042,216414,0.709345,5,5,0.000477,True
767,469,4,472,84,0.000275,216414,0.709345,157,158,0.000313,True
2047,2848,1,2848,69,0.000226,216414,0.709345,950,950,0.000257,True
674,431,2,432,59,0.000193,216414,0.709345,144,144,0.00022,True
673,431,1,431,50,0.000164,216414,0.709345,144,144,0.000186,True
640,425,1,425,48,0.000157,216414,0.709345,142,142,0.000179,True
1221,1059,1,1059,48,0.000157,216414,0.709345,353,353,0.000179,True
775,471,7,477,47,0.000154,216414,0.709345,157,159,0.000175,True
