# Analyze antibody escape over time

## Import Python modules

In [1]:
import os
import glob
import pandas as pd

## Read in data that predicts how much each virus has escaped each antibody

In [2]:
# Make a list of input files, with one file per antibody
ab_escape_files = glob.glob('data/ncov_escape_scores/*.csv')

# Read in data from each file and store data in a dataframe
antibodies = []
for (i, f) in enumerate(ab_escape_files):
    df = pd.read_csv(f)
    ab = os.path.basename(f).replace('_polclonal_escape_prediction.csv', '').replace('_escape_score_variant_escape_prediction.csv', '')
    if 'additive' in df.columns.values:
        escape_score_col = f'{ab}_escape_score'
        df.rename(columns={'additive' : escape_score_col}, inplace=True)
    else:
        escape_score_col = f'{ab}_IC90_log_fold_change'

    if i == 0:
        escape_df = df[['strain', 'aa_substitutions', escape_score_col]]
    else:
        escape_df = escape_df.merge(
            df[['strain', escape_score_col]],
            on='strain'
        )
    antibodies.append(escape_score_col)

# Add a column giving the date of each strain
metadata_df = pd.read_csv(
    'data/ncov_escape_scores/metadata_with_index.tsv',
    sep='\t', on_bad_lines='skip'
    )

escape_df = escape_df.merge(
    metadata_df[['strain', 'date']], on='strain'
)
escape_df['date'] = pd.to_datetime(escape_df['date'])
escape_df['time'] = (escape_df['date'] - escape_df['date'].min()).dt.days / 365
escape_df.head()

Unnamed: 0,strain,aa_substitutions,C68.61-BA2_escape_score,LCB1_v2.2_stringent-WH1_escape_score,CC67.105_IC90_log_fold_change,NTD_5-7_IC90_log_fold_change,CC9.104_IC90_log_fold_change,C68.3-BA1_escape_score,LY-CoV1404-BA1_escape_score,C68.3-BA2_escape_score,C68.59_IC90_log_fold_change,date,time
0,Wuhan-Hu-1/2019,D339G F371S P373S F375S A376T N405D S408R N417...,0.103019,0.0,0.683449,-2.203219,1.203864,0.11961,0.137017,0.072882,1.010647,2019-12-26,0.0
1,NPL/61-TW/2020,D339G F371S P373S F375S A376T N405D S408R N417...,0.103019,0.0,0.683449,-2.203219,1.203864,0.11961,0.137017,0.072882,1.010647,2020-01-13,0.049315
2,Mesocricetusauratus/HKG/13_P2/2020,D339G F371S P373S F375S A376T N405D S408R N417...,0.103019,0.0,0.683449,-2.203219,1.203864,0.11961,0.137017,0.072882,1.010647,2020-05-15,0.386301
3,CHN/Hefei-362/2020,D339G F371S P373S F375S A376T N405D S408R N417...,0.103019,0.0,0.683449,-2.203219,1.203864,0.11961,0.137017,0.072882,1.010647,2020-02-09,0.123288
4,USA/WI-CDC-03041142-001/2020,D339G F371S P373S F375S A376T N405D S408R N417...,0.103019,0.0,0.683449,-2.203219,1.203864,0.11961,0.137017,0.072882,1.010647,2020-01-31,0.09863


## Analyze escape over time

Define our sliding windows

In [61]:
# Define parameters for windows
start_time = 0
end_time = 3
window_width = 0.1
window_step_size = 0.05

# To start, we set the left side of our window at a value
# of zero
left_side_of_window = 0

# We're going to define all windows one at a time
# based on the step size and width
list_of_left_sides_of_windows = []
while left_side_of_window <= end_time:
    
    # Append the window value to the list
    list_of_left_sides_of_windows.append(left_side_of_window)
    #print(left_side_of_window, left_side_of_window+window_width)

    # Slide the window to the right by one step
    left_side_of_window = left_side_of_window + window_step_size

For each window, get the viruses in that window and then compute the average escape for a given antibody.

In [62]:
# Make a dictionary for keeping track of averages in each window

# Iterate over each window and record data
for left_side_of_window in list_of_left_sides_of_windows:
    right_side_of_window = left_side_of_window + window_width
    
    # Get data for viruses within a given window
    data = escape_df[
        escape_df['time'].between(left_side_of_window, right_side_of_window)
    ]

    # Loop over antibodies, and compute the average amount that
    # viruses have escaped each antibody
    for antibody in antibodies:
        print(left_side_of_window, antibody, escape_df[antibody].mean())

0 CC9.104_IC90_log_fold_change 0.40114382190984277
0 NTD_5-7_IC90_log_fold_change -1.3937601224162244
0 C68.61-BA2_escape_score 0.0561780331574981
0 C68.3-BA2_escape_score 0.08927315938206479
0 LCB1_v2.2_stringent-WH1_escape_score 0.01211854547852294
0 C68.59_IC90_log_fold_change 0.5454379835207175
0 LY-CoV1404-BA1_escape_score 0.5266998685003768
0 C68.3-BA1_escape_score 0.2375919928409947
0 CC67.105_IC90_log_fold_change 0.6284802757760208
0.05 CC9.104_IC90_log_fold_change 0.40114382190984277
0.05 NTD_5-7_IC90_log_fold_change -1.3937601224162244
0.05 C68.61-BA2_escape_score 0.0561780331574981
0.05 C68.3-BA2_escape_score 0.08927315938206479
0.05 LCB1_v2.2_stringent-WH1_escape_score 0.01211854547852294
0.05 C68.59_IC90_log_fold_change 0.5454379835207175
0.05 LY-CoV1404-BA1_escape_score 0.5266998685003768
0.05 C68.3-BA1_escape_score 0.2375919928409947
0.05 CC67.105_IC90_log_fold_change 0.6284802757760208
0.1 CC9.104_IC90_log_fold_change 0.40114382190984277
0.1 NTD_5-7_IC90_log_fold_change

Here's the next task:
* the above cell of code, has a for loop that loops over windows, and in each window, it loop over antibodies
* for each antibody in each window, we'd like to record the mean escape, which is currently being printed above. We'd like to store these means in a dictionary or dataframe
* finally, we'd like to plot these means, making the kinds of plots that are on the board
* we should analyze the antibodies in two groups
    * one group is the group with the suffix `_IC90_log_fold_change`
    * the other group is the group with the suffix `_escape_score`