# Analyze antibody escape over time

## Import Python modules

In [27]:
import os
import glob
import pandas as pd

## Read in data that predicts how much each virus has escaped each antibody

In [40]:
# Make a list of input files, with one file per antibody. glob returns
# paths in arbitrary order, so sort them to make the column order of
# escape_df (and which file supplies 'aa_substitutions') reproducible.
ab_escape_files = sorted(glob.glob('data/ncov_escape_scores/*.csv'))
if not ab_escape_files:
    raise FileNotFoundError(
        "No antibody escape files found matching "
        "'data/ncov_escape_scores/*.csv'"
    )

# Read in data from each file and combine the per-antibody score columns
# into a single dataframe keyed on 'strain'
escape_df = None
for f in ab_escape_files:
    df = pd.read_csv(f)

    # The antibody name is the file name with the known suffix stripped off
    ab = (
        os.path.basename(f)
        .replace('_polclonal_escape_prediction.csv', '')
        .replace('_escape_score_variant_escape_prediction.csv', '')
    )

    # Files with an 'additive' column get that column renamed to
    # '<ab>_escape_score'; any other file is expected to already contain
    # a '<ab>_IC90_log_fold_change' column
    if 'additive' in df.columns.values:
        escape_score_col = f'{ab}_escape_score'
        df.rename(columns={'additive' : escape_score_col}, inplace=True)
    else:
        escape_score_col = f'{ab}_IC90_log_fold_change'
        if escape_score_col not in df.columns:
            raise KeyError(
                f"{f} has neither an 'additive' column nor a "
                f"'{escape_score_col}' column"
            )

    # The first file also contributes 'aa_substitutions'; later files only
    # add their score column. merge defaults to an inner join, so only
    # strains present in every file are kept.
    if escape_df is None:
        escape_df = df[['strain', 'aa_substitutions', escape_score_col]]
    else:
        escape_df = escape_df.merge(
            df[['strain', escape_score_col]],
            on='strain'
        )

# Look up the date of each strain in the metadata table and attach it
metadata_path = 'data/ncov_escape_scores/metadata_with_index.tsv'
metadata_df = pd.read_csv(metadata_path, sep='\t', on_bad_lines='skip')

strain_dates = metadata_df[['strain', 'date']]
escape_df = escape_df.merge(strain_dates, on='strain')

# Parse the dates, then express each one as the number of days since the
# earliest date in the merged table, divided by 365 (approximate years)
escape_df['date'] = pd.to_datetime(escape_df['date'])
earliest_date = escape_df['date'].min()
escape_df['time'] = (escape_df['date'] - earliest_date).dt.days / 365
escape_df.head()

Unnamed: 0,strain,aa_substitutions,CC9.104_IC90_log_fold_change,NTD_5-7_IC90_log_fold_change,C68.61-BA2_escape_score,C68.3-BA2_escape_score,LCB1_v2.2_stringent-WH1_escape_score,C68.59_IC90_log_fold_change,LY-CoV1404-BA1_escape_score,C68.3-BA1_escape_score,CC67.105_IC90_log_fold_change,date,time
0,Wuhan-Hu-1/2019,V67A I95T I212L D339G L371S P373S F375S N417K ...,1.203864,-2.203219,0.103019,0.072882,0.0,1.010647,0.137017,0.11961,0.683449,2019-12-26,0.0
1,NPL/61-TW/2020,V67A I95T I212L D339G L371S P373S F375S N417K ...,1.203864,-2.203219,0.103019,0.072882,0.0,1.010647,0.137017,0.11961,0.683449,2020-01-13,0.049315
2,Mesocricetusauratus/HKG/13_P2/2020,V67A I95T I212L D339G L371S P373S F375S N417K ...,1.203864,-2.203219,0.103019,0.072882,0.0,1.010647,0.137017,0.11961,0.683449,2020-05-15,0.386301
3,CHN/Hefei-362/2020,V67A I95T I212L D339G L371S P373S F375S N417K ...,1.203864,-2.203219,0.103019,0.072882,0.0,1.010647,0.137017,0.11961,0.683449,2020-02-09,0.123288
4,USA/WI-CDC-03041142-001/2020,V67A I95T I212L D339G L371S P373S F375S N417K ...,1.203864,-2.203219,0.103019,0.072882,0.0,1.010647,0.137017,0.11961,0.683449,2020-01-31,0.09863


## Analyze escape over time

Define our sliding windows

In [10]:
# Define parameters for windows
# (presumably in the same units as escape_df['time'], i.e. years -- confirm)
start_time = 0
end_time = 3
window_width = 0.1
window_step_size = 0.05

# Build the left edge of every window, starting at start_time and stopping
# before end_time. Each edge is computed directly from its index
# (start_time + index * step) rather than by repeatedly adding the step
# size, so floating-point rounding error does not accumulate from one
# window to the next and cannot change how many windows are produced.
list_of_left_sides_of_windows = []
window_index = 0
left_side_of_window = start_time
while left_side_of_window < end_time:

    # Record this window's left edge
    list_of_left_sides_of_windows.append(left_side_of_window)

    # Slide the window to the right by one step
    window_index += 1
    left_side_of_window = start_time + window_index * window_step_size

For each window, get the viruses in that window and then compute the average escape for a given antibody.