# Binding affinity and expression from DMS of anti-CGG

This notebook contains code to make interactive heatmaps for binding and expression measured in the deep mutational scanning (DMS) of anti-CGG antibodies as part of a collaboration with the Matsen and Victora labs.

Most of this code was inspired by, and in some cases repurposed from, Sarah Hilton's work found [here](https://github.com/jbloomlab/SARS-CoV-2-RBD_DMS/blob/master/interactive_heatmap.ipynb).

To get this to work I had to use the following environment build:
```
mamba create --name Ab-CGGnaive-docs pandas=1.4.4 altair=4.1 jsonschema=3.2 python=3.8 notebook git-lfs
```

In [19]:
import pandas as pd
import altair as alt

# Remove Altair's default limit of ~5000 rows per chart dataset so the
# full score table can be passed to the charts below
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Process the data

Import and format the data for the interactive `Altair` heat maps. 

In [20]:
# ## == Filepaths == ##

# Input path to the per-mutation scores table (CSV)
# variant_scores_path = "../data/dms/final_variant_scores.csv"
mutations_data = "_ignore/data.csv"
# Directory that the mutation-rate heatmap HTML is written into
outbase = "_ignore"

# Output filename for the binding and expression HTML
# NOTE(review): this one is saved as-is (relative to the working directory),
# not under `outbase` -- confirm that is intended
output_mut_phenotype_html = "mut_pheno_heatmap.html"

# Output filename (joined onto `outbase`) for the mutation rate, enrichment, and abundance HTML
output_mut_html = "mut_heatmap.html"

In [21]:
# Raw per-mutation table, loaded unmodified for a first look at its columns
scores_df = pd.read_csv(mutations_data)

# Preview the first few rows
scores_df.head()

Unnamed: 0,mutation,target,wildtype,position,position_IMGT,chain,annotation,mutant,codon,single_nt,...,mutation events (10-week),mutation abundance (10-week),mutation events (LMP2A),mutation abundance (LMP2A),mutation events,mutation abundance,mutation rate,mutation enrichment,affinity change stratum,distance to antigen
0,E1(H)A,CGG_naive,E,1,1,H,FWRH1,A,GAG,True,...,0,0,0,0,6,6,6e-06,1.003672,affinity neutral,9.796122
1,E1(H)C,CGG_naive,E,1,1,H,FWRH1,C,GAG,False,...,0,0,0,0,0,0,,,affinity neutral,9.796122
2,E1(H)D,CGG_naive,E,1,1,H,FWRH1,D,GAG,True,...,5,8,0,0,10,57,0.000725,0.479223,affinity neutral,9.796122
3,E1(H)E,CGG_naive,E,1,1,H,FWRH1,E,GAG,True,...,0,0,0,0,0,0,,,affinity neutral,9.796122
4,E1(H)F,CGG_naive,E,1,1,H,FWRH1,F,GAG,False,...,0,0,0,0,0,0,,,affinity neutral,9.796122


In [22]:
# Summarize the column names, non-null counts, and dtypes of the raw table
scores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4400 entries, 0 to 4399
Data columns (total 34 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   mutation                      4400 non-null   object 
 1   target                        4400 non-null   object 
 2   wildtype                      4400 non-null   object 
 3   position                      4400 non-null   int64  
 4   position_IMGT                 4400 non-null   int64  
 5   chain                         4400 non-null   object 
 6   annotation                    4400 non-null   object 
 7   mutant                        4400 non-null   object 
 8   codon                         4400 non-null   object 
 9   single_nt                     4400 non-null   bool   
 10  bind_CGG                      4397 non-null   float64
 11  delta_bind_CGG                4397 non-null   float64
 12  n_bc_bind_CGG                 4400 non-null   int64  
 13  n_l

In [23]:
# Columns the heatmaps and tooltips need; every other column is dropped
# to shrink the size of the altair plot
heatmap_columns = [
    'target', 'wildtype', 'site', 'mutant',
    'Expression', 'Binding',
    'Abundance', 'Rate', 'Enrichment',
    'n_bc_expr', 'n_bc_bind_CGG',
    'chain', 'annotation', 'mutation', 'position_IMGT',
]

# Raw column name -> short name used in the selection menus and plot titles
display_names = {
    "position": "site",
    "delta_expr": "Expression",
    "delta_bind_CGG": "Binding",
    "mutation abundance": "Abundance",
    "mutation rate": "Rate",
    "mutation enrichment": "Enrichment",
}

# Reload the table, swap the existing `site` column for `position`, rename
# the metrics, then drop whatever is not listed in `heatmap_columns`
scores_df = (
    pd.read_csv(mutations_data)
    .drop(columns='site')
    .rename(columns=display_names)
    .pipe(lambda frame: frame.drop(columns=frame.columns.difference(heatmap_columns)))
)
scores_df

Unnamed: 0,mutation,target,wildtype,site,position_IMGT,chain,annotation,mutant,Binding,n_bc_bind_CGG,Expression,n_bc_expr,Abundance,Rate,Enrichment
0,E1(H)A,CGG_naive,E,1,1,H,FWRH1,A,0.11933,20,-0.00246,20,6,0.000006,1.003672
1,E1(H)C,CGG_naive,E,1,1,H,FWRH1,C,-0.01492,22,-0.12216,22,0,,
2,E1(H)D,CGG_naive,E,1,1,H,FWRH1,D,0.01249,18,0.04874,18,57,0.000725,0.479223
3,E1(H)E,CGG_naive,E,1,1,H,FWRH1,E,0.00000,23285,0.00000,23285,0,,
4,E1(H)F,CGG_naive,E,1,1,H,FWRH1,F,0.13433,29,-0.11379,29,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4395,R128(L)S,CGG_naive,R,235,128,L,FWRL4,S,0.01095,45,0.07344,45,0,,
4396,R128(L)T,CGG_naive,R,235,128,L,FWRL4,T,0.02081,41,0.08647,41,0,,
4397,R128(L)V,CGG_naive,R,235,128,L,FWRL4,V,-0.03418,28,0.04073,28,0,,
4398,R128(L)W,CGG_naive,R,235,128,L,FWRL4,W,0.04423,36,-0.09192,36,0,,


In [24]:
# Mark the cells where the mutant residue is the wildtype residue.
# Compared column-wise rather than with a row-wise `apply` using positional
# `x[0]` / `x[1]` lookups, which is slow and deprecated in newer pandas.
is_wildtype = scores_df['wildtype'] == scores_df['mutant']

# IMGT position as text; missing positions are labelled "link"
imgt_label = scores_df['position_IMGT'].map(lambda pos: "link" if pd.isna(pos) else str(int(pos)))
site_label = scores_df['site'].astype(str)

# Build every derived column in one `assign` so re-running this cell
# always yields the same frame
scores_df = scores_df.assign(
    # cast the Abundance column to float
    Abundance=scores_df['Abundance'].astype(float),
    # Set a character, `x`, to appear in the wildtype sites
    wildtype_code=is_wildtype.map({True: 'x', False: ''}),
    IMGT_Site_tmp=imgt_label,
    # x-axis labels: "<site> (<IMGT position>)" and "<site> (<chain>)"
    IMGT_Site=site_label + " (" + imgt_label + ")",
    Chain_Site=site_label + " (" + scores_df['chain'].astype(str) + ")",
)

## Define plot-wide parameters

Define the parameters that carry through to multiple plots for easy adjustment. 

In [25]:
# Width of the zoom bar
width = 1000

# Height of the heatmap
height = 300

# How to space the sites on the x axis on the zoom bar:
# a tick every `x_axis_spacing` sites starting at the first site
min_site = int(scores_df['site'].min())
max_site = int(scores_df['site'].max())
x_axis_spacing = 5

# Order of the amino acids on the y-axis
aa_order = ['R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y',
            'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C', '*']

# Tooltips and corresponding names with formatting
heatmap_tooltips = [
    alt.Tooltip('target:N', title="Variant"),
    alt.Tooltip('mutation:N', title="Mutation"),
    alt.Tooltip('Expression:Q', title="Change in Expression", format=".2f"),
    alt.Tooltip('Binding:Q', title="Change in CGG Binding Affinity", format=".2f"),
    alt.Tooltip('Abundance:Q', title="Abundance", format=".0f"),
    # Rates in this table are on the order of 1e-6 to 1e-3, so a fixed
    # two-decimal format would show every value as "0.00"
    alt.Tooltip('Rate:Q', title="Rate", format=".2e"),
    alt.Tooltip('Enrichment:Q', title="Enrichment", format=".2f"),
    alt.Tooltip('annotation:N', title="Annotation"),
    alt.Tooltip('chain:N', title="Chain")
]



## Define selections for plots 

Define the selection objects that control the interactions. Many of these are shared between plots and datasets, so it's helpful to define them at the top.

In [26]:
# Interval brush along x, drawn with a black outline; dragging it on the
# zoom bar restricts the sites shown in the heatmaps
zoom_brush_style = alt.BrushConfig(stroke='black', strokeWidth=2)
zoom_selection = alt.selection_interval(encodings=['x'], mark=zoom_brush_style)

# Single-cell selection that follows the mouse; nothing is selected until
# the pointer is over a cell
amino_acid_selection = alt.selection_single(
    encodings=['x', 'y'],
    on='mouseover',
    empty='none',
)

# Drop-down that chooses which x-axis annotation is displayed:
# site with IMGT position, or site with chain name
metric_dropdown = alt.binding_select(
    options=['IMGT_Site', 'Chain_Site'],
    labels=['IMGT Position', 'Chain Name'],
    name="Select x-axis annotation: ",
)
metric_selection = alt.selection_single(
    fields=['metric'],
    bind=metric_dropdown,
    init={'metric': 'IMGT_Site'},
)



## Define the plot objects

The final plots are each composed of a zoom bar stacked on top of several **heatmaps** of the anti-CGG DMS data: one plot shows binding and expression, and the other shows mutation abundance, rate, and enrichment.

### Zoom Bar

In [27]:
## == Zoom bar for the heatmap plot == ##
# One row per site, and an axis tick every `x_axis_spacing` sites
zoom_sites = scores_df[['site']].drop_duplicates()
zoom_ticks = list(range(min_site, max_site, x_axis_spacing))

# Thin gray strip spanning all sites; brushing it drives `zoom_selection`
zoom_bar = (
    alt.Chart(zoom_sites)
    .mark_rect(color='lightgray')
    .encode(
        x=alt.X('site:O', title=None, axis=alt.Axis(values=zoom_ticks))
    )
    .add_selection(zoom_selection)
    .properties(width=width, height=15, title="site zoom bar")
)

zoom_bar


### Heatmaps

In [28]:
## == Heatmaps plots with annotations == ## 
def heatmap(data, metric, reverse_scale=False, title_prefix=""):
    """
    Build one interactive site-by-amino-acid heatmap colored by `metric`.

    The result is a layered chart of three marks sharing the same x/y
    encoding: cells colored by `metric`, half-transparent gray cells where
    `metric` is not valid (missing), and a text mark showing the
    `wildtype_code` column (an `x` on wildtype residues).

    Parameters
    ----------
    data
        Dataset handed to `alt.Chart`. It must provide the columns
        `IMGT_Site`, `Chain_Site`, `site`, `mutant`, `wildtype_code`,
        the `metric` column, and the fields named in `heatmap_tooltips`.
    metric : str
        Name of the column used to color the cells; also used in the title.
    reverse_scale : bool, optional
        Passed to the color scale's `reverse` option.
    title_prefix : str, optional
        Text placed in front of `metric` in the chart title.

    Returns
    -------
    The layered Altair chart, with the cell-highlight, zoom, and x-axis
    annotation selections attached and the zoom filter applied.

    Notes
    -----
    Relies on the notebook-level names `aa_order`, `heatmap_tooltips`,
    `height`, `amino_acid_selection`, `zoom_selection`, and
    `metric_selection`.
    """

    # Define the input dataset once in the base plot.
    # The fold turns the two label columns into (metric, measurement) pairs so
    # that the drop-down bound to `metric_selection` can pick which label
    # column is drawn on the x-axis.
    heatmapbase = (
        alt.Chart(data)
        .transform_fold(
            ['IMGT_Site', 'Chain_Site'],
            as_=['metric', 'measurement']
        )
        .encode(
            x=alt.X(
                'measurement:O',
                axis=alt.Axis(titleFontSize=15),
                # labels are text, so order them by the numeric site
                sort=alt.EncodingSortField(field="site", order ='ascending')
            ),
            y=alt.Y(
                'mutant:O',
                sort=aa_order,
                axis=alt.Axis(
                    labelFontSize=12,
                    titleFontSize=15
                )
            )
        )
        .transform_filter(
            metric_selection
        )
    )

    # Color each cell by the chosen metric on a diverging scale centered at 0;
    # the cell under the mouse gets a black outline
    coloring = heatmapbase.mark_rect(
    ).encode(
        color= alt.Color(f'{metric}:Q',
                         type='quantitative',
                         scale=alt.Scale(scheme="redblue",
                                         reverse=reverse_scale,
                                         domainMid=0, 
                                         clamp=True
                                        ),
                           legend=alt.Legend(orient='left',
                                             title='grey is n.d.',
                                             gradientLength=100)),
        stroke=alt.value('black'),
        strokeWidth=alt.condition(amino_acid_selection,
                                  alt.value(2),
                                  alt.value(0)),
        tooltip=heatmap_tooltips
    )

    # Add a black 'x' to the wildtype amino acids
    wildtype = heatmapbase.mark_text(
        color='black'
    ).encode(
        text=alt.Text('wildtype_code:N')
    )

    # Color the empty measurements gray.
    # Bracket access keeps the filter expression valid for column names
    # that are not plain identifiers (e.g. names containing spaces).
    nulls = heatmapbase.transform_filter(
        f"!isValid(datum['{metric}'])"
    ).mark_rect(
        opacity=0.5
    ).encode(
        alt.Color(f"{metric}:N",
                  scale=alt.Scale(scheme='greys'),
                  legend=None)
    ) 

    # Return the final heatmap along with annotations
    return (coloring + nulls + wildtype 
    ).interactive(
    ).add_selection(
        amino_acid_selection,
        zoom_selection,
        metric_selection
    ).transform_filter(
        zoom_selection
    ).properties(height=height,
                 title=alt.TitleParams(text = f"{title_prefix}{metric}", 
                                       anchor='start',
                                       dx=120)
    )



In [29]:
# One heatmap per metric. Binding is the only one with a title prefix.
binding_heatmap = heatmap(scores_df, "Binding", title_prefix="anti-CGG ")

# The remaining metrics share the default arguments; built in the same order as before
expression_heatmap, abundance_heatmap, rate_heatmap, enrichment_heatmap = (
    heatmap(scores_df, metric_name)
    for metric_name in ("Expression", "Abundance", "Rate", "Enrichment")
)

In [30]:
# Stack the zoom bar on top of the binding and expression heatmaps
final_plot = alt.vconcat(zoom_bar, binding_heatmap, expression_heatmap)

final_plot

In [31]:
# Write the binding/expression plot to HTML.
# NOTE(review): this path is not prefixed with `outbase`, unlike the save of
# the mutation heatmap at the end of the notebook -- confirm that is intended
final_plot.save(output_mut_phenotype_html)

In [32]:
# Stack the zoom bar on top of the abundance, rate, and enrichment heatmaps
# (reuses the name `final_plot`, replacing the binding/expression plot)
final_plot = alt.vconcat(zoom_bar, abundance_heatmap, rate_heatmap, enrichment_heatmap)
final_plot

In [33]:
# Write the abundance/rate/enrichment plot to HTML inside the `outbase` directory
final_plot.save(f"{outbase}/{output_mut_html}")