In [1]:
import pandas as pd
import numpy as np

from evcouplings.couplings import CouplingsModel


In [2]:
# Read the raw GB1 zero-shot score table; the first CSV column is used as the row index.

df = pd.read_csv(
    '../data/GB1/raw/GB1_zero_raw.csv',
    index_col=0,
)
df

Unnamed: 0,mutant,esm1v_t33_650M_UR90S,esm2_t36_3B_UR50D,EVmutation,EVE,esm_msa1b_t12_100M_UR50S_0,masks,esm1v_t33_650M_UR90S_1,esm1v_t33_650M_UR90S_2,esm1v_t33_650M_UR90S_3,esm1v_t33_650M_UR90S_4,esm1v_t33_650M_UR90S_5,esm_msa1b_t12_100M_UR50S_1,esm_msa1b_t12_100M_UR50S_2,esm_msa1b_t12_100M_UR50S_3,esm_msa1b_t12_100M_UR50S_4
0,VA39A;DA40A;GA41A;VA54A,-0.702931,-3.679725,-20.255317,-10.968353,-32.661022,15,-1.218270,-0.529401,-0.369648,-0.085206,-1.312130,-33.504288,-31.480907,-32.526978,-33.906433
1,VA39A;DA40A;GA41A;VA54C,-2.995390,-5.215294,-24.946040,-14.572647,-36.303730,15,-3.258709,-2.420268,-3.183101,-2.210224,-3.904644,-37.479843,-35.532173,-37.244690,-37.118729
2,VA39A;DA40A;GA41A;VA54D,-2.667509,-6.250734,-24.946040,-14.761597,-36.584679,15,-3.544477,-2.253075,-2.512638,-1.871142,-3.156216,-37.615067,-35.653019,-37.941002,-37.802586
3,VA39A;DA40A;GA41A;VA54E,-1.090491,-4.709267,-24.946040,-14.901718,-36.699921,15,-2.058664,-0.479224,-0.765473,-0.358916,-1.790181,-37.763447,-35.749214,-37.749660,-37.958874
4,VA39A;DA40A;GA41A;VA54F,-1.297809,-3.884186,-24.946040,-14.597061,-33.627846,15,-1.227628,-1.178723,-1.417058,-0.938432,-1.727203,-35.036293,-32.997684,-34.796497,-34.551331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159995,VA39Y;DA40Y;GA41Y;VA54S,-3.929908,-5.568047,-28.655261,-16.066269,-42.594849,15,-4.923866,-2.911441,-4.960345,-3.061902,-3.791989,-43.626251,-41.584190,-42.472687,-43.905792
159996,VA39Y;DA40Y;GA41Y;VA54T,-3.704289,-5.450732,-28.655261,-15.859589,-41.786472,15,-4.613888,-2.945370,-4.664041,-2.877244,-3.420901,-43.357597,-40.793976,-41.822872,-43.178368
159997,VA39Y;DA40Y;GA41Y;VA54V,-2.232809,-3.606445,-20.736168,-11.817322,-30.977379,14,-2.815190,-1.584976,-2.742856,-2.000220,-2.020805,-32.642883,-30.365374,-31.140972,-32.002850
159998,VA39Y;DA40Y;GA41Y;VA54W,-4.716158,-5.576900,-28.655261,-15.951630,-44.639832,15,-5.339731,-4.014680,-6.114935,-4.015084,-4.096360,-46.177338,-44.254654,-45.468613,-45.852402


## ESM computation

```
python modify/esm.py  \
    --model-location esm1v_t33_650M_UR90S_1 esm1v_t33_650M_UR90S_2 esm1v_t33_650M_UR90S_3 esm1v_t33_650M_UR90S_4 esm1v_t33_650M_UR90S_5 esm2_t36_3B_UR50D \
    --sequence MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE \
    --mutation-col mutant   \
    --dms-output data/GB1/raw/GB1_zero_raw.csv  \
    --scoring-strategy masked-marginals \
    --protein GB1
```

In [3]:
# Average the five ESM-1v checkpoint scores (columns suffixed _1 ... _5) and store
# the result in the un-suffixed ESM-1v column.

df = pd.read_csv('../data/GB1/raw/GB1_zero_raw.csv', index_col=0)
df['esm1v_t33_650M_UR90S'] = (
    df['esm1v_t33_650M_UR90S_1']
    .add(df['esm1v_t33_650M_UR90S_2'])
    .add(df['esm1v_t33_650M_UR90S_3'])
    .add(df['esm1v_t33_650M_UR90S_4'])
    .add(df['esm1v_t33_650M_UR90S_5'])
    .div(5)
)
df

Unnamed: 0,mutant,esm1v_t33_650M_UR90S,esm2_t36_3B_UR50D,EVmutation,EVE,esm_msa1b_t12_100M_UR50S_0,masks,esm1v_t33_650M_UR90S_1,esm1v_t33_650M_UR90S_2,esm1v_t33_650M_UR90S_3,esm1v_t33_650M_UR90S_4,esm1v_t33_650M_UR90S_5,esm_msa1b_t12_100M_UR50S_1,esm_msa1b_t12_100M_UR50S_2,esm_msa1b_t12_100M_UR50S_3,esm_msa1b_t12_100M_UR50S_4
0,VA39A;DA40A;GA41A;VA54A,-0.702931,-3.679725,-20.255317,-10.968353,-32.661022,15,-1.218270,-0.529401,-0.369648,-0.085206,-1.312130,-33.504288,-31.480907,-32.526978,-33.906433
1,VA39A;DA40A;GA41A;VA54C,-2.995389,-5.215294,-24.946040,-14.572647,-36.303730,15,-3.258709,-2.420268,-3.183101,-2.210224,-3.904644,-37.479843,-35.532173,-37.244690,-37.118729
2,VA39A;DA40A;GA41A;VA54D,-2.667509,-6.250734,-24.946040,-14.761597,-36.584679,15,-3.544477,-2.253075,-2.512638,-1.871142,-3.156216,-37.615067,-35.653019,-37.941002,-37.802586
3,VA39A;DA40A;GA41A;VA54E,-1.090492,-4.709267,-24.946040,-14.901718,-36.699921,15,-2.058664,-0.479224,-0.765473,-0.358916,-1.790181,-37.763447,-35.749214,-37.749660,-37.958874
4,VA39A;DA40A;GA41A;VA54F,-1.297809,-3.884186,-24.946040,-14.597061,-33.627846,15,-1.227628,-1.178723,-1.417058,-0.938432,-1.727203,-35.036293,-32.997684,-34.796497,-34.551331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159995,VA39Y;DA40Y;GA41Y;VA54S,-3.929908,-5.568047,-28.655261,-16.066269,-42.594849,15,-4.923866,-2.911441,-4.960345,-3.061902,-3.791989,-43.626251,-41.584190,-42.472687,-43.905792
159996,VA39Y;DA40Y;GA41Y;VA54T,-3.704289,-5.450732,-28.655261,-15.859589,-41.786472,15,-4.613888,-2.945370,-4.664041,-2.877244,-3.420901,-43.357597,-40.793976,-41.822872,-43.178368
159997,VA39Y;DA40Y;GA41Y;VA54V,-2.232809,-3.606445,-20.736168,-11.817322,-30.977379,14,-2.815190,-1.584976,-2.742856,-2.000220,-2.020805,-32.642883,-30.365374,-31.140972,-32.002850
159998,VA39Y;DA40Y;GA41Y;VA54W,-4.716158,-5.576900,-28.655261,-15.951630,-44.639832,15,-5.339731,-4.014680,-6.114935,-4.015084,-4.096360,-46.177338,-44.254654,-45.468613,-45.852402


## EVcouplings computation

The MSA and the EVmutation model were generated using the EVcouplings server (https://v2.evcouplings.org/). The EVmutation model can also be computed locally using the EVcouplings GitHub repository (https://github.com/debbiemarkslab/EVcouplings) with the specified configs. Here, we provide a pre-computed EVmutation model of GB1.

In [4]:
# Compute EVcouplings

# Load the pre-computed model file for GB1 from the raw data directory.
c = CouplingsModel(
    "../data/GB1/raw/gb1_wt_b0.5.model"
)

def get_evcouplings(df, model=None):
    """Add an ``EVmutation`` column holding the model's delta-Hamiltonian score per mutant.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``mutant`` column. Each entry is a ';'-separated list of
        substitutions written as ``<wt><x><position><mut>`` (e.g. ``'VA39A;DA40A'``),
        where the wild-type residue, the second character (presumably a chain
        identifier -- it is skipped) and the mutant residue are one character each.
    model : object, optional
        Anything exposing ``delta_hamiltonian(substitutions)`` where
        ``substitutions`` is a list of ``(position, wt, mut)`` tuples. When omitted,
        the notebook-level model ``c`` is used, so existing one-argument calls
        behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; the ``EVmutation`` column is written in place.
    """
    if model is None:
        # Fall back to the model loaded at notebook level.
        model = c

    def parse(x):
        # 'VA39A' -> (39, 'V', 'A'); the character at index 1 is ignored.
        return (int(x[2:-1]), x[0], x[-1])

    def score(mutant):
        substitutions = [parse(m) for m in mutant.split(';')]
        # Only element 0 of the returned sequence is kept
        # (NOTE(review): expected to be the total delta -- confirm against EVcouplings docs).
        return model.delta_hamiltonian(substitutions)[0]

    df['EVmutation'] = df.mutant.apply(score)
    return df

# Score every mutant; the helper writes the 'EVmutation' column into df and returns that same frame.
df = get_evcouplings(df)
df

Unnamed: 0,mutant,esm1v_t33_650M_UR90S,esm2_t36_3B_UR50D,EVmutation,EVE,esm_msa1b_t12_100M_UR50S_0,masks,esm1v_t33_650M_UR90S_1,esm1v_t33_650M_UR90S_2,esm1v_t33_650M_UR90S_3,esm1v_t33_650M_UR90S_4,esm1v_t33_650M_UR90S_5,esm_msa1b_t12_100M_UR50S_1,esm_msa1b_t12_100M_UR50S_2,esm_msa1b_t12_100M_UR50S_3,esm_msa1b_t12_100M_UR50S_4
0,VA39A;DA40A;GA41A;VA54A,-0.702931,-3.679725,-20.255317,-10.968353,-32.661022,15,-1.218270,-0.529401,-0.369648,-0.085206,-1.312130,-33.504288,-31.480907,-32.526978,-33.906433
1,VA39A;DA40A;GA41A;VA54C,-2.995389,-5.215294,-24.946040,-14.572647,-36.303730,15,-3.258709,-2.420268,-3.183101,-2.210224,-3.904644,-37.479843,-35.532173,-37.244690,-37.118729
2,VA39A;DA40A;GA41A;VA54D,-2.667509,-6.250734,-24.946040,-14.761597,-36.584679,15,-3.544477,-2.253075,-2.512638,-1.871142,-3.156216,-37.615067,-35.653019,-37.941002,-37.802586
3,VA39A;DA40A;GA41A;VA54E,-1.090492,-4.709267,-24.946040,-14.901718,-36.699921,15,-2.058664,-0.479224,-0.765473,-0.358916,-1.790181,-37.763447,-35.749214,-37.749660,-37.958874
4,VA39A;DA40A;GA41A;VA54F,-1.297809,-3.884186,-24.946040,-14.597061,-33.627846,15,-1.227628,-1.178723,-1.417058,-0.938432,-1.727203,-35.036293,-32.997684,-34.796497,-34.551331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159995,VA39Y;DA40Y;GA41Y;VA54S,-3.929908,-5.568047,-28.655261,-16.066269,-42.594849,15,-4.923866,-2.911441,-4.960345,-3.061902,-3.791989,-43.626251,-41.584190,-42.472687,-43.905792
159996,VA39Y;DA40Y;GA41Y;VA54T,-3.704289,-5.450732,-28.655261,-15.859589,-41.786472,15,-4.613888,-2.945370,-4.664041,-2.877244,-3.420901,-43.357597,-40.793976,-41.822872,-43.178368
159997,VA39Y;DA40Y;GA41Y;VA54V,-2.232809,-3.606445,-20.736168,-11.817322,-30.977379,14,-2.815190,-1.584976,-2.742856,-2.000220,-2.020805,-32.642883,-30.365374,-31.140972,-32.002850
159998,VA39Y;DA40Y;GA41Y;VA54W,-4.716158,-5.576900,-28.655261,-15.951630,-44.639832,15,-5.339731,-4.014680,-6.114935,-4.015084,-4.096360,-46.177338,-44.254654,-45.468613,-45.852402


## EVE computation

The EVE code was downloaded from the public EVE repository (https://github.com/OATML-Markslab/EVE).

First, we trained the EVE model using the MSA generated by the EVcouplings server. Then, we transformed our data into the EVE format and computed the EVE predictions. The EVE computations were run using the example scripts provided by the EVE repository (examples/Step1_train_VAE.sh and examples/Step2_compute_evol_indices_all_singles.sh).

Here, we provide our input and our raw output from EVE as an illustration.

In [5]:
# Transform input for EVE
def update_eve(mutant):
    """Convert a ';'-separated mutant string into the ':'-separated EVE form.

    Each token such as ``'VA39A'`` becomes ``'V39A'`` (the character at index 1
    is removed). Tokens whose first and last characters are equal -- i.e. the
    residue is unchanged -- are left out, so an all-unchanged input yields ``''``.
    """
    changed = [
        token[0] + token[2:]
        for token in mutant.split(';')
        if token[0] != token[-1]
    ]
    return ':'.join(changed)

# Build the EVE input table: keep only the mutant column, leave out the row whose
# four sites are all unchanged, keep the old row labels as an 'index' column, and
# add the EVE-formatted mutation string.
df_eve = (
    df.loc[df['mutant'] != 'VA39V;DA40D;GA41G;VA54V', ['mutant']]
    .reset_index()
    .assign(mutations=lambda frame: frame['mutant'].apply(update_eve))
)
# Only the converted strings are written out, without the row index.
df_eve['mutations'].to_csv('../data/GB1/raw/GB1_eve_input.csv', index=False)
df_eve

Unnamed: 0,index,mutant,mutations
0,0,VA39A;DA40A;GA41A;VA54A,V39A:D40A:G41A:V54A
1,1,VA39A;DA40A;GA41A;VA54C,V39A:D40A:G41A:V54C
2,2,VA39A;DA40A;GA41A;VA54D,V39A:D40A:G41A:V54D
3,3,VA39A;DA40A;GA41A;VA54E,V39A:D40A:G41A:V54E
4,4,VA39A;DA40A;GA41A;VA54F,V39A:D40A:G41A:V54F
...,...,...,...
159994,159995,VA39Y;DA40Y;GA41Y;VA54S,V39Y:D40Y:G41Y:V54S
159995,159996,VA39Y;DA40Y;GA41Y;VA54T,V39Y:D40Y:G41Y:V54T
159996,159997,VA39Y;DA40Y;GA41Y;VA54V,V39Y:D40Y:G41Y
159997,159998,VA39Y;DA40Y;GA41Y;VA54W,V39Y:D40Y:G41Y:V54W


In [6]:
# Analyze output for EVE

def update_eve(x):
    """Convert a ':'-separated EVE mutation string back into the full mutant label.

    The label always lists the four sites 39, 40, 41 and 54 in that order, starting
    from ``'VA39V;DA40D;GA41G;VA54V'``; each token in ``x`` (e.g. ``'V39Y'``)
    replaces the final residue of its site. A position other than those four
    raises ``KeyError``.

    Note: this definition reuses the name of the mutant -> EVE converter defined
    earlier in the notebook and therefore shadows it once this cell has run.
    """
    prefixes = ['VA39', 'DA40', 'GA41', 'VA54']
    residues = ['V', 'D', 'G', 'V']
    slot_of = {39: 0, 40: 1, 41: 2, 54: 3}
    for token in x.split(':'):
        position = int(token[1:-1])
        residue = token[-1]
        residues[slot_of[position]] = residue
    return ';'.join(prefix + residue for prefix, residue in zip(prefixes, residues))

file= '../data/GB1/raw/GB1_wt_2000_samples0.csv'
df_eve = pd.read_csv(file)
# Drop the 'wt' row; .copy() gives an independent frame so the column assignments
# below do not act on a filtered slice (avoids SettingWithCopyWarning).
df_eve = df_eve[df_eve.mutations!='wt'].copy()
df_eve['mutant'] = df_eve.mutations.apply(update_eve)
# Sign-flip the evolutionary index
# (NOTE(review): presumably so that larger values mean fitter, like the other scores -- confirm).
df_eve['EVE'] = 0-df_eve.evol_indices
# Re-add the unmutated sequence with a score of 0.0.
tmp = pd.DataFrame({'mutant':['VA39V;DA40D;GA41G;VA54V'],
                    'EVE':[0.0]})
df_eve = pd.concat([df_eve, tmp], ignore_index=True)

df_eve = df_eve.sort_values('mutant')

# Join on 'mutant' only. Without an explicit key, pandas joins on every shared
# column; if df already carries an 'EVE' column, the key becomes (mutant, EVE) and
# any floating-point mismatch silently drops rows. The stale column is removed
# first and the original column order is restored afterwards.
original_columns = list(df.columns)
df = (
    df.drop(columns='EVE', errors='ignore')
    .merge(df_eve[['mutant','EVE']], on='mutant', how='inner', validate='many_to_one')
)
if 'EVE' in original_columns:
    df = df[original_columns]
df

Unnamed: 0,mutant,esm1v_t33_650M_UR90S,esm2_t36_3B_UR50D,EVmutation,EVE,esm_msa1b_t12_100M_UR50S_0,masks,esm1v_t33_650M_UR90S_1,esm1v_t33_650M_UR90S_2,esm1v_t33_650M_UR90S_3,esm1v_t33_650M_UR90S_4,esm1v_t33_650M_UR90S_5,esm_msa1b_t12_100M_UR50S_1,esm_msa1b_t12_100M_UR50S_2,esm_msa1b_t12_100M_UR50S_3,esm_msa1b_t12_100M_UR50S_4
0,VA39A;DA40A;GA41A;VA54A,-0.702931,-3.679725,-20.255317,-10.968353,-32.661022,15,-1.218270,-0.529401,-0.369648,-0.085206,-1.312130,-33.504288,-31.480907,-32.526978,-33.906433
1,VA39A;DA40A;GA41A;VA54C,-2.995389,-5.215294,-24.946040,-14.572647,-36.303730,15,-3.258709,-2.420268,-3.183101,-2.210224,-3.904644,-37.479843,-35.532173,-37.244690,-37.118729
2,VA39A;DA40A;GA41A;VA54D,-2.667509,-6.250734,-24.946040,-14.761597,-36.584679,15,-3.544477,-2.253075,-2.512638,-1.871142,-3.156216,-37.615067,-35.653019,-37.941002,-37.802586
3,VA39A;DA40A;GA41A;VA54E,-1.090492,-4.709267,-24.946040,-14.901718,-36.699921,15,-2.058664,-0.479224,-0.765473,-0.358916,-1.790181,-37.763447,-35.749214,-37.749660,-37.958874
4,VA39A;DA40A;GA41A;VA54F,-1.297809,-3.884186,-24.946040,-14.597061,-33.627846,15,-1.227628,-1.178723,-1.417058,-0.938432,-1.727203,-35.036293,-32.997684,-34.796497,-34.551331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159995,VA39Y;DA40Y;GA41Y;VA54S,-3.929908,-5.568047,-28.655261,-16.066269,-42.594849,15,-4.923866,-2.911441,-4.960345,-3.061902,-3.791989,-43.626251,-41.584190,-42.472687,-43.905792
159996,VA39Y;DA40Y;GA41Y;VA54T,-3.704289,-5.450732,-28.655261,-15.859589,-41.786472,15,-4.613888,-2.945370,-4.664041,-2.877244,-3.420901,-43.357597,-40.793976,-41.822872,-43.178368
159997,VA39Y;DA40Y;GA41Y;VA54V,-2.232809,-3.606445,-20.736168,-11.817322,-30.977379,14,-2.815190,-1.584976,-2.742856,-2.000220,-2.020805,-32.642883,-30.365374,-31.140972,-32.002850
159998,VA39Y;DA40Y;GA41Y;VA54W,-4.716158,-5.576900,-28.655261,-15.951630,-44.639832,15,-5.339731,-4.014680,-6.114935,-4.015084,-4.096360,-46.177338,-44.254654,-45.468613,-45.852402


## MSA Transformer computation

For MSA Transformer, we first subsampled the MSA to 384 sequences to achieve optimal inference performance, using HHfilter and a weighting scheme from EVmutation (as implemented in https://github.com/OATML-Markslab/EVE).
```
hhfilter -cov 75 -i data/GB1/raw/gb1_wt_b0.5.a2m -o data/GB1/raw/gb1_wt_b0.5_filtered.a3m -maxseq 386200
```
Here, we provide the sub-sampled MSAs for illustration, because the weighting calculation would take more than one day.

```
python modify/msa.py  \
    --model-location esm_msa1b_t12_100M_UR50S \
    --sequence MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE \
    --mutation-col mutant \
    --dms-output data/GB1/raw/GB1_zero_raw.csv  \
    --scoring-strategy masked-marginals \
    --protein GB1 \
    --msa-path data/GB1/raw/gb1_wt_b0.5_filtered_
```

In [7]:
# Average the MSA Transformer predictions over the five columns suffixed _0 ... _4.
# NOTE(review): the CSV is read again here, so `df` is rebound to the file contents
# and column values assigned to `df` in earlier cells are not carried forward.

df = pd.read_csv('../data/GB1/raw/GB1_zero_raw.csv', index_col=0)
# The row-wise mean overwrites the `_0` column; the other four columns are left as loaded.
df['esm_msa1b_t12_100M_UR50S_0'] = (
    df[[f'esm_msa1b_t12_100M_UR50S_{k}' for k in range(5)]]
    .mean(axis=1)
)
df

Unnamed: 0,mutant,esm1v_t33_650M_UR90S,esm2_t36_3B_UR50D,EVmutation,EVE,esm_msa1b_t12_100M_UR50S_0,masks,esm1v_t33_650M_UR90S_1,esm1v_t33_650M_UR90S_2,esm1v_t33_650M_UR90S_3,esm1v_t33_650M_UR90S_4,esm1v_t33_650M_UR90S_5,esm_msa1b_t12_100M_UR50S_1,esm_msa1b_t12_100M_UR50S_2,esm_msa1b_t12_100M_UR50S_3,esm_msa1b_t12_100M_UR50S_4
0,VA39A;DA40A;GA41A;VA54A,-0.702931,-3.679725,-20.255317,-10.968353,-32.815926,15,-1.218270,-0.529401,-0.369648,-0.085206,-1.312130,-33.504288,-31.480907,-32.526978,-33.906433
1,VA39A;DA40A;GA41A;VA54C,-2.995390,-5.215294,-24.946040,-14.572647,-36.735833,15,-3.258709,-2.420268,-3.183101,-2.210224,-3.904644,-37.479843,-35.532173,-37.244690,-37.118729
2,VA39A;DA40A;GA41A;VA54D,-2.667509,-6.250734,-24.946040,-14.761597,-37.119270,15,-3.544477,-2.253075,-2.512638,-1.871142,-3.156216,-37.615067,-35.653019,-37.941002,-37.802586
3,VA39A;DA40A;GA41A;VA54E,-1.090491,-4.709267,-24.946040,-14.901718,-37.184223,15,-2.058664,-0.479224,-0.765473,-0.358916,-1.790181,-37.763447,-35.749214,-37.749660,-37.958874
4,VA39A;DA40A;GA41A;VA54F,-1.297809,-3.884186,-24.946040,-14.597061,-34.201930,15,-1.227628,-1.178723,-1.417058,-0.938432,-1.727203,-35.036293,-32.997684,-34.796497,-34.551331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159995,VA39Y;DA40Y;GA41Y;VA54S,-3.929908,-5.568047,-28.655261,-16.066269,-42.836754,15,-4.923866,-2.911441,-4.960345,-3.061902,-3.791989,-43.626251,-41.584190,-42.472687,-43.905792
159996,VA39Y;DA40Y;GA41Y;VA54T,-3.704289,-5.450732,-28.655261,-15.859589,-42.187857,15,-4.613888,-2.945370,-4.664041,-2.877244,-3.420901,-43.357597,-40.793976,-41.822872,-43.178368
159997,VA39Y;DA40Y;GA41Y;VA54V,-2.232809,-3.606445,-20.736168,-11.817322,-31.425891,14,-2.815190,-1.584976,-2.742856,-2.000220,-2.020805,-32.642883,-30.365374,-31.140972,-32.002850
159998,VA39Y;DA40Y;GA41Y;VA54W,-4.716158,-5.576900,-28.655261,-15.951630,-45.278568,15,-5.339731,-4.014680,-6.114935,-4.015084,-4.096360,-46.177338,-44.254654,-45.468613,-45.852402


## Ensemble model predictions

In [8]:
# Display the frame as it stands before the score columns are z-scored in the next cell.
df

Unnamed: 0,mutant,esm1v_t33_650M_UR90S,esm2_t36_3B_UR50D,EVmutation,EVE,esm_msa1b_t12_100M_UR50S_0,masks,esm1v_t33_650M_UR90S_1,esm1v_t33_650M_UR90S_2,esm1v_t33_650M_UR90S_3,esm1v_t33_650M_UR90S_4,esm1v_t33_650M_UR90S_5,esm_msa1b_t12_100M_UR50S_1,esm_msa1b_t12_100M_UR50S_2,esm_msa1b_t12_100M_UR50S_3,esm_msa1b_t12_100M_UR50S_4
0,VA39A;DA40A;GA41A;VA54A,-0.702931,-3.679725,-20.255317,-10.968353,-32.815926,15,-1.218270,-0.529401,-0.369648,-0.085206,-1.312130,-33.504288,-31.480907,-32.526978,-33.906433
1,VA39A;DA40A;GA41A;VA54C,-2.995390,-5.215294,-24.946040,-14.572647,-36.735833,15,-3.258709,-2.420268,-3.183101,-2.210224,-3.904644,-37.479843,-35.532173,-37.244690,-37.118729
2,VA39A;DA40A;GA41A;VA54D,-2.667509,-6.250734,-24.946040,-14.761597,-37.119270,15,-3.544477,-2.253075,-2.512638,-1.871142,-3.156216,-37.615067,-35.653019,-37.941002,-37.802586
3,VA39A;DA40A;GA41A;VA54E,-1.090491,-4.709267,-24.946040,-14.901718,-37.184223,15,-2.058664,-0.479224,-0.765473,-0.358916,-1.790181,-37.763447,-35.749214,-37.749660,-37.958874
4,VA39A;DA40A;GA41A;VA54F,-1.297809,-3.884186,-24.946040,-14.597061,-34.201930,15,-1.227628,-1.178723,-1.417058,-0.938432,-1.727203,-35.036293,-32.997684,-34.796497,-34.551331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159995,VA39Y;DA40Y;GA41Y;VA54S,-3.929908,-5.568047,-28.655261,-16.066269,-42.836754,15,-4.923866,-2.911441,-4.960345,-3.061902,-3.791989,-43.626251,-41.584190,-42.472687,-43.905792
159996,VA39Y;DA40Y;GA41Y;VA54T,-3.704289,-5.450732,-28.655261,-15.859589,-42.187857,15,-4.613888,-2.945370,-4.664041,-2.877244,-3.420901,-43.357597,-40.793976,-41.822872,-43.178368
159997,VA39Y;DA40Y;GA41Y;VA54V,-2.232809,-3.606445,-20.736168,-11.817322,-31.425891,14,-2.815190,-1.584976,-2.742856,-2.000220,-2.020805,-32.642883,-30.365374,-31.140972,-32.002850
159998,VA39Y;DA40Y;GA41Y;VA54W,-4.716158,-5.576900,-28.655261,-15.951630,-45.278568,15,-5.339731,-4.014680,-6.114935,-4.015084,-4.096360,-46.177338,-44.254654,-45.468613,-45.852402


In [9]:
# Standardise each predictor column: subtract its mean and divide by its standard deviation.

for column in ('esm1v_t33_650M_UR90S', 'esm2_t36_3B_UR50D', 'EVmutation', 'EVE', 'esm_msa1b_t12_100M_UR50S_0'):
    centre = df[column].mean()
    spread = df[column].std()
    df[column] = df[column].sub(centre).div(spread)

seed = 0
# Ensemble score: ESM-1v and ESM-2 contribute half a vote each, the other three
# predictors one vote each, and the sum is divided by four.
df['modify_fitness'] = (
    df['esm1v_t33_650M_UR90S'] / 2
    + df['esm2_t36_3B_UR50D'] / 2
    + df['EVmutation']
    + df['EVE']
    + df[f'esm_msa1b_t12_100M_UR50S_{seed}']
) / 4
df

Unnamed: 0,mutant,esm1v_t33_650M_UR90S,esm2_t36_3B_UR50D,EVmutation,EVE,esm_msa1b_t12_100M_UR50S_0,masks,esm1v_t33_650M_UR90S_1,esm1v_t33_650M_UR90S_2,esm1v_t33_650M_UR90S_3,esm1v_t33_650M_UR90S_4,esm1v_t33_650M_UR90S_5,esm_msa1b_t12_100M_UR50S_1,esm_msa1b_t12_100M_UR50S_2,esm_msa1b_t12_100M_UR50S_3,esm_msa1b_t12_100M_UR50S_4,modify_fitness
0,VA39A;DA40A;GA41A;VA54A,2.105725,1.373748,0.973436,1.484549,1.500922,15,-1.218270,-0.529401,-0.369648,-0.085206,-1.312130,-33.504288,-31.480907,-32.526978,-33.906433,1.424661
1,VA39A;DA40A;GA41A;VA54C,0.902695,0.671500,-0.162748,-0.041238,0.905571,15,-3.258709,-2.420268,-3.183101,-2.210224,-3.904644,-37.479843,-35.532173,-37.244690,-37.118729,0.372171
2,VA39A;DA40A;GA41A;VA54D,1.074760,0.197971,-0.162748,-0.121225,0.847335,15,-3.544477,-2.253075,-2.512638,-1.871142,-3.156216,-37.615067,-35.653019,-37.941002,-37.802586,0.299932
3,VA39A;DA40A;GA41A;VA54E,1.902343,0.902916,-0.162748,-0.180542,0.837470,15,-2.058664,-0.479224,-0.765473,-0.358916,-1.790181,-37.763447,-35.749214,-37.749660,-37.958874,0.474203
4,VA39A;DA40A;GA41A;VA54F,1.793547,1.280243,-0.162748,-0.051573,1.290417,15,-1.227628,-1.178723,-1.417058,-0.938432,-1.727203,-35.036293,-32.997684,-34.796497,-34.551331,0.653248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159995,VA39Y;DA40Y;GA41Y;VA54S,0.412281,0.510178,-1.061193,-0.673525,-0.021028,15,-4.923866,-2.911441,-4.960345,-3.061902,-3.791989,-43.626251,-41.584190,-42.472687,-43.905792,-0.323629
159996,VA39Y;DA40Y;GA41Y;VA54T,0.530681,0.563829,-1.061193,-0.586032,0.077525,15,-4.613888,-2.945370,-4.664041,-2.877244,-3.420901,-43.357597,-40.793976,-41.822872,-43.178368,-0.255611
159997,VA39Y;DA40Y;GA41Y;VA54V,1.302881,1.407260,0.856964,1.125159,1.712038,14,-2.815190,-1.584976,-2.742856,-2.000220,-2.020805,-32.642883,-30.365374,-31.140972,-32.002850,1.262308
159998,VA39Y;DA40Y;GA41Y;VA54W,-0.000325,0.506129,-1.061193,-0.624995,-0.391888,15,-5.339731,-4.014680,-6.114935,-4.015084,-4.096360,-46.177338,-44.254654,-45.468613,-45.852402,-0.456294


In [10]:
# Final table: the mutant label next to its ensemble score.
df.loc[:, ['mutant', 'modify_fitness']]

Unnamed: 0,mutant,modify_fitness
0,VA39A;DA40A;GA41A;VA54A,1.424661
1,VA39A;DA40A;GA41A;VA54C,0.372171
2,VA39A;DA40A;GA41A;VA54D,0.299932
3,VA39A;DA40A;GA41A;VA54E,0.474203
4,VA39A;DA40A;GA41A;VA54F,0.653248
...,...,...
159995,VA39Y;DA40Y;GA41Y;VA54S,-0.323629
159996,VA39Y;DA40Y;GA41Y;VA54T,-0.255611
159997,VA39Y;DA40Y;GA41Y;VA54V,1.262308
159998,VA39Y;DA40Y;GA41Y;VA54W,-0.456294
