In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model, preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from scipy.spatial.distance import cdist
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import ConvergenceWarning
import xgboost
from sklearn.decomposition import PCA
import warnings
import random
import time
import os
import sys
import argparse
import torch
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from Bio import SeqIO

# Ignore FutureWarnings and ConvergenceWarnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn.neural_network")
pd.options.mode.chained_assignment = None  # default='warn'

### Functions ported from grid_search

In [2]:
# Function to read in the data
def read_data(dataset_name, base_path, file_type, embeddings_type='both', experimental = False):
    # Construct the file paths
    if file_type == "csvs":
        labels_file = os.path.join(base_path, 'labels', dataset_name.split('_')[0] + '_labels.csv')
        hie_file = os.path.join(base_path, 'hie_temp', dataset_name.split('_')[0] + '.csv')
        embeddings_file = os.path.join(base_path, 'csvs', dataset_name + '.csv')
        # Read in mean embeddings across all rounds
        embeddings = pd.read_csv(embeddings_file, index_col=0)
    elif file_type == "pts":
        labels_file = os.path.join(base_path, 'labels', dataset_name.split('_')[-1] + '_labels.csv')
        hie_file = os.path.join(base_path, 'hie_temp', dataset_name.split('_')[-1] + '.csv')
        embeddings_file = os.path.join(base_path, 'pts', dataset_name + '.pt')
        # Read in pytorch tensor of embeddings
        embeddings = torch.load(embeddings_file)
        # Convert embeddings to a dataframe
        if embeddings_type == 'average':
            embeddings = {key: value['average'].numpy() for key, value in embeddings.items()}
        elif embeddings_type == 'mutated':
            embeddings = {key: value['mutated'].numpy() for key, value in embeddings.items()}
        elif embeddings_type == 'both':
            embeddings = {key: torch.cat((value['average'], value['mutated'])).numpy() for key, value in embeddings.items()}
        else:
            print("Invalid embeddings_type. Please choose 'average', 'mutated', or 'both'")
            return None, None

        # Convert embeddings dictionary to a dataframe
        embeddings = pd.DataFrame.from_dict(embeddings, orient='index')
    else:
        print("Invalid file type. Please choose either 'csvs' or 'pts'")
        return None, None

    # if not experimental
    if not experimental:
        # Read in labels
        labels = pd.read_csv(labels_file)

        # Read in hierarchy
        hie_data = pd.read_csv(hie_file)

        # Filter out rows where fitness is NaN
        labels = labels[labels['fitness'].notna()]

        # Filter out rows in embeddings where row names are not in labels variant column
        embeddings = embeddings[embeddings.index.isin(labels['variant'])]

        # Align labels by variant
        labels = labels.sort_values(by=['variant'])

        # Align embeddings by row name
        embeddings = embeddings.sort_index()

        # Confirm that labels and embeddings are aligned, reset index
        labels = labels.reset_index(drop=True)

        # Get the variants in labels and embeddings, convert to list
        label_variants = labels['variant'].tolist()
        embedding_variants = embeddings.index.tolist()

        # Check if embedding row names and label variants are identical
        if label_variants == embedding_variants:
            print('Embeddings and labels are aligned')

        # return embeddings and labels
        return embeddings, labels, hie_data

    else:
        return embeddings


# Active learning function for one iteration
def top_layer(iter_train, iter_test, embeddings_pd, labels_pd, measured_var, regression_type='ridge', top_n=None, final_round=10):

    # Get the variants in labels and embeddings, convert to list
    label_variants = labels_pd['variant'].tolist()
    embedding_variants = embeddings_pd.index.tolist()

    # Check if embedding row names and label variants are identical
    if label_variants == embedding_variants:
        print('Embeddings and labels are aligned')

    # reset the indices of embeddings_pd and labels_pd
    embeddings_pd = embeddings_pd.reset_index(drop=True)
    labels_pd = labels_pd.reset_index(drop=True)

    # save column 'iteration' in the labels dataframe
    iteration = labels_pd['iteration']

    # save labels
    labels = labels_pd

    # save mean embeddings as numpy array
    a = embeddings_pd

    # subset a, y to only include the rows where iteration = iter_train and iter_test
    idx_train = iteration[iteration.isin(iter_train)].index.to_numpy()
    idx_test = iteration[iteration.isin([iter_test])].index.to_numpy()

    # subset a to only include the rows where iteration = iter_train and iter_test
    X_train = a.loc[idx_train, :]
    X_test = a.loc[idx_test, :]

    y_train = labels[iteration.isin(iter_train)][measured_var]

    y_test = labels[iteration.isin([iter_test])][measured_var]

    # fit
    if regression_type == 'ridge':
        model = linear_model.RidgeCV()
    elif regression_type == 'lasso':
        model = linear_model.LassoCV(max_iter=100000,tol=1e-3)
    elif regression_type == 'elasticnet':
        model = linear_model.ElasticNetCV(max_iter=100000,tol=1e-3)
    elif regression_type == 'linear':
        model = linear_model.LinearRegression()
    elif regression_type == 'neuralnet':
        model = MLPRegressor(hidden_layer_sizes=(5), max_iter=1000, activation='relu', solver='adam', alpha=0.001,
                             batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5,
                             momentum=0.9, nesterovs_momentum=True, shuffle=True, random_state=1, tol=0.0001,
                             verbose=False, warm_start=False, early_stopping=False, validation_fraction=0.1, beta_1=0.9,
                             beta_2=0.999, epsilon=1e-08)
    elif regression_type == 'randomforest':
        model = RandomForestRegressor(n_estimators=100, criterion='friedman_mse', max_depth=None, min_samples_split=2,
                                      min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto',
                                      max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False,
                                      n_jobs=None, random_state=1, verbose=0, warm_start=False, ccp_alpha=0.0,
                                      max_samples=None)
    elif regression_type == 'gradientboosting':
        model = xgboost.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                                     max_depth=5, alpha=10, n_estimators=10)

    model.fit(X_train, y_train)

    # make predictions on train data
    y_pred_train = model.predict(X_train)
    y_std_train = np.zeros(len(y_pred_train))
    # make predictions on test data
    # NOTE: can work on alternate 2-n round strategies here
    y_pred_test = model.predict(X_test)
    y_std_test = np.zeros(len(y_pred_test))

    # combine predicted and actual thermostability values with sequence IDs into a new dataframe
    df_train = pd.DataFrame({'variant': labels.variant[idx_train], 'y_pred': y_pred_train, 'y_actual': y_train})
    df_test = pd.DataFrame({'variant': labels.variant[idx_test], 'y_pred': y_pred_test, 'y_actual': y_test})
    
    # sort df_test by y_pred
    df_test = df_test.sort_values(by=['y_pred'], ascending=False)

    df_all = pd.concat([df_train, df_test])

    # sort df_all by y_pred
    df_all = df_all.sort_values(by=['y_pred'], ascending=False)

    return df_test, df_all

### New functions

In [3]:
def read_experimental_data(base_path, round_file_name, WT_sequence, single_mutant=True):
    file_path = base_path + '/rounds/' + round_file_name
    df = pd.read_excel(file_path)

    # Iterate through the 'Variant' column and update the values based on t7_sequence
    if single_mutant:
        updated_variants = []
        for _, row in df.iterrows():
            variant = row['Variant']
            if variant == 'WT':
                updated_variants.append(variant)
            else:
                position = int(variant[:-1])
                wt_aa = WT_sequence[position - 1]
                updated_variant = wt_aa + variant
                updated_variants.append(updated_variant)
        
        df['updated_variant'] = updated_variants  # Add the updated variants to the DataFrame
    else:
        df.rename(columns={'Variant': 'updated_variant'}, inplace=True)

    return df

def create_dataframes(df_list, expected_index):
    # First dataframe
    dfs = []  # List to store modified dataframes
    
    for i, df in enumerate(df_list, start=1):
        # Create a copy of the dataframe
        df_copy = df_list[i - 1].copy()
        # If the variant is WT, and i is equal to 1 assign iteration number 0
        if i == 1:
            df_copy.loc[df_copy['updated_variant'] == 'WT', 'iteration'] = 0
        else:
            df_copy = df_copy[df_copy['updated_variant'] != 'WT']
        df_copy.loc[df_copy['updated_variant'] != 'WT', 'iteration'] = i
        df_copy['iteration'] = df_copy['iteration'].astype(int)
        df_copy.rename(columns={'updated_variant': 'variant'}, inplace=True)  # Rename the column
    
        dfs.append(df_copy)

    df1 = pd.concat(dfs, ignore_index=True)
    df2 = pd.concat(dfs, ignore_index=True)

    # Check for duplicates in the 'variant' column of df1 or df2
    df1_duplicates = df1[df1.duplicated(subset=['variant'], keep=False)]
    df2_duplicates = df2[df2.duplicated(subset=['variant'], keep=False)]

    if not df1_duplicates.empty or not df2_duplicates.empty:
        print("Duplicates found in variant column:")
        if not df1_duplicates.empty:
            print("Duplicates in df1:")
            print(df1_duplicates)
        if not df2_duplicates.empty:
            print("Duplicates in df2:")
            print(df2_duplicates)
        print("Exiting.")
        return None, None

    df1 = df1[['variant', 'iteration']]
    df2 = df2[['variant', 'fitness', 'iteration']]

    expected_index_blank = [variant for variant in expected_index if variant not in df2['variant'].tolist()]
    # make a df_external that has a column 'variant' with all the variants in expected_index
    df_external = pd.DataFrame({'variant': expected_index_blank})
    df_external['fitness'] = np.nan  
    df_external['iteration'] = 1001 
    df2 = df2.append(df_external, ignore_index=True)
    # order df2 by expected_index
    df2 = df2.set_index('variant').reindex(expected_index, fill_value=np.nan).reset_index()
    # rename column 'index' to 'variant'
    df2 = df2.rename(columns={'index': 'variant'})
    
    return df1, df2

## T7 round 1

In [24]:
# import brenan data
dataset_name = 'esm2_15B_t7_pol'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/t7/'
file_type = 'pts'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [177]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT,0.147168,-0.183875,0.031651,-0.052828,-0.141786,-0.013741,-0.061830,-0.028053,0.018626,0.025174,...,-0.110809,0.051659,0.059534,-0.157181,0.013038,0.020313,0.015764,0.064493,-0.008039,-0.108970
M1A,0.147994,-0.184054,0.035075,-0.053447,-0.140782,-0.016417,-0.063300,-0.028770,0.018573,0.022787,...,-0.111060,0.049681,0.060578,-0.156948,0.011523,0.018843,0.014856,0.063135,-0.005746,-0.106902
M1C,0.149553,-0.182680,0.033869,-0.054389,-0.140723,-0.015113,-0.064822,-0.028185,0.018523,0.027269,...,-0.111689,0.051024,0.059965,-0.157671,0.012664,0.019815,0.015817,0.063816,-0.007434,-0.108856
M1D,0.146734,-0.182717,0.034278,-0.052991,-0.140832,-0.015275,-0.063740,-0.028154,0.018495,0.025465,...,-0.111113,0.050927,0.060537,-0.156441,0.011660,0.019746,0.013886,0.063376,-0.006656,-0.107104
M1E,0.149375,-0.184232,0.034520,-0.054731,-0.140585,-0.016518,-0.064851,-0.028408,0.018998,0.025971,...,-0.110941,0.051635,0.058811,-0.157911,0.012537,0.020981,0.014563,0.062794,-0.007955,-0.106604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A883S,0.145864,-0.184623,0.031468,-0.051257,-0.142275,-0.012640,-0.061425,-0.027589,0.017693,0.025046,...,-0.110233,0.051852,0.058792,-0.157249,0.013417,0.018916,0.019343,0.065511,-0.009308,-0.109120
A883T,0.145933,-0.184290,0.031539,-0.051148,-0.141504,-0.012715,-0.062040,-0.028451,0.017504,0.024164,...,-0.110211,0.052196,0.058730,-0.158339,0.012717,0.018639,0.020539,0.065699,-0.010361,-0.109183
A883V,0.145253,-0.182491,0.031516,-0.052928,-0.142025,-0.013264,-0.060926,-0.027928,0.017077,0.024314,...,-0.111040,0.051419,0.060660,-0.157816,0.012737,0.018943,0.017333,0.065630,-0.008499,-0.109677
A883W,0.145614,-0.183985,0.031491,-0.051818,-0.142407,-0.012759,-0.059910,-0.027657,0.016999,0.023699,...,-0.110127,0.052603,0.059757,-0.156924,0.013284,0.018394,0.016497,0.064918,-0.009098,-0.108907


In [178]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/t7/'
round_file_name = 'T7_Round1.xlsx'
t7_sequence = 'MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFERQLKAGEVADNAAAKPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA'
experimental_data = read_experimental_data(base_path, round_file_name, t7_sequence)
print(experimental_data)
df_list = [experimental_data]

  Variant   fitness updated_variant
0     12N  1.073846            S12N
1     25N  0.677227            A25N
2      WT  1.000000              WT
3     89R  0.740499            F89R
4    134T  1.074891           V134T
5    177L  1.042706           V177L
6    225E  1.075861           G225E
7    241W  0.938351           S241W
8    273H  0.785147           V273H


In [179]:
iterations_one, labels_one = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_one
embeddings_pd = embeddings
labels_pd = labels_one
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [180]:
df_all
df_all.to_csv('t7/round1_all_new.csv', index=False)

In [181]:
df_test

Unnamed: 0,variant,y_pred,y_actual
4266,G225M,0.994051,
773,H41Q,0.992839,
4019,V212L,0.989174,
2539,V134N,0.987659,
3334,H176L,0.987584,
...,...,...,...
458,A25D,0.826342,
385,F21G,0.824527,
516,Y28D,0.821791,
529,Y28S,0.816991,


In [182]:
# write the dataframe to a csv file
df_test.to_csv('t7/round1_predictions_new.csv', index=False)

## T7 round 2

In [206]:
# import brenan data
dataset_name = 'esm2_15B_t7_pol'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/t7/'
file_type = 'pts'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [207]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT,0.147168,-0.183875,0.031651,-0.052828,-0.141786,-0.013741,-0.061830,-0.028053,0.018626,0.025174,...,-0.110809,0.051659,0.059534,-0.157181,0.013038,0.020313,0.015764,0.064493,-0.008039,-0.108970
M1A,0.147994,-0.184054,0.035075,-0.053447,-0.140782,-0.016417,-0.063300,-0.028770,0.018573,0.022787,...,-0.111060,0.049681,0.060578,-0.156948,0.011523,0.018843,0.014856,0.063135,-0.005746,-0.106902
M1C,0.149553,-0.182680,0.033869,-0.054389,-0.140723,-0.015113,-0.064822,-0.028185,0.018523,0.027269,...,-0.111689,0.051024,0.059965,-0.157671,0.012664,0.019815,0.015817,0.063816,-0.007434,-0.108856
M1D,0.146734,-0.182717,0.034278,-0.052991,-0.140832,-0.015275,-0.063740,-0.028154,0.018495,0.025465,...,-0.111113,0.050927,0.060537,-0.156441,0.011660,0.019746,0.013886,0.063376,-0.006656,-0.107104
M1E,0.149375,-0.184232,0.034520,-0.054731,-0.140585,-0.016518,-0.064851,-0.028408,0.018998,0.025971,...,-0.110941,0.051635,0.058811,-0.157911,0.012537,0.020981,0.014563,0.062794,-0.007955,-0.106604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A883S,0.145864,-0.184623,0.031468,-0.051257,-0.142275,-0.012640,-0.061425,-0.027589,0.017693,0.025046,...,-0.110233,0.051852,0.058792,-0.157249,0.013417,0.018916,0.019343,0.065511,-0.009308,-0.109120
A883T,0.145933,-0.184290,0.031539,-0.051148,-0.141504,-0.012715,-0.062040,-0.028451,0.017504,0.024164,...,-0.110211,0.052196,0.058730,-0.158339,0.012717,0.018639,0.020539,0.065699,-0.010361,-0.109183
A883V,0.145253,-0.182491,0.031516,-0.052928,-0.142025,-0.013264,-0.060926,-0.027928,0.017077,0.024314,...,-0.111040,0.051419,0.060660,-0.157816,0.012737,0.018943,0.017333,0.065630,-0.008499,-0.109677
A883W,0.145614,-0.183985,0.031491,-0.051818,-0.142407,-0.012759,-0.059910,-0.027657,0.016999,0.023699,...,-0.110127,0.052603,0.059757,-0.156924,0.013284,0.018394,0.016497,0.064918,-0.009098,-0.108907


In [208]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/t7/'
round_file_name_1 = 'T7_Round1.xlsx'
round_file_name_2 = 'T7_Round2.xlsx'
t7_sequence = 'MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFERQLKAGEVADNAAAKPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, t7_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, t7_sequence)
print(experimental_data_1)
print(experimental_data_2)
df_list = [experimental_data_1, experimental_data_2]

  Variant   fitness updated_variant
0     12N  1.073846            S12N
1     25N  0.677227            A25N
2      WT  1.000000              WT
3     89R  0.740499            F89R
4    134T  1.074891           V134T
5    177L  1.042706           V177L
6    225E  1.075861           G225E
7    241W  0.938351           S241W
8    273H  0.785147           V273H
   Variant   fitness updated_variant
0     249C  1.055202           E249C
1     279S  0.999604           T279S
2     281L  0.589696           I281L
3     229I  0.478365           L229I
4     735S  1.840046           V735S
5     152N  1.908253           G152N
6     822S  1.678626           A822S
7     531T  1.096505           S531T
8     256P  0.600498           T256P
9     469Q  1.319594           G469Q
10    668E  1.279096           T668E
11      WT  1.000000              WT


In [209]:
iterations_two, labels_two = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_two
embeddings_pd = embeddings
labels_pd = labels_two
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [210]:
df_all
df_all.to_csv('t7/round2_all_new.csv', index=False)

In [211]:
df_test

Unnamed: 0,variant,y_pred,y_actual
2876,G152I,1.406724,
2882,G152Q,1.364646,
13958,V735N,1.323278,
15031,R792C,1.320562,
7763,A409N,1.320432,
...,...,...,...
4814,I254H,0.828598,
11151,I587V,0.827276,
6750,E356G,0.824448,
4561,S241A,0.822014,


In [212]:
df_test.to_csv('t7/round2_predictions_new.csv', index=False)

## T7 round 3

In [239]:
# import brenan data
dataset_name = 'esm2_15B_t7_pol'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/t7/'
file_type = 'pts'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [None]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT,0.147168,-0.183875,0.031651,-0.052828,-0.141786,-0.013741,-0.061830,-0.028053,0.018626,0.025174,...,-0.110809,0.051659,0.059534,-0.157181,0.013038,0.020313,0.015764,0.064493,-0.008039,-0.108970
M1A,0.147994,-0.184054,0.035075,-0.053447,-0.140782,-0.016417,-0.063300,-0.028770,0.018573,0.022787,...,-0.111060,0.049681,0.060578,-0.156948,0.011523,0.018843,0.014856,0.063135,-0.005746,-0.106902
M1C,0.149553,-0.182680,0.033869,-0.054389,-0.140723,-0.015113,-0.064822,-0.028185,0.018523,0.027269,...,-0.111689,0.051024,0.059965,-0.157671,0.012664,0.019815,0.015817,0.063816,-0.007434,-0.108856
M1D,0.146734,-0.182717,0.034278,-0.052991,-0.140832,-0.015275,-0.063740,-0.028154,0.018495,0.025465,...,-0.111113,0.050927,0.060537,-0.156441,0.011660,0.019746,0.013886,0.063376,-0.006656,-0.107104
M1E,0.149375,-0.184232,0.034520,-0.054731,-0.140585,-0.016518,-0.064851,-0.028408,0.018998,0.025971,...,-0.110941,0.051635,0.058811,-0.157911,0.012537,0.020981,0.014563,0.062794,-0.007955,-0.106604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A883S,0.145864,-0.184623,0.031468,-0.051257,-0.142275,-0.012640,-0.061425,-0.027589,0.017693,0.025046,...,-0.110233,0.051852,0.058792,-0.157249,0.013417,0.018916,0.019343,0.065511,-0.009308,-0.109120
A883T,0.145933,-0.184290,0.031539,-0.051148,-0.141504,-0.012715,-0.062040,-0.028451,0.017504,0.024164,...,-0.110211,0.052196,0.058730,-0.158339,0.012717,0.018639,0.020539,0.065699,-0.010361,-0.109183
A883V,0.145253,-0.182491,0.031516,-0.052928,-0.142025,-0.013264,-0.060926,-0.027928,0.017077,0.024314,...,-0.111040,0.051419,0.060660,-0.157816,0.012737,0.018943,0.017333,0.065630,-0.008499,-0.109677
A883W,0.145614,-0.183985,0.031491,-0.051818,-0.142407,-0.012759,-0.059910,-0.027657,0.016999,0.023699,...,-0.110127,0.052603,0.059757,-0.156924,0.013284,0.018394,0.016497,0.064918,-0.009098,-0.108907


In [241]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/t7/'
round_file_name_1 = 'T7_Round1.xlsx'
round_file_name_2 = 'T7_Round2.xlsx'
round_file_name_3 = 'T7_Round3.xlsx'
t7_sequence = 'MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFERQLKAGEVADNAAAKPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, t7_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, t7_sequence)
experimental_data_3 = read_experimental_data(base_path, round_file_name_3, t7_sequence)
print(experimental_data_1)
print(experimental_data_2)
print(experimental_data_3)
df_list = [experimental_data_1, experimental_data_2, experimental_data_3]

  Variant   fitness updated_variant
0     12N  1.073846            S12N
1     25N  0.677227            A25N
2      WT  1.000000              WT
3     89R  0.740499            F89R
4    134T  1.074891           V134T
5    177L  1.042706           V177L
6    225E  1.075861           G225E
7    241W  0.938351           S241W
8    273H  0.785147           V273H
   Variant   fitness updated_variant
0     249C  1.055202           E249C
1     279S  0.999604           T279S
2     281L  0.589696           I281L
3     229I  0.478365           L229I
4     735S  1.840046           V735S
5     152N  1.908253           G152N
6     822S  1.678626           A822S
7     531T  1.096505           S531T
8     256P  0.600498           T256P
9     469Q  1.319594           G469Q
10    668E  1.279096           T668E
11      WT  1.000000              WT
   Variant   fitness updated_variant
0      10K  0.959845            D10K
1     643N  3.909999           E643N
2     370V  2.600643           N370V
3       5I 

In [242]:
iterations_three, labels_three = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_three
embeddings_pd = embeddings
labels_pd = labels_three
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [243]:
df_all
df_all.to_csv('t7/round3_all_new.csv', index=False)

In [244]:
df_test

Unnamed: 0,variant,y_pred,y_actual
12213,E643S,2.398941,
12206,E643K,2.301938,
12199,E643A,2.223764,
12212,E643R,2.193382,
12214,E643T,2.162055,
...,...,...,...
14457,I761V,0.880633,
4715,E249D,0.875294,
6743,I355V,0.866922,
11151,I587V,0.864906,


In [245]:
df_test.to_csv('t7/round3_predictions_new.csv', index=False)

## T7 round 4

In [5]:
# import brenan data
dataset_name = 'esm2_15B_t7_pol'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/t7/'
file_type = 'pts'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [19]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
F251W,0.058680,-0.101877,-0.049105,0.043003,-0.129526,0.029136,-0.176315,0.014073,-0.099785,0.001786,...,-0.176302,-0.013019,0.040493,-0.126933,0.003337,-0.095834,-0.029323,0.022122,-0.108958,0.085466
S156P,0.057480,-0.102873,-0.054044,0.042988,-0.129897,0.027450,-0.177223,0.018766,-0.098628,0.000715,...,-0.173447,-0.020796,0.035839,-0.127225,0.005545,-0.095132,-0.033339,0.025640,-0.103198,0.087120
S180F,0.058544,-0.104004,-0.048321,0.038753,-0.127392,0.027275,-0.174910,0.015151,-0.099420,0.005351,...,-0.173295,-0.019856,0.035694,-0.125969,-0.002306,-0.092605,-0.033154,0.023564,-0.104630,0.087606
A419C,0.058057,-0.106263,-0.049922,0.041353,-0.129665,0.027440,-0.172086,0.018737,-0.098932,-0.000535,...,-0.174147,-0.017247,0.036708,-0.127276,0.001815,-0.096420,-0.030465,0.025425,-0.100106,0.087800
P299Y,0.057292,-0.106125,-0.052144,0.042111,-0.129207,0.026602,-0.171750,0.018487,-0.100901,-0.003204,...,-0.174812,-0.016793,0.037009,-0.124868,0.002369,-0.095742,-0.033622,0.024722,-0.098355,0.087069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F22H,0.058223,-0.102549,-0.053289,0.043090,-0.130043,0.030082,-0.174506,0.020153,-0.096832,-0.006312,...,-0.173114,-0.019959,0.037668,-0.126781,0.000861,-0.096356,-0.031056,0.025027,-0.103469,0.088209
S232P,0.054540,-0.103079,-0.051447,0.041830,-0.129405,0.027504,-0.173162,0.018984,-0.098990,-0.002792,...,-0.172950,-0.018743,0.036896,-0.127574,0.005526,-0.096237,-0.031903,0.023376,-0.107867,0.085076
K469Y,0.057801,-0.104625,-0.050990,0.041145,-0.129001,0.027085,-0.175023,0.018681,-0.098913,-0.000384,...,-0.170491,-0.017970,0.038323,-0.126963,0.002823,-0.097188,-0.031914,0.024631,-0.103980,0.084921
C431Y,0.058150,-0.104249,-0.050383,0.039418,-0.127600,0.026382,-0.177062,0.017804,-0.098297,-0.000005,...,-0.171520,-0.018806,0.039649,-0.128669,0.002352,-0.095632,-0.031668,0.026439,-0.103850,0.086796


In [9]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/t7/'
round_file_name_1 = 'T7_Round1.xlsx'
round_file_name_2 = 'T7_Round2.xlsx'
round_file_name_3 = 'T7_Round3.xlsx'
round_file_name_4 = 'T7_Round4.xlsx'
t7_sequence = 'MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFERQLKAGEVADNAAAKPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, t7_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, t7_sequence)
experimental_data_3 = read_experimental_data(base_path, round_file_name_3, t7_sequence)
experimental_data_4 = read_experimental_data(base_path, round_file_name_4, t7_sequence)
print(experimental_data_1)
print(experimental_data_2)
print(experimental_data_3)
print(experimental_data_4)
df_list = [experimental_data_1, experimental_data_2, experimental_data_3, experimental_data_4]

  Variant   fitness updated_variant
0     12N  1.073846            S12N
1     25N  0.677227            A25N
2      WT  1.000000              WT
3     89R  0.740499            F89R
4    134T  1.074891           V134T
5    177L  1.042706           V177L
6    225E  1.075861           G225E
7    241W  0.938351           S241W
8    273H  0.785147           V273H
   Variant   fitness updated_variant
0     249C  1.055202           E249C
1     279S  0.999604           T279S
2     281L  0.589696           I281L
3     229I  0.478365           L229I
4     735S  1.840046           V735S
5     152N  1.908253           G152N
6     822S  1.678626           A822S
7     531T  1.096505           S531T
8     256P  0.600498           T256P
9     469Q  1.319594           G469Q
10    668E  1.279096           T668E
11      WT  1.000000              WT
   Variant   fitness updated_variant
0      10K  0.959845            D10K
1     643N  3.909999           E643N
2     370V  2.600643           N370V
3       5I 

In [10]:
iterations_four, labels_four = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_four
embeddings_pd = embeddings
labels_pd = labels_four
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [11]:
df_all
df_all.to_csv('t7/round4_all_new.csv', index=False)

In [12]:
df_test

Unnamed: 0,variant,y_pred,y_actual
8138,V429G,6.356751,
8051,G424R,5.669336,
7921,P417V,5.563483,
15452,F814G,5.505366,
8045,G424K,5.436164,
...,...,...,...
11151,I587V,0.904100,
14513,N764T,0.897736,
4349,L229V,0.897673,
4561,S241A,0.897293,


In [13]:
df_test.to_csv('t7/round4_predictions_new.csv', index=False)

## T7 round multi

In [8]:
# import brenan data
dataset_name = 'esm2_15B_t7_pol'
base_path = 't7/'
file_type = 'pts'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT,0.147168,-0.183875,0.031651,-0.052828,-0.141786,-0.013741,-0.061830,-0.028053,0.018626,0.025174,...,-0.110809,0.051659,0.059534,-0.157181,0.013038,0.020313,0.015764,0.064493,-0.008039,-0.108970
M1A,0.147994,-0.184054,0.035075,-0.053447,-0.140782,-0.016417,-0.063300,-0.028770,0.018573,0.022787,...,-0.111060,0.049681,0.060578,-0.156948,0.011523,0.018843,0.014856,0.063135,-0.005746,-0.106902
M1C,0.149553,-0.182680,0.033869,-0.054389,-0.140723,-0.015113,-0.064822,-0.028185,0.018523,0.027269,...,-0.111689,0.051024,0.059965,-0.157671,0.012664,0.019815,0.015817,0.063816,-0.007434,-0.108856
M1D,0.146734,-0.182717,0.034278,-0.052991,-0.140832,-0.015275,-0.063740,-0.028154,0.018495,0.025465,...,-0.111113,0.050927,0.060537,-0.156441,0.011660,0.019746,0.013886,0.063376,-0.006656,-0.107104
M1E,0.149375,-0.184232,0.034520,-0.054731,-0.140585,-0.016518,-0.064851,-0.028408,0.018998,0.025971,...,-0.110941,0.051635,0.058811,-0.157911,0.012537,0.020981,0.014563,0.062794,-0.007955,-0.106604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A883S,0.145864,-0.184623,0.031468,-0.051257,-0.142275,-0.012640,-0.061425,-0.027589,0.017693,0.025046,...,-0.110233,0.051852,0.058792,-0.157249,0.013417,0.018916,0.019343,0.065511,-0.009308,-0.109120
A883T,0.145933,-0.184290,0.031539,-0.051148,-0.141504,-0.012715,-0.062040,-0.028451,0.017504,0.024164,...,-0.110211,0.052196,0.058730,-0.158339,0.012717,0.018639,0.020539,0.065699,-0.010361,-0.109183
A883V,0.145253,-0.182491,0.031516,-0.052928,-0.142025,-0.013264,-0.060926,-0.027928,0.017077,0.024314,...,-0.111040,0.051419,0.060660,-0.157816,0.012737,0.018943,0.017333,0.065630,-0.008499,-0.109677
A883W,0.145614,-0.183985,0.031491,-0.051818,-0.142407,-0.012759,-0.059910,-0.027657,0.016999,0.023699,...,-0.110127,0.052603,0.059757,-0.156924,0.013284,0.018394,0.016497,0.064918,-0.009098,-0.108907


In [9]:
# import brenan data
dataset_name = 't7_pol_2nd_esm2_t48_15B_UR50D'
base_path = 't7/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings_2nd = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
embeddings_2nd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
G469Q_E643T,0.145889,-0.182790,0.030785,-0.057693,-0.137158,-0.015250,-0.065644,-0.030562,0.015721,0.026794,...,-0.110929,0.050821,0.056755,-0.156186,0.014171,0.021422,0.019437,0.063799,-0.007937,-0.107058
V567R_E800K,0.148210,-0.184537,0.027006,-0.057041,-0.138845,-0.019332,-0.066518,-0.032812,0.019602,0.026675,...,-0.113203,0.047227,0.052569,-0.151784,0.015184,0.016786,0.018867,0.065029,-0.010083,-0.109038
E643A_E800K,0.145833,-0.187117,0.026528,-0.056519,-0.137344,-0.019379,-0.067501,-0.036029,0.018969,0.026877,...,-0.112779,0.045624,0.052394,-0.152478,0.013244,0.017361,0.021519,0.065539,-0.011579,-0.109093
T3M_N370W,0.150616,-0.182953,0.033280,-0.056339,-0.136728,-0.014619,-0.065460,-0.027623,0.017054,0.025113,...,-0.111436,0.055045,0.058565,-0.157827,0.013891,0.022742,0.018364,0.064181,-0.005876,-0.104856
T3M_E800K,0.148118,-0.185680,0.030040,-0.054539,-0.137180,-0.018564,-0.066139,-0.033898,0.018749,0.025620,...,-0.113455,0.049307,0.054361,-0.153114,0.013840,0.018033,0.021221,0.066086,-0.009430,-0.106007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N370W_E800K,0.149530,-0.186386,0.027508,-0.056005,-0.138623,-0.018279,-0.067522,-0.031717,0.020284,0.025980,...,-0.112312,0.048852,0.053360,-0.152884,0.014017,0.017770,0.019846,0.064313,-0.010903,-0.107967
V735S_E643R,0.146994,-0.183297,0.031044,-0.058928,-0.135666,-0.017855,-0.065131,-0.030022,0.017942,0.027027,...,-0.110571,0.049618,0.056938,-0.153973,0.014565,0.019396,0.017271,0.066180,-0.009771,-0.102756
S12N_V567R,0.148666,-0.180093,0.030408,-0.057733,-0.138956,-0.015145,-0.065848,-0.027081,0.016528,0.025192,...,-0.113051,0.052754,0.056733,-0.155480,0.015638,0.021286,0.014874,0.063751,-0.004830,-0.107167
A822S_T668E,0.149752,-0.181345,0.032359,-0.057038,-0.138883,-0.015355,-0.065626,-0.026852,0.016490,0.023786,...,-0.113033,0.055047,0.058049,-0.156461,0.013964,0.023030,0.018152,0.062285,-0.000582,-0.106315


In [10]:
# import brenan data
dataset_name = 't7_pol_3rd_esm2_t48_15B_UR50D'
base_path = 't7/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings_3rd = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
embeddings_3rd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
E249C_G47A_E643G,0.145388,-0.182473,0.030933,-0.057587,-0.136714,-0.014844,-0.064533,-0.031373,0.017164,0.026230,...,-0.111851,0.051610,0.056716,-0.154969,0.013676,0.023574,0.018870,0.064054,-0.004956,-0.106245
S12N_G152N_E643K,0.146178,-0.182657,0.029301,-0.057327,-0.136312,-0.017849,-0.066446,-0.030595,0.018808,0.026044,...,-0.112887,0.049075,0.054594,-0.154273,0.015796,0.018125,0.017433,0.066663,-0.008628,-0.105024
G469Q_N370V_P371H,0.150439,-0.181980,0.031870,-0.057191,-0.136717,-0.014373,-0.064152,-0.026287,0.015969,0.024023,...,-0.108456,0.053756,0.060124,-0.157998,0.014422,0.019672,0.017911,0.062661,-0.004515,-0.103642
V735S_G469Q_E643A,0.146080,-0.182656,0.030604,-0.058083,-0.137004,-0.014725,-0.066803,-0.030728,0.015840,0.027208,...,-0.110515,0.051248,0.057642,-0.156481,0.013008,0.021253,0.018647,0.063693,-0.007801,-0.106259
V735S_N370W_E643A,0.148282,-0.184695,0.029915,-0.058435,-0.136788,-0.015158,-0.067774,-0.029087,0.017053,0.026842,...,-0.110024,0.052066,0.056958,-0.156492,0.012951,0.021864,0.018005,0.063723,-0.008293,-0.106579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N370V_T3M_E800K,0.148984,-0.186480,0.029136,-0.053807,-0.136394,-0.018242,-0.065164,-0.032221,0.019003,0.025240,...,-0.111397,0.048904,0.055090,-0.153932,0.014046,0.017207,0.021540,0.065633,-0.010440,-0.105727
E249C_G152N_E643R,0.146045,-0.183159,0.029730,-0.059345,-0.135592,-0.016455,-0.064242,-0.030665,0.018225,0.026377,...,-0.112186,0.049158,0.054978,-0.153969,0.015116,0.018102,0.019127,0.067556,-0.008467,-0.104510
E643S_E800K_W797K,0.144825,-0.189710,0.024476,-0.056087,-0.137992,-0.021174,-0.066577,-0.037789,0.019331,0.025880,...,-0.113511,0.044386,0.049532,-0.150469,0.014313,0.015131,0.022993,0.066272,-0.012565,-0.110918
V567R_P371H_E643K,0.148876,-0.183190,0.029164,-0.060743,-0.136209,-0.019567,-0.064900,-0.031300,0.018579,0.027248,...,-0.110836,0.048835,0.056755,-0.154325,0.016498,0.016762,0.016760,0.066997,-0.007244,-0.104669


In [11]:
# convert the indices of embeddings_2nd, embeddings_3rd, embeddings_4th to strings
embeddings.columns = embeddings.columns.astype(str)
embeddings_2nd.columns = embeddings_2nd.columns.astype(str)
embeddings_3rd.columns = embeddings_3rd.columns.astype(str)
# embeddings_4th.columns = embeddings_4th.columns.astype(str)

# remove row that is WT Wild-type sequence
embeddings_2nd = embeddings_2nd.drop('WT Wild-type sequence')
embeddings_3rd = embeddings_3rd.drop('WT Wild-type sequence')
# embeddings_4th = embeddings_4th.drop('WT Wild-type sequence')

# Concatenate along rows
embeddings_full = pd.concat([embeddings, embeddings_2nd, embeddings_3rd], axis=0)
embeddings_full

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT,0.147168,-0.183875,0.031651,-0.052828,-0.141786,-0.013741,-0.061830,-0.028053,0.018626,0.025174,...,-0.110809,0.051659,0.059534,-0.157181,0.013038,0.020313,0.015764,0.064493,-0.008039,-0.108970
M1A,0.147994,-0.184054,0.035075,-0.053447,-0.140782,-0.016417,-0.063300,-0.028770,0.018573,0.022787,...,-0.111060,0.049681,0.060578,-0.156948,0.011523,0.018843,0.014856,0.063135,-0.005746,-0.106902
M1C,0.149553,-0.182680,0.033869,-0.054389,-0.140723,-0.015113,-0.064822,-0.028185,0.018523,0.027269,...,-0.111689,0.051024,0.059965,-0.157671,0.012664,0.019815,0.015817,0.063816,-0.007434,-0.108856
M1D,0.146734,-0.182717,0.034278,-0.052991,-0.140832,-0.015275,-0.063740,-0.028154,0.018495,0.025465,...,-0.111113,0.050927,0.060537,-0.156441,0.011660,0.019746,0.013886,0.063376,-0.006656,-0.107104
M1E,0.149375,-0.184232,0.034520,-0.054731,-0.140585,-0.016518,-0.064851,-0.028408,0.018998,0.025971,...,-0.110941,0.051635,0.058811,-0.157911,0.012537,0.020981,0.014563,0.062794,-0.007955,-0.106604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N370V_T3M_E800K,0.148984,-0.186480,0.029136,-0.053807,-0.136394,-0.018242,-0.065164,-0.032221,0.019003,0.025240,...,-0.111397,0.048904,0.055090,-0.153932,0.014046,0.017207,0.021540,0.065633,-0.010440,-0.105727
E249C_G152N_E643R,0.146045,-0.183159,0.029730,-0.059345,-0.135592,-0.016455,-0.064242,-0.030665,0.018225,0.026377,...,-0.112186,0.049158,0.054978,-0.153969,0.015116,0.018102,0.019127,0.067556,-0.008467,-0.104510
E643S_E800K_W797K,0.144825,-0.189710,0.024476,-0.056087,-0.137992,-0.021174,-0.066577,-0.037789,0.019331,0.025880,...,-0.113511,0.044386,0.049532,-0.150469,0.014313,0.015131,0.022993,0.066272,-0.012565,-0.110918
V567R_P371H_E643K,0.148876,-0.183190,0.029164,-0.060743,-0.136209,-0.019567,-0.064900,-0.031300,0.018579,0.027248,...,-0.110836,0.048835,0.056755,-0.154325,0.016498,0.016762,0.016760,0.066997,-0.007244,-0.104669


In [12]:
base_path = 't7/'
round_file_name_1 = 'T7_Round1.xlsx'
round_file_name_2 = 'T7_Round2.xlsx'
round_file_name_3 = 'T7_Round3.xlsx'
round_file_name_4 = 'T7_Round4.xlsx'
t7_sequence = 'MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFERQLKAGEVADNAAAKPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, t7_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, t7_sequence)
experimental_data_3 = read_experimental_data(base_path, round_file_name_3, t7_sequence)
experimental_data_4 = read_experimental_data(base_path, round_file_name_4, t7_sequence)
print(experimental_data_1)
print(experimental_data_2)
print(experimental_data_3)
print(experimental_data_4)
df_list = [experimental_data_1, experimental_data_2, experimental_data_3, experimental_data_4]

  Variant   fitness updated_variant
0     12N  1.073846            S12N
1     25N  0.677227            A25N
2      WT  1.000000              WT
3     89R  0.740499            F89R
4    134T  1.074891           V134T
5    177L  1.042706           V177L
6    225E  1.075861           G225E
7    241W  0.938351           S241W
8    273H  0.785147           V273H
   Variant   fitness updated_variant
0     249C  1.055202           E249C
1     279S  0.999604           T279S
2     281L  0.589696           I281L
3     229I  0.478365           L229I
4     735S  1.840046           V735S
5     152N  1.908253           G152N
6     822S  1.678626           A822S
7     531T  1.096505           S531T
8     256P  0.600498           T256P
9     469Q  1.319594           G469Q
10    668E  1.279096           T668E
11      WT  1.000000              WT
   Variant   fitness updated_variant
0      10K  0.959845            D10K
1     643N  3.909999           E643N
2     370V  2.600643           N370V
3       5I 

In [13]:
embeddings_full


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT,0.147168,-0.183875,0.031651,-0.052828,-0.141786,-0.013741,-0.061830,-0.028053,0.018626,0.025174,...,-0.110809,0.051659,0.059534,-0.157181,0.013038,0.020313,0.015764,0.064493,-0.008039,-0.108970
M1A,0.147994,-0.184054,0.035075,-0.053447,-0.140782,-0.016417,-0.063300,-0.028770,0.018573,0.022787,...,-0.111060,0.049681,0.060578,-0.156948,0.011523,0.018843,0.014856,0.063135,-0.005746,-0.106902
M1C,0.149553,-0.182680,0.033869,-0.054389,-0.140723,-0.015113,-0.064822,-0.028185,0.018523,0.027269,...,-0.111689,0.051024,0.059965,-0.157671,0.012664,0.019815,0.015817,0.063816,-0.007434,-0.108856
M1D,0.146734,-0.182717,0.034278,-0.052991,-0.140832,-0.015275,-0.063740,-0.028154,0.018495,0.025465,...,-0.111113,0.050927,0.060537,-0.156441,0.011660,0.019746,0.013886,0.063376,-0.006656,-0.107104
M1E,0.149375,-0.184232,0.034520,-0.054731,-0.140585,-0.016518,-0.064851,-0.028408,0.018998,0.025971,...,-0.110941,0.051635,0.058811,-0.157911,0.012537,0.020981,0.014563,0.062794,-0.007955,-0.106604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N370V_T3M_E800K,0.148984,-0.186480,0.029136,-0.053807,-0.136394,-0.018242,-0.065164,-0.032221,0.019003,0.025240,...,-0.111397,0.048904,0.055090,-0.153932,0.014046,0.017207,0.021540,0.065633,-0.010440,-0.105727
E249C_G152N_E643R,0.146045,-0.183159,0.029730,-0.059345,-0.135592,-0.016455,-0.064242,-0.030665,0.018225,0.026377,...,-0.112186,0.049158,0.054978,-0.153969,0.015116,0.018102,0.019127,0.067556,-0.008467,-0.104510
E643S_E800K_W797K,0.144825,-0.189710,0.024476,-0.056087,-0.137992,-0.021174,-0.066577,-0.037789,0.019331,0.025880,...,-0.113511,0.044386,0.049532,-0.150469,0.014313,0.015131,0.022993,0.066272,-0.012565,-0.110918
V567R_P371H_E643K,0.148876,-0.183190,0.029164,-0.060743,-0.136209,-0.019567,-0.064900,-0.031300,0.018579,0.027248,...,-0.110836,0.048835,0.056755,-0.154325,0.016498,0.016762,0.016760,0.066997,-0.007244,-0.104669


In [14]:
iterations_multi_one, labels_multi_one = create_dataframes(df_list, embeddings_full.index)

iteration_old = iterations_multi_one
embeddings_pd = embeddings_full
labels_pd = labels_multi_one
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [15]:
df_all
df_all.to_csv('t7/round5_all_new.csv', index=False)

In [16]:
df_test

Unnamed: 0,variant,y_pred,y_actual
17307,S531T_G47A_E643G,8.173290,
20055,E249C_N370W_E643G,8.043855,
18693,G469Q_G47A_E643G,8.014859,
19399,E738N_W797K_E643G,8.007547,
18421,G47A_E738N_E643G,7.959576,
...,...,...,...
11151,I587V,0.904100,
14513,N764T,0.897736,
4349,L229V,0.897673,
4561,S241A,0.897293,


In [17]:
# write the dataframe to a csv file
df_test.to_csv('t7/round5_predictions_new.csv', index=False)

## T7 round multi

In [18]:
# import brenan data
dataset_name = 'esm2_15B_t7_pol'
base_path = 't7/'
file_type = 'pts'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT,0.147168,-0.183875,0.031651,-0.052828,-0.141786,-0.013741,-0.061830,-0.028053,0.018626,0.025174,...,-0.110809,0.051659,0.059534,-0.157181,0.013038,0.020313,0.015764,0.064493,-0.008039,-0.108970
M1A,0.147994,-0.184054,0.035075,-0.053447,-0.140782,-0.016417,-0.063300,-0.028770,0.018573,0.022787,...,-0.111060,0.049681,0.060578,-0.156948,0.011523,0.018843,0.014856,0.063135,-0.005746,-0.106902
M1C,0.149553,-0.182680,0.033869,-0.054389,-0.140723,-0.015113,-0.064822,-0.028185,0.018523,0.027269,...,-0.111689,0.051024,0.059965,-0.157671,0.012664,0.019815,0.015817,0.063816,-0.007434,-0.108856
M1D,0.146734,-0.182717,0.034278,-0.052991,-0.140832,-0.015275,-0.063740,-0.028154,0.018495,0.025465,...,-0.111113,0.050927,0.060537,-0.156441,0.011660,0.019746,0.013886,0.063376,-0.006656,-0.107104
M1E,0.149375,-0.184232,0.034520,-0.054731,-0.140585,-0.016518,-0.064851,-0.028408,0.018998,0.025971,...,-0.110941,0.051635,0.058811,-0.157911,0.012537,0.020981,0.014563,0.062794,-0.007955,-0.106604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A883S,0.145864,-0.184623,0.031468,-0.051257,-0.142275,-0.012640,-0.061425,-0.027589,0.017693,0.025046,...,-0.110233,0.051852,0.058792,-0.157249,0.013417,0.018916,0.019343,0.065511,-0.009308,-0.109120
A883T,0.145933,-0.184290,0.031539,-0.051148,-0.141504,-0.012715,-0.062040,-0.028451,0.017504,0.024164,...,-0.110211,0.052196,0.058730,-0.158339,0.012717,0.018639,0.020539,0.065699,-0.010361,-0.109183
A883V,0.145253,-0.182491,0.031516,-0.052928,-0.142025,-0.013264,-0.060926,-0.027928,0.017077,0.024314,...,-0.111040,0.051419,0.060660,-0.157816,0.012737,0.018943,0.017333,0.065630,-0.008499,-0.109677
A883W,0.145614,-0.183985,0.031491,-0.051818,-0.142407,-0.012759,-0.059910,-0.027657,0.016999,0.023699,...,-0.110127,0.052603,0.059757,-0.156924,0.013284,0.018394,0.016497,0.064918,-0.009098,-0.108907


In [19]:
# import brenan data
dataset_name = 't7_pol_2nd_esm2_t48_15B_UR50D'
base_path = 't7/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings_2nd = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
embeddings_2nd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
G469Q_E643T,0.145889,-0.182790,0.030785,-0.057693,-0.137158,-0.015250,-0.065644,-0.030562,0.015721,0.026794,...,-0.110929,0.050821,0.056755,-0.156186,0.014171,0.021422,0.019437,0.063799,-0.007937,-0.107058
V567R_E800K,0.148210,-0.184537,0.027006,-0.057041,-0.138845,-0.019332,-0.066518,-0.032812,0.019602,0.026675,...,-0.113203,0.047227,0.052569,-0.151784,0.015184,0.016786,0.018867,0.065029,-0.010083,-0.109038
E643A_E800K,0.145833,-0.187117,0.026528,-0.056519,-0.137344,-0.019379,-0.067501,-0.036029,0.018969,0.026877,...,-0.112779,0.045624,0.052394,-0.152478,0.013244,0.017361,0.021519,0.065539,-0.011579,-0.109093
T3M_N370W,0.150616,-0.182953,0.033280,-0.056339,-0.136728,-0.014619,-0.065460,-0.027623,0.017054,0.025113,...,-0.111436,0.055045,0.058565,-0.157827,0.013891,0.022742,0.018364,0.064181,-0.005876,-0.104856
T3M_E800K,0.148118,-0.185680,0.030040,-0.054539,-0.137180,-0.018564,-0.066139,-0.033898,0.018749,0.025620,...,-0.113455,0.049307,0.054361,-0.153114,0.013840,0.018033,0.021221,0.066086,-0.009430,-0.106007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N370W_E800K,0.149530,-0.186386,0.027508,-0.056005,-0.138623,-0.018279,-0.067522,-0.031717,0.020284,0.025980,...,-0.112312,0.048852,0.053360,-0.152884,0.014017,0.017770,0.019846,0.064313,-0.010903,-0.107967
V735S_E643R,0.146994,-0.183297,0.031044,-0.058928,-0.135666,-0.017855,-0.065131,-0.030022,0.017942,0.027027,...,-0.110571,0.049618,0.056938,-0.153973,0.014565,0.019396,0.017271,0.066180,-0.009771,-0.102756
S12N_V567R,0.148666,-0.180093,0.030408,-0.057733,-0.138956,-0.015145,-0.065848,-0.027081,0.016528,0.025192,...,-0.113051,0.052754,0.056733,-0.155480,0.015638,0.021286,0.014874,0.063751,-0.004830,-0.107167
A822S_T668E,0.149752,-0.181345,0.032359,-0.057038,-0.138883,-0.015355,-0.065626,-0.026852,0.016490,0.023786,...,-0.113033,0.055047,0.058049,-0.156461,0.013964,0.023030,0.018152,0.062285,-0.000582,-0.106315


In [20]:
# import brenan data
dataset_name = 't7_pol_3rd_esm2_t48_15B_UR50D'
base_path = 't7/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings_3rd = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
embeddings_3rd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
E249C_G47A_E643G,0.145388,-0.182473,0.030933,-0.057587,-0.136714,-0.014844,-0.064533,-0.031373,0.017164,0.026230,...,-0.111851,0.051610,0.056716,-0.154969,0.013676,0.023574,0.018870,0.064054,-0.004956,-0.106245
S12N_G152N_E643K,0.146178,-0.182657,0.029301,-0.057327,-0.136312,-0.017849,-0.066446,-0.030595,0.018808,0.026044,...,-0.112887,0.049075,0.054594,-0.154273,0.015796,0.018125,0.017433,0.066663,-0.008628,-0.105024
G469Q_N370V_P371H,0.150439,-0.181980,0.031870,-0.057191,-0.136717,-0.014373,-0.064152,-0.026287,0.015969,0.024023,...,-0.108456,0.053756,0.060124,-0.157998,0.014422,0.019672,0.017911,0.062661,-0.004515,-0.103642
V735S_G469Q_E643A,0.146080,-0.182656,0.030604,-0.058083,-0.137004,-0.014725,-0.066803,-0.030728,0.015840,0.027208,...,-0.110515,0.051248,0.057642,-0.156481,0.013008,0.021253,0.018647,0.063693,-0.007801,-0.106259
V735S_N370W_E643A,0.148282,-0.184695,0.029915,-0.058435,-0.136788,-0.015158,-0.067774,-0.029087,0.017053,0.026842,...,-0.110024,0.052066,0.056958,-0.156492,0.012951,0.021864,0.018005,0.063723,-0.008293,-0.106579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N370V_T3M_E800K,0.148984,-0.186480,0.029136,-0.053807,-0.136394,-0.018242,-0.065164,-0.032221,0.019003,0.025240,...,-0.111397,0.048904,0.055090,-0.153932,0.014046,0.017207,0.021540,0.065633,-0.010440,-0.105727
E249C_G152N_E643R,0.146045,-0.183159,0.029730,-0.059345,-0.135592,-0.016455,-0.064242,-0.030665,0.018225,0.026377,...,-0.112186,0.049158,0.054978,-0.153969,0.015116,0.018102,0.019127,0.067556,-0.008467,-0.104510
E643S_E800K_W797K,0.144825,-0.189710,0.024476,-0.056087,-0.137992,-0.021174,-0.066577,-0.037789,0.019331,0.025880,...,-0.113511,0.044386,0.049532,-0.150469,0.014313,0.015131,0.022993,0.066272,-0.012565,-0.110918
V567R_P371H_E643K,0.148876,-0.183190,0.029164,-0.060743,-0.136209,-0.019567,-0.064900,-0.031300,0.018579,0.027248,...,-0.110836,0.048835,0.056755,-0.154325,0.016498,0.016762,0.016760,0.066997,-0.007244,-0.104669


In [21]:
# convert the indices of embeddings_2nd, embeddings_3rd, embeddings_4th to strings
embeddings.columns = embeddings.columns.astype(str)
embeddings_2nd.columns = embeddings_2nd.columns.astype(str)
embeddings_3rd.columns = embeddings_3rd.columns.astype(str)
# embeddings_4th.columns = embeddings_4th.columns.astype(str)

# remove row that is WT Wild-type sequence
embeddings_2nd = embeddings_2nd.drop('WT Wild-type sequence')
embeddings_3rd = embeddings_3rd.drop('WT Wild-type sequence')
# embeddings_4th = embeddings_4th.drop('WT Wild-type sequence')

# Concatenate along rows
embeddings_full = pd.concat([embeddings, embeddings_2nd, embeddings_3rd], axis=0)
embeddings_full

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT,0.147168,-0.183875,0.031651,-0.052828,-0.141786,-0.013741,-0.061830,-0.028053,0.018626,0.025174,...,-0.110809,0.051659,0.059534,-0.157181,0.013038,0.020313,0.015764,0.064493,-0.008039,-0.108970
M1A,0.147994,-0.184054,0.035075,-0.053447,-0.140782,-0.016417,-0.063300,-0.028770,0.018573,0.022787,...,-0.111060,0.049681,0.060578,-0.156948,0.011523,0.018843,0.014856,0.063135,-0.005746,-0.106902
M1C,0.149553,-0.182680,0.033869,-0.054389,-0.140723,-0.015113,-0.064822,-0.028185,0.018523,0.027269,...,-0.111689,0.051024,0.059965,-0.157671,0.012664,0.019815,0.015817,0.063816,-0.007434,-0.108856
M1D,0.146734,-0.182717,0.034278,-0.052991,-0.140832,-0.015275,-0.063740,-0.028154,0.018495,0.025465,...,-0.111113,0.050927,0.060537,-0.156441,0.011660,0.019746,0.013886,0.063376,-0.006656,-0.107104
M1E,0.149375,-0.184232,0.034520,-0.054731,-0.140585,-0.016518,-0.064851,-0.028408,0.018998,0.025971,...,-0.110941,0.051635,0.058811,-0.157911,0.012537,0.020981,0.014563,0.062794,-0.007955,-0.106604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N370V_T3M_E800K,0.148984,-0.186480,0.029136,-0.053807,-0.136394,-0.018242,-0.065164,-0.032221,0.019003,0.025240,...,-0.111397,0.048904,0.055090,-0.153932,0.014046,0.017207,0.021540,0.065633,-0.010440,-0.105727
E249C_G152N_E643R,0.146045,-0.183159,0.029730,-0.059345,-0.135592,-0.016455,-0.064242,-0.030665,0.018225,0.026377,...,-0.112186,0.049158,0.054978,-0.153969,0.015116,0.018102,0.019127,0.067556,-0.008467,-0.104510
E643S_E800K_W797K,0.144825,-0.189710,0.024476,-0.056087,-0.137992,-0.021174,-0.066577,-0.037789,0.019331,0.025880,...,-0.113511,0.044386,0.049532,-0.150469,0.014313,0.015131,0.022993,0.066272,-0.012565,-0.110918
V567R_P371H_E643K,0.148876,-0.183190,0.029164,-0.060743,-0.136209,-0.019567,-0.064900,-0.031300,0.018579,0.027248,...,-0.110836,0.048835,0.056755,-0.154325,0.016498,0.016762,0.016760,0.066997,-0.007244,-0.104669


In [39]:
base_path = 't7/'
round_file_name_1 = 'T7_Round1.xlsx'
round_file_name_2 = 'T7_Round2.xlsx'
round_file_name_3 = 'T7_Round3.xlsx'
round_file_name_4 = 'T7_Round4.xlsx'
round_file_name_5 = 'T7_Round5.xlsx'
t7_sequence = 'MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFERQLKAGEVADNAAAKPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, t7_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, t7_sequence)
experimental_data_3 = read_experimental_data(base_path, round_file_name_3, t7_sequence)
experimental_data_4 = read_experimental_data(base_path, round_file_name_4, t7_sequence)
experimental_data_5 = read_experimental_data(base_path, round_file_name_5, t7_sequence, single_mutant=False)
print(experimental_data_1)
print(experimental_data_2)
print(experimental_data_3)
print(experimental_data_4)
print(experimental_data_5)
df_list = [experimental_data_1, experimental_data_2, experimental_data_3, experimental_data_4, experimental_data_5]

  Variant   fitness updated_variant
0     12N  1.073846            S12N
1     25N  0.677227            A25N
2      WT  1.000000              WT
3     89R  0.740499            F89R
4    134T  1.074891           V134T
5    177L  1.042706           V177L
6    225E  1.075861           G225E
7    241W  0.938351           S241W
8    273H  0.785147           V273H
   Variant   fitness updated_variant
0     249C  1.055202           E249C
1     279S  0.999604           T279S
2     281L  0.589696           I281L
3     229I  0.478365           L229I
4     735S  1.840046           V735S
5     152N  1.908253           G152N
6     822S  1.678626           A822S
7     531T  1.096505           S531T
8     256P  0.600498           T256P
9     469Q  1.319594           G469Q
10    668E  1.279096           T668E
11      WT  1.000000              WT
   Variant   fitness updated_variant
0      10K  0.959845            D10K
1     643N  3.909999           E643N
2     370V  2.600643           N370V
3       5I 

In [40]:
embeddings_full


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT,0.147168,-0.183875,0.031651,-0.052828,-0.141786,-0.013741,-0.061830,-0.028053,0.018626,0.025174,...,-0.110809,0.051659,0.059534,-0.157181,0.013038,0.020313,0.015764,0.064493,-0.008039,-0.108970
M1A,0.147994,-0.184054,0.035075,-0.053447,-0.140782,-0.016417,-0.063300,-0.028770,0.018573,0.022787,...,-0.111060,0.049681,0.060578,-0.156948,0.011523,0.018843,0.014856,0.063135,-0.005746,-0.106902
M1C,0.149553,-0.182680,0.033869,-0.054389,-0.140723,-0.015113,-0.064822,-0.028185,0.018523,0.027269,...,-0.111689,0.051024,0.059965,-0.157671,0.012664,0.019815,0.015817,0.063816,-0.007434,-0.108856
M1D,0.146734,-0.182717,0.034278,-0.052991,-0.140832,-0.015275,-0.063740,-0.028154,0.018495,0.025465,...,-0.111113,0.050927,0.060537,-0.156441,0.011660,0.019746,0.013886,0.063376,-0.006656,-0.107104
M1E,0.149375,-0.184232,0.034520,-0.054731,-0.140585,-0.016518,-0.064851,-0.028408,0.018998,0.025971,...,-0.110941,0.051635,0.058811,-0.157911,0.012537,0.020981,0.014563,0.062794,-0.007955,-0.106604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N370V_T3M_E800K,0.148984,-0.186480,0.029136,-0.053807,-0.136394,-0.018242,-0.065164,-0.032221,0.019003,0.025240,...,-0.111397,0.048904,0.055090,-0.153932,0.014046,0.017207,0.021540,0.065633,-0.010440,-0.105727
E249C_G152N_E643R,0.146045,-0.183159,0.029730,-0.059345,-0.135592,-0.016455,-0.064242,-0.030665,0.018225,0.026377,...,-0.112186,0.049158,0.054978,-0.153969,0.015116,0.018102,0.019127,0.067556,-0.008467,-0.104510
E643S_E800K_W797K,0.144825,-0.189710,0.024476,-0.056087,-0.137992,-0.021174,-0.066577,-0.037789,0.019331,0.025880,...,-0.113511,0.044386,0.049532,-0.150469,0.014313,0.015131,0.022993,0.066272,-0.012565,-0.110918
V567R_P371H_E643K,0.148876,-0.183190,0.029164,-0.060743,-0.136209,-0.019567,-0.064900,-0.031300,0.018579,0.027248,...,-0.110836,0.048835,0.056755,-0.154325,0.016498,0.016762,0.016760,0.066997,-0.007244,-0.104669


In [41]:
iterations_multi_two, labels_multi_two = create_dataframes(df_list, embeddings_full.index)

iteration_old = iterations_multi_two
embeddings_pd = embeddings_full
labels_pd = labels_multi_two
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [42]:
df_all
df_all.to_csv('t7/round6_all_new.csv', index=False)

In [43]:
df_test

Unnamed: 0,variant,y_pred,y_actual
17458,V177L_G47A_E643G,16.017540,
18196,T3M_G47A_E643G,15.644449,
18891,T668E_G47A_E643G,15.529111,
18797,V134T_G47A_E643G,15.290023,
18867,S12N_G47A_E643G,15.096458,
...,...,...,...
5014,I264V,0.925190,
5129,P270W,0.924973,
10828,I570V,0.908968,
3627,L191V,0.908279,


In [44]:
# write the dataframe to a csv file
df_test.to_csv('t7/round6_predictions_new.csv', index=False)

## Fanzor round 1

In [4]:
# import brenan data
dataset_name = 'fanzor_esm2_t48_15B_UR50D'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/fanzor/'
file_type = 'csvs'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, experimental=experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
D289I,0.076350,-0.096517,-0.039485,0.022141,-0.140778,0.060272,-0.157692,0.053028,-0.101141,-0.058941,...,-0.163110,-0.056517,0.018743,-0.116311,0.010050,-0.082736,-0.068338,0.026717,-0.135575,0.090331
K54A,0.081564,-0.108703,-0.035292,0.022103,-0.143718,0.055708,-0.158636,0.051587,-0.099935,-0.056645,...,-0.159871,-0.058828,0.010209,-0.116926,0.010969,-0.086742,-0.072210,0.026919,-0.131113,0.088953
Y403W,0.083564,-0.103152,-0.035773,0.019590,-0.140105,0.058891,-0.161704,0.049788,-0.096187,-0.057417,...,-0.163753,-0.054475,0.013977,-0.118503,0.010152,-0.083362,-0.071264,0.024228,-0.136338,0.091752
N441W,0.085060,-0.102434,-0.034094,0.019113,-0.140549,0.057376,-0.160521,0.053643,-0.102079,-0.058173,...,-0.163639,-0.054493,0.015502,-0.117870,0.010070,-0.085296,-0.072524,0.028138,-0.135484,0.088637
E336I,0.081061,-0.103307,-0.037303,0.019810,-0.141395,0.060582,-0.161910,0.051436,-0.100989,-0.056924,...,-0.159600,-0.057073,0.012353,-0.119047,0.011846,-0.084845,-0.072005,0.029802,-0.134100,0.089094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D132W,0.079420,-0.105425,-0.030861,0.021674,-0.141255,0.057854,-0.160459,0.051246,-0.100951,-0.055793,...,-0.163258,-0.056315,0.014693,-0.118476,0.013192,-0.081750,-0.067947,0.028840,-0.137237,0.088986
R263G,0.083450,-0.106207,-0.033075,0.021937,-0.141942,0.058554,-0.160312,0.051613,-0.098827,-0.060687,...,-0.162723,-0.054210,0.014769,-0.119330,0.010308,-0.084599,-0.070771,0.026594,-0.133062,0.091404
V123P,0.082387,-0.104821,-0.032308,0.020482,-0.139570,0.055889,-0.161555,0.051725,-0.097712,-0.054436,...,-0.163546,-0.057811,0.011002,-0.118673,0.012264,-0.085351,-0.068471,0.026130,-0.137345,0.090820
I355T,0.082915,-0.105555,-0.036674,0.019969,-0.141193,0.058157,-0.162853,0.050937,-0.098292,-0.057070,...,-0.162178,-0.056842,0.014052,-0.117169,0.009596,-0.084980,-0.070661,0.026135,-0.134839,0.087683


In [6]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/fanzor/'
round_file_name = 'fanzor_Round1.xlsx'
fanzor_sequence = 'MKRKREDLTLWDAANVHKHKSMWYWWEYIRRKDMVNHEKTDCDVIQLLQSASVKKQKTQSDKFLTSFSVGIRPTKHQKRVLNEMLRVSNYTYNWCLWLVNEKGLKPHQFELQKIVCKTNANDVDPQYRMENDDWFFNNKMTSVKLTSCKNFCTSYKSAKSLKSKLKRPMSVSNIIQGSFCVPKLFIRHLSSKDVSTDNTNMQNRYICMMPDNFEKRSNPKERFLKLAKPITKIPPIDHDVKIVKRADGMFIMNIPCDPKYTRRNASNDTIEKRVCGIDPGGRTFATVYDPIDCCVFQVGIKEDKQYVISKLHNKIDHAHMHLTKAQNKKQQQAARERIVSLKKTHLKLKTFVDDIHLKLSSHLVKEYQYVALGKINVAQLVKTDRPKPLSKRAKRDLLYWQHYRFRQRLTHRTTNTECILDVQNEAYTSKTCGVCGTINKNLEKSETFYCDQCKYNTHRDVNGARNILLKSLRMFPFEKQQQ'
experimental_data = read_experimental_data(base_path, round_file_name, fanzor_sequence)
print(experimental_data)
df_list = [experimental_data]

  Variant   fitness updated_variant
0     27T  0.579502            E27T
1     20M  0.804204            K20M
2     22V  0.867465            M22V
3     46E  0.790334            Q46E
4     58Q  0.812868            T58Q
5     69M  0.434715            V69M
6     93A  0.976318            N93A
7    109N  0.428318           F109N
8     78T  0.568579            K78T
9      WT  1.000000              WT


In [7]:
iterations_one, labels_one = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_one
embeddings_pd = embeddings
labels_pd = labels_one
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [8]:
df_all
df_all.to_csv('fanzor/round1_all_new.csv', index=False)

In [9]:
df_test

Unnamed: 0,variant,y_pred,y_actual
7463,T199E,0.836856,
1286,H17E,0.835102,
2006,Q176E,0.833198,
163,M320L,0.829511,
2357,T196S,0.829295,
...,...,...,...
435,F109S,0.602243,
2454,F109A,0.592755,
2870,F109T,0.575805,
7667,F109H,0.569451,


In [10]:
# write the dataframe to a csv file
df_test.to_csv('fanzor/round1_predictions_new.csv', index=False)

## Fanzor round 2

In [12]:
# import brenan data
dataset_name = 'fanzor_esm2_t48_15B_UR50D'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/fanzor/'
file_type = 'csvs'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, experimental=experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [13]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
D289I,0.076350,-0.096517,-0.039485,0.022141,-0.140778,0.060272,-0.157692,0.053028,-0.101141,-0.058941,...,-0.163110,-0.056517,0.018743,-0.116311,0.010050,-0.082736,-0.068338,0.026717,-0.135575,0.090331
K54A,0.081564,-0.108703,-0.035292,0.022103,-0.143718,0.055708,-0.158636,0.051587,-0.099935,-0.056645,...,-0.159871,-0.058828,0.010209,-0.116926,0.010969,-0.086742,-0.072210,0.026919,-0.131113,0.088953
Y403W,0.083564,-0.103152,-0.035773,0.019590,-0.140105,0.058891,-0.161704,0.049788,-0.096187,-0.057417,...,-0.163753,-0.054475,0.013977,-0.118503,0.010152,-0.083362,-0.071264,0.024228,-0.136338,0.091752
N441W,0.085060,-0.102434,-0.034094,0.019113,-0.140549,0.057376,-0.160521,0.053643,-0.102079,-0.058173,...,-0.163639,-0.054493,0.015502,-0.117870,0.010070,-0.085296,-0.072524,0.028138,-0.135484,0.088637
E336I,0.081061,-0.103307,-0.037303,0.019810,-0.141395,0.060582,-0.161910,0.051436,-0.100989,-0.056924,...,-0.159600,-0.057073,0.012353,-0.119047,0.011846,-0.084845,-0.072005,0.029802,-0.134100,0.089094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D132W,0.079420,-0.105425,-0.030861,0.021674,-0.141255,0.057854,-0.160459,0.051246,-0.100951,-0.055793,...,-0.163258,-0.056315,0.014693,-0.118476,0.013192,-0.081750,-0.067947,0.028840,-0.137237,0.088986
R263G,0.083450,-0.106207,-0.033075,0.021937,-0.141942,0.058554,-0.160312,0.051613,-0.098827,-0.060687,...,-0.162723,-0.054210,0.014769,-0.119330,0.010308,-0.084599,-0.070771,0.026594,-0.133062,0.091404
V123P,0.082387,-0.104821,-0.032308,0.020482,-0.139570,0.055889,-0.161555,0.051725,-0.097712,-0.054436,...,-0.163546,-0.057811,0.011002,-0.118673,0.012264,-0.085351,-0.068471,0.026130,-0.137345,0.090820
I355T,0.082915,-0.105555,-0.036674,0.019969,-0.141193,0.058157,-0.162853,0.050937,-0.098292,-0.057070,...,-0.162178,-0.056842,0.014052,-0.117169,0.009596,-0.084980,-0.070661,0.026135,-0.134839,0.087683


In [14]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/fanzor/'
round_file_name_1 = 'fanzor_Round1.xlsx'
round_file_name_2 = 'fanzor_Round2.xlsx'
fanzor_sequence = 'MKRKREDLTLWDAANVHKHKSMWYWWEYIRRKDMVNHEKTDCDVIQLLQSASVKKQKTQSDKFLTSFSVGIRPTKHQKRVLNEMLRVSNYTYNWCLWLVNEKGLKPHQFELQKIVCKTNANDVDPQYRMENDDWFFNNKMTSVKLTSCKNFCTSYKSAKSLKSKLKRPMSVSNIIQGSFCVPKLFIRHLSSKDVSTDNTNMQNRYICMMPDNFEKRSNPKERFLKLAKPITKIPPIDHDVKIVKRADGMFIMNIPCDPKYTRRNASNDTIEKRVCGIDPGGRTFATVYDPIDCCVFQVGIKEDKQYVISKLHNKIDHAHMHLTKAQNKKQQQAARERIVSLKKTHLKLKTFVDDIHLKLSSHLVKEYQYVALGKINVAQLVKTDRPKPLSKRAKRDLLYWQHYRFRQRLTHRTTNTECILDVQNEAYTSKTCGVCGTINKNLEKSETFYCDQCKYNTHRDVNGARNILLKSLRMFPFEKQQQ'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, fanzor_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, fanzor_sequence)
print(experimental_data_1)
print(experimental_data_2)
df_list = [experimental_data_1, experimental_data_2]

  Variant   fitness updated_variant
0     27T  0.579502            E27T
1     20M  0.804204            K20M
2     22V  0.867465            M22V
3     46E  0.790334            Q46E
4     58Q  0.812868            T58Q
5     69M  0.434715            V69M
6     93A  0.976318            N93A
7    109N  0.428318           F109N
8     78T  0.568579            K78T
9      WT  1.000000              WT
   Variant   fitness updated_variant
0     214D  0.997700           E214D
1      70C  0.935783            G70C
2     131H  0.397414           N131H
3     238F  0.713195           H238F
4     223H  0.742610           F223H
5     128L  0.009221           R128L
6     239V  0.737319           D239V
7      40D  0.547851            T40D
8     282Y  0.805207           R282Y
9     466K  0.037661           N466K
10    379A  0.039456           Q379A
11    440N  1.164566           K440N
12      WT  1.000010              WT


In [15]:
iterations_two, labels_two = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_two
embeddings_pd = embeddings
labels_pd = labels_two
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [16]:
df_all
df_all.to_csv('fanzor/round2_all_new.csv', index=False)

In [17]:
df_test

Unnamed: 0,variant,y_pred,y_actual
97,T231N,0.875245,
8939,K192N,0.873594,
503,M129I,0.861280,
4714,K192A,0.858800,
8812,S191D,0.855105,
...,...,...,...
3807,F109E,0.421076,
8236,Q401L,0.406626,
121,D278Y,0.403477,
1112,D278N,0.401153,


In [18]:
# write the dataframe to a csv file
df_test.to_csv('fanzor/round2_predictions_new.csv', index=False)

## Fanzor round 3

In [4]:
# import brenan data
dataset_name = 'fanzor_esm2_t48_15B_UR50D'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/fanzor/'
file_type = 'csvs'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, experimental=experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
D289I,0.076350,-0.096517,-0.039485,0.022141,-0.140778,0.060272,-0.157692,0.053028,-0.101141,-0.058941,...,-0.163110,-0.056517,0.018743,-0.116311,0.010050,-0.082736,-0.068338,0.026717,-0.135575,0.090331
K54A,0.081564,-0.108703,-0.035292,0.022103,-0.143718,0.055708,-0.158636,0.051587,-0.099935,-0.056645,...,-0.159871,-0.058828,0.010209,-0.116926,0.010969,-0.086742,-0.072210,0.026919,-0.131113,0.088953
Y403W,0.083564,-0.103152,-0.035773,0.019590,-0.140105,0.058891,-0.161704,0.049788,-0.096187,-0.057417,...,-0.163753,-0.054475,0.013977,-0.118503,0.010152,-0.083362,-0.071264,0.024228,-0.136338,0.091752
N441W,0.085060,-0.102434,-0.034094,0.019113,-0.140549,0.057376,-0.160521,0.053643,-0.102079,-0.058173,...,-0.163639,-0.054493,0.015502,-0.117870,0.010070,-0.085296,-0.072524,0.028138,-0.135484,0.088637
E336I,0.081061,-0.103307,-0.037303,0.019810,-0.141395,0.060582,-0.161910,0.051436,-0.100989,-0.056924,...,-0.159600,-0.057073,0.012353,-0.119047,0.011846,-0.084845,-0.072005,0.029802,-0.134100,0.089094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D132W,0.079420,-0.105425,-0.030861,0.021674,-0.141255,0.057854,-0.160459,0.051246,-0.100951,-0.055793,...,-0.163258,-0.056315,0.014693,-0.118476,0.013192,-0.081750,-0.067947,0.028840,-0.137237,0.088986
R263G,0.083450,-0.106207,-0.033075,0.021937,-0.141942,0.058554,-0.160312,0.051613,-0.098827,-0.060687,...,-0.162723,-0.054210,0.014769,-0.119330,0.010308,-0.084599,-0.070771,0.026594,-0.133062,0.091404
V123P,0.082387,-0.104821,-0.032308,0.020482,-0.139570,0.055889,-0.161555,0.051725,-0.097712,-0.054436,...,-0.163546,-0.057811,0.011002,-0.118673,0.012264,-0.085351,-0.068471,0.026130,-0.137345,0.090820
I355T,0.082915,-0.105555,-0.036674,0.019969,-0.141193,0.058157,-0.162853,0.050937,-0.098292,-0.057070,...,-0.162178,-0.056842,0.014052,-0.117169,0.009596,-0.084980,-0.070661,0.026135,-0.134839,0.087683


In [6]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/fanzor/'
round_file_name_1 = 'fanzor_Round1.xlsx'
round_file_name_2 = 'fanzor_Round2.xlsx'
round_file_name_3 = 'fanzor_Round3.xlsx'
fanzor_sequence = 'MKRKREDLTLWDAANVHKHKSMWYWWEYIRRKDMVNHEKTDCDVIQLLQSASVKKQKTQSDKFLTSFSVGIRPTKHQKRVLNEMLRVSNYTYNWCLWLVNEKGLKPHQFELQKIVCKTNANDVDPQYRMENDDWFFNNKMTSVKLTSCKNFCTSYKSAKSLKSKLKRPMSVSNIIQGSFCVPKLFIRHLSSKDVSTDNTNMQNRYICMMPDNFEKRSNPKERFLKLAKPITKIPPIDHDVKIVKRADGMFIMNIPCDPKYTRRNASNDTIEKRVCGIDPGGRTFATVYDPIDCCVFQVGIKEDKQYVISKLHNKIDHAHMHLTKAQNKKQQQAARERIVSLKKTHLKLKTFVDDIHLKLSSHLVKEYQYVALGKINVAQLVKTDRPKPLSKRAKRDLLYWQHYRFRQRLTHRTTNTECILDVQNEAYTSKTCGVCGTINKNLEKSETFYCDQCKYNTHRDVNGARNILLKSLRMFPFEKQQQ'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, fanzor_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, fanzor_sequence)
experimental_data_3 = read_experimental_data(base_path, round_file_name_3, fanzor_sequence)
print(experimental_data_1)
print(experimental_data_2)
print(experimental_data_3)
df_list = [experimental_data_1, experimental_data_2, experimental_data_3]

  Variant   fitness updated_variant
0     27T  0.579502            E27T
1     20M  0.804204            K20M
2     22V  0.867465            M22V
3     46E  0.790334            Q46E
4     58Q  0.812868            T58Q
5     69M  0.434715            V69M
6     93A  0.976318            N93A
7    109N  0.428318           F109N
8     78T  0.568579            K78T
9      WT  1.000000              WT
   Variant   fitness updated_variant
0     214D  0.997700           E214D
1      70C  0.935783            G70C
2     131H  0.397414           N131H
3     238F  0.713195           H238F
4     223H  0.742610           F223H
5     128L  0.009221           R128L
6     239V  0.737319           D239V
7      40D  0.547851            T40D
8     282Y  0.805207           R282Y
9     466K  0.037661           N466K
10    379A  0.039456           Q379A
11    440N  1.164566           K440N
12      WT  1.000010              WT
   Variant   fitness updated_variant
0     231N  0.718384           T231N
1     192N  

In [8]:
iterations_three, labels_three = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_three
embeddings_pd = embeddings
labels_pd = labels_three
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [9]:
df_all
df_all.to_csv('fanzor/round3_all_new.csv', index=False)

In [10]:
df_test

Unnamed: 0,variant,y_pred,y_actual
2008,K192T,1.031431,
3469,K329Q,0.984432,
3945,K440S,0.976134,
1687,K220S,0.962656,
415,K220A,0.960266,
...,...,...,...
1170,D278G,0.421623,
1112,D278N,0.413717,
2454,F109A,0.411215,
3807,F109E,0.409067,


In [11]:
# write the dataframe to a csv file
df_test.to_csv('fanzor/round3_predictions_new.csv', index=False)

## Bxb1 round 1 (pretrained)

In [42]:
# import brenan data
dataset_name = 'bxb1_esm2_t48_15B_UR50D'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/bxb1/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [43]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
P52L,0.155984,-0.116075,0.148573,-0.028123,-0.128543,-0.084058,-0.072859,-0.046838,-0.002415,-0.005118,...,0.000492,0.028932,0.044435,-0.114348,-0.081508,-0.014825,-0.013309,0.009477,0.016132,-0.005828
R461Y,0.156687,-0.112199,0.145999,-0.024424,-0.130263,-0.083671,-0.071455,-0.046674,-0.005362,-0.004633,...,0.001077,0.030514,0.043889,-0.111547,-0.080573,-0.015240,-0.016286,0.011181,0.015980,-0.006640
R85A,0.154052,-0.115900,0.144834,-0.026544,-0.129637,-0.081927,-0.072439,-0.048122,-0.004115,-0.006136,...,0.003807,0.027205,0.042758,-0.113761,-0.078599,-0.019909,-0.019584,0.008901,0.016665,-0.006007
A315M,0.156422,-0.111366,0.145899,-0.025250,-0.130326,-0.082279,-0.071448,-0.045753,-0.003816,-0.005938,...,0.000151,0.029420,0.046772,-0.111136,-0.079974,-0.015162,-0.016042,0.010557,0.016900,-0.007189
R79C,0.154901,-0.110541,0.142594,-0.029071,-0.130484,-0.079025,-0.078174,-0.045398,-0.005141,-0.009236,...,-0.001456,0.027167,0.048554,-0.110430,-0.082701,-0.014656,-0.014808,0.012512,0.019153,-0.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P290D,0.156653,-0.111046,0.146798,-0.025523,-0.129217,-0.082911,-0.070534,-0.045017,-0.003130,-0.007678,...,0.000111,0.029905,0.048076,-0.112167,-0.079270,-0.014585,-0.017037,0.011904,0.019825,-0.004860
R487F,0.159038,-0.109513,0.146208,-0.024718,-0.131947,-0.081873,-0.071920,-0.047177,-0.005338,-0.006448,...,0.000096,0.029832,0.045403,-0.111501,-0.081506,-0.016406,-0.016590,0.010052,0.017690,-0.006641
K102M,0.157048,-0.113739,0.144254,-0.024531,-0.130623,-0.082974,-0.068666,-0.047273,-0.005326,-0.005729,...,0.001046,0.027864,0.045563,-0.111530,-0.080371,-0.017716,-0.016377,0.008898,0.018480,-0.007759
F314G,0.157193,-0.112016,0.145149,-0.025453,-0.131428,-0.082888,-0.072090,-0.047332,-0.003777,-0.005369,...,-0.000387,0.030322,0.044799,-0.111127,-0.079471,-0.016143,-0.015138,0.009161,0.018830,-0.006157


In [44]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/bxb1/'
round_file_name_1 = 'bxb1_Round1_pretrained.xlsx'
bxb1_sequence = 'MRALVVIRLSRVTDATTSPERQLESCQQLCAQRGWDVVGVAEDLDVSGAVDPFDRKRRPNLARWLAFEEQPFDVIVAYRVDRLTRSIRHLQQLVHWAEDHKKLVVSATEAHFDTTTPFAAVVIALMGTVAQMELEAIKERNRSAAHFNIRAGKYRGSLPPWGYLPTRVDGEWRLVPDPVQRERILEVYHRVVDNHEPLHLVAHDLNRRGVLSPKDYFAQLQGREPQGREWSATALKRSMISEAMLGYATLNGKTVRDDDGAPLVRAEPILTREQLEALRAELVKTSRAKPAVSTPSLLLRVLFCAVCGEPAYKFAGGGRKHPRYRCRSMGFPKHCGNGTVAMAEWDAFCEEQVLDLLGDAERLEKVWVAGSDSAVELAEVNAELVDLTSLIGSPAYRAGSPQREALDARIAALAARQEELEGLEARPSGWEWRETGQRFGDWWREQDTAAKNTWLRSMNVRLTFDVRGGLTRTIDFGDLQEYEQHLRLGSVVERLHTGMS'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, bxb1_sequence)
print(experimental_data_1)
df_list = [experimental_data_1]

   Variant  fitness_raw Variant_real   fitness updated_variant
0      12K    35.217163         V12K  1.750215            V12K
1      14K    31.010153         D14K  1.541136            D14K
2      17K     2.670498         T17K  0.132718            T17K
3      45K    24.140190         D45K  1.199714            D45K
4      47K     0.040382         S47K  0.002007            S47K
5      49K    24.730773         A49K  1.229064            A49K
6     134K     0.072676        L134K  0.003612           L134K
7     137K     0.093551        I137K  0.004649           I137K
8     144K     9.514925        A144K  0.472871           A144K
9     145K     0.791633        A145K  0.039342           A145K
10    149K     0.089444        I149K  0.004445           I149K
11    199K     5.294259        H199K  0.263113           H199K
12    221K    17.470383        Q221K  0.868239           Q221K
13    226K    29.136050        N226K  1.447997           Q226K
14    229K     0.865538        E229K  0.043015         

In [45]:
iterations_one, labels_one = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_one
embeddings_pd = embeddings
labels_pd = labels_one
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [46]:
df_all
df_all.to_csv('bxb1/pretrained_round1_all_new.csv', index=False)

In [47]:
df_test

Unnamed: 0,variant,y_pred,y_actual
4134,D14S,1.235718,
5172,D14G,1.175557,
4676,Q28R,1.171850,
7250,A412R,1.156191,
4007,E20R,1.152608,
...,...,...,...
4472,A315K,0.298830,
3643,I149Q,0.279652,
6537,I149C,0.278023,
1796,H199Q,0.275105,


In [48]:
df_test.to_csv('bxb1/pretrained_round1_predictions_new.csv', index=False)

## Bxb1 round 2 (pretrained)

In [17]:
# import brenan data
dataset_name = 'bxb1_esm2_t48_15B_UR50D'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/bxb1/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [18]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
P52L,0.155984,-0.116075,0.148573,-0.028123,-0.128543,-0.084058,-0.072859,-0.046838,-0.002415,-0.005118,...,0.000492,0.028932,0.044435,-0.114348,-0.081508,-0.014825,-0.013309,0.009477,0.016132,-0.005828
R461Y,0.156687,-0.112199,0.145999,-0.024424,-0.130263,-0.083671,-0.071455,-0.046674,-0.005362,-0.004633,...,0.001077,0.030514,0.043889,-0.111547,-0.080573,-0.015240,-0.016286,0.011181,0.015980,-0.006640
R85A,0.154052,-0.115900,0.144834,-0.026544,-0.129637,-0.081927,-0.072439,-0.048122,-0.004115,-0.006136,...,0.003807,0.027205,0.042758,-0.113761,-0.078599,-0.019909,-0.019584,0.008901,0.016665,-0.006007
A315M,0.156422,-0.111366,0.145899,-0.025250,-0.130326,-0.082279,-0.071448,-0.045753,-0.003816,-0.005938,...,0.000151,0.029420,0.046772,-0.111136,-0.079974,-0.015162,-0.016042,0.010557,0.016900,-0.007189
R79C,0.154901,-0.110541,0.142594,-0.029071,-0.130484,-0.079025,-0.078174,-0.045398,-0.005141,-0.009236,...,-0.001456,0.027167,0.048554,-0.110430,-0.082701,-0.014656,-0.014808,0.012512,0.019153,-0.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P290D,0.156653,-0.111046,0.146798,-0.025523,-0.129217,-0.082911,-0.070534,-0.045017,-0.003130,-0.007678,...,0.000111,0.029905,0.048076,-0.112167,-0.079270,-0.014585,-0.017037,0.011904,0.019825,-0.004860
R487F,0.159038,-0.109513,0.146208,-0.024718,-0.131947,-0.081873,-0.071920,-0.047177,-0.005338,-0.006448,...,0.000096,0.029832,0.045403,-0.111501,-0.081506,-0.016406,-0.016590,0.010052,0.017690,-0.006641
K102M,0.157048,-0.113739,0.144254,-0.024531,-0.130623,-0.082974,-0.068666,-0.047273,-0.005326,-0.005729,...,0.001046,0.027864,0.045563,-0.111530,-0.080371,-0.017716,-0.016377,0.008898,0.018480,-0.007759
F314G,0.157193,-0.112016,0.145149,-0.025453,-0.131428,-0.082888,-0.072090,-0.047332,-0.003777,-0.005369,...,-0.000387,0.030322,0.044799,-0.111127,-0.079471,-0.016143,-0.015138,0.009161,0.018830,-0.006157


In [19]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/bxb1/'
round_file_name_1 = 'bxb1_Round1_pretrained.xlsx'
round_file_name_2 = 'bxb1_Round2_pretrained.xlsx'
bxb1_sequence = 'MRALVVIRLSRVTDATTSPERQLESCQQLCAQRGWDVVGVAEDLDVSGAVDPFDRKRRPNLARWLAFEEQPFDVIVAYRVDRLTRSIRHLQQLVHWAEDHKKLVVSATEAHFDTTTPFAAVVIALMGTVAQMELEAIKERNRSAAHFNIRAGKYRGSLPPWGYLPTRVDGEWRLVPDPVQRERILEVYHRVVDNHEPLHLVAHDLNRRGVLSPKDYFAQLQGREPQGREWSATALKRSMISEAMLGYATLNGKTVRDDDGAPLVRAEPILTREQLEALRAELVKTSRAKPAVSTPSLLLRVLFCAVCGEPAYKFAGGGRKHPRYRCRSMGFPKHCGNGTVAMAEWDAFCEEQVLDLLGDAERLEKVWVAGSDSAVELAEVNAELVDLTSLIGSPAYRAGSPQREALDARIAALAARQEELEGLEARPSGWEWRETGQRFGDWWREQDTAAKNTWLRSMNVRLTFDVRGGLTRTIDFGDLQEYEQHLRLGSVVERLHTGMS'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, bxb1_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, bxb1_sequence)
print(experimental_data_1)
print(experimental_data_2)
df_list = [experimental_data_1, experimental_data_2]

   Variant  fitness_raw Variant_real   fitness updated_variant
0      12K    35.217163         V12K  1.750215            V12K
1      14K    31.010153         D14K  1.541136            D14K
2      17K     2.670498         T17K  0.132718            T17K
3      45K    24.140190         D45K  1.199714            D45K
4      47K     0.040382         S47K  0.002007            S47K
5      49K    24.730773         A49K  1.229064            A49K
6     134K     0.072676        L134K  0.003612           L134K
7     137K     0.093551        I137K  0.004649           I137K
8     144K     9.514925        A144K  0.472871           A144K
9     145K     0.791633        A145K  0.039342           A145K
10    149K     0.089444        I149K  0.004445           I149K
11    199K     5.294259        H199K  0.263113           H199K
12    221K    17.470383        Q221K  0.868239           Q221K
13    226K    29.136050        N226K  1.447997           Q226K
14    229K     0.865538        E229K  0.043015         

In [20]:
iterations_two, labels_two = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_two
embeddings_pd = embeddings
labels_pd = labels_two
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [21]:
df_all
df_all.to_csv('bxb1/pretrained_round2_all_new.csv', index=False)

In [22]:
df_test

Unnamed: 0,variant,y_pred,y_actual
1636,D14A,1.112856,
6556,D14Y,1.066313,
3957,L9R,1.036923,
3093,A360R,1.028954,
4593,D14F,1.018732,
...,...,...,...
2770,G152Q,0.313898,
6537,I149C,0.297728,
3643,I149Q,0.274097,
6617,S47Q,0.273315,


In [23]:
df_test.to_csv('bxb1/pretrained_round2_predictions_new.csv', index=False)

## Bxb1 round 3 (pretrained)

In [24]:
# import brenan data
dataset_name = 'bxb1_esm2_t48_15B_UR50D'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/bxb1/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [25]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
P52L,0.155984,-0.116075,0.148573,-0.028123,-0.128543,-0.084058,-0.072859,-0.046838,-0.002415,-0.005118,...,0.000492,0.028932,0.044435,-0.114348,-0.081508,-0.014825,-0.013309,0.009477,0.016132,-0.005828
R461Y,0.156687,-0.112199,0.145999,-0.024424,-0.130263,-0.083671,-0.071455,-0.046674,-0.005362,-0.004633,...,0.001077,0.030514,0.043889,-0.111547,-0.080573,-0.015240,-0.016286,0.011181,0.015980,-0.006640
R85A,0.154052,-0.115900,0.144834,-0.026544,-0.129637,-0.081927,-0.072439,-0.048122,-0.004115,-0.006136,...,0.003807,0.027205,0.042758,-0.113761,-0.078599,-0.019909,-0.019584,0.008901,0.016665,-0.006007
A315M,0.156422,-0.111366,0.145899,-0.025250,-0.130326,-0.082279,-0.071448,-0.045753,-0.003816,-0.005938,...,0.000151,0.029420,0.046772,-0.111136,-0.079974,-0.015162,-0.016042,0.010557,0.016900,-0.007189
R79C,0.154901,-0.110541,0.142594,-0.029071,-0.130484,-0.079025,-0.078174,-0.045398,-0.005141,-0.009236,...,-0.001456,0.027167,0.048554,-0.110430,-0.082701,-0.014656,-0.014808,0.012512,0.019153,-0.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P290D,0.156653,-0.111046,0.146798,-0.025523,-0.129217,-0.082911,-0.070534,-0.045017,-0.003130,-0.007678,...,0.000111,0.029905,0.048076,-0.112167,-0.079270,-0.014585,-0.017037,0.011904,0.019825,-0.004860
R487F,0.159038,-0.109513,0.146208,-0.024718,-0.131947,-0.081873,-0.071920,-0.047177,-0.005338,-0.006448,...,0.000096,0.029832,0.045403,-0.111501,-0.081506,-0.016406,-0.016590,0.010052,0.017690,-0.006641
K102M,0.157048,-0.113739,0.144254,-0.024531,-0.130623,-0.082974,-0.068666,-0.047273,-0.005326,-0.005729,...,0.001046,0.027864,0.045563,-0.111530,-0.080371,-0.017716,-0.016377,0.008898,0.018480,-0.007759
F314G,0.157193,-0.112016,0.145149,-0.025453,-0.131428,-0.082888,-0.072090,-0.047332,-0.003777,-0.005369,...,-0.000387,0.030322,0.044799,-0.111127,-0.079471,-0.016143,-0.015138,0.009161,0.018830,-0.006157


In [26]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/bxb1/'
round_file_name_1 = 'bxb1_Round1_pretrained.xlsx'
round_file_name_2 = 'bxb1_Round2_pretrained.xlsx'
round_file_name_3 = 'bxb1_Round3_pretrained.xlsx'
bxb1_sequence = 'MRALVVIRLSRVTDATTSPERQLESCQQLCAQRGWDVVGVAEDLDVSGAVDPFDRKRRPNLARWLAFEEQPFDVIVAYRVDRLTRSIRHLQQLVHWAEDHKKLVVSATEAHFDTTTPFAAVVIALMGTVAQMELEAIKERNRSAAHFNIRAGKYRGSLPPWGYLPTRVDGEWRLVPDPVQRERILEVYHRVVDNHEPLHLVAHDLNRRGVLSPKDYFAQLQGREPQGREWSATALKRSMISEAMLGYATLNGKTVRDDDGAPLVRAEPILTREQLEALRAELVKTSRAKPAVSTPSLLLRVLFCAVCGEPAYKFAGGGRKHPRYRCRSMGFPKHCGNGTVAMAEWDAFCEEQVLDLLGDAERLEKVWVAGSDSAVELAEVNAELVDLTSLIGSPAYRAGSPQREALDARIAALAARQEELEGLEARPSGWEWRETGQRFGDWWREQDTAAKNTWLRSMNVRLTFDVRGGLTRTIDFGDLQEYEQHLRLGSVVERLHTGMS'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, bxb1_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, bxb1_sequence)
experimental_data_3 = read_experimental_data(base_path, round_file_name_3, bxb1_sequence)
print(experimental_data_1)
print(experimental_data_2)
print(experimental_data_3)
df_list = [experimental_data_1, experimental_data_2, experimental_data_3]

   Variant  fitness_raw Variant_real   fitness updated_variant
0      12K    35.217163         V12K  1.750215            V12K
1      14K    31.010153         D14K  1.541136            D14K
2      17K     2.670498         T17K  0.132718            T17K
3      45K    24.140190         D45K  1.199714            D45K
4      47K     0.040382         S47K  0.002007            S47K
5      49K    24.730773         A49K  1.229064            A49K
6     134K     0.072676        L134K  0.003612           L134K
7     137K     0.093551        I137K  0.004649           I137K
8     144K     9.514925        A144K  0.472871           A144K
9     145K     0.791633        A145K  0.039342           A145K
10    149K     0.089444        I149K  0.004445           I149K
11    199K     5.294259        H199K  0.263113           H199K
12    221K    17.470383        Q221K  0.868239           Q221K
13    226K    29.136050        N226K  1.447997           Q226K
14    229K     0.865538        E229K  0.043015         

In [27]:
iterations_three, labels_three = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_three
embeddings_pd = embeddings
labels_pd = labels_three
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [28]:
df_all
df_all.to_csv('bxb1/pretrained_round3_all_new.csv', index=False)

In [29]:
df_test

Unnamed: 0,variant,y_pred,y_actual
3600,D14W,1.482967,
8829,D14L,1.365908,
4460,D14V,1.332520,
9334,D14H,1.277728,
6985,D14M,1.275672,
...,...,...,...
4675,I149H,0.344950,
8402,I137G,0.321267,
6617,S47Q,0.315430,
8032,I149N,0.310335,


In [30]:
df_test.to_csv('bxb1/pretrained_round3_predictions_new.csv', index=False)

## Bxb1 round 4 (pretrained)

In [4]:
# import brenan data
dataset_name = 'bxb1_esm2_t48_15B_UR50D'
base_path = 'bxb1/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
P52L,0.155984,-0.116075,0.148573,-0.028123,-0.128543,-0.084058,-0.072859,-0.046838,-0.002415,-0.005118,...,0.000492,0.028932,0.044435,-0.114348,-0.081508,-0.014825,-0.013309,0.009477,0.016132,-0.005828
R461Y,0.156687,-0.112199,0.145999,-0.024424,-0.130263,-0.083671,-0.071455,-0.046674,-0.005362,-0.004633,...,0.001077,0.030514,0.043889,-0.111547,-0.080573,-0.015240,-0.016286,0.011181,0.015980,-0.006640
R85A,0.154052,-0.115900,0.144834,-0.026544,-0.129637,-0.081927,-0.072439,-0.048122,-0.004115,-0.006136,...,0.003807,0.027205,0.042758,-0.113761,-0.078599,-0.019909,-0.019584,0.008901,0.016665,-0.006007
A315M,0.156422,-0.111366,0.145899,-0.025250,-0.130326,-0.082279,-0.071448,-0.045753,-0.003816,-0.005938,...,0.000151,0.029420,0.046772,-0.111136,-0.079974,-0.015162,-0.016042,0.010557,0.016900,-0.007189
R79C,0.154901,-0.110541,0.142594,-0.029071,-0.130484,-0.079025,-0.078174,-0.045398,-0.005141,-0.009236,...,-0.001456,0.027167,0.048554,-0.110430,-0.082701,-0.014656,-0.014808,0.012512,0.019153,-0.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P290D,0.156653,-0.111046,0.146798,-0.025523,-0.129217,-0.082911,-0.070534,-0.045017,-0.003130,-0.007678,...,0.000111,0.029905,0.048076,-0.112167,-0.079270,-0.014585,-0.017037,0.011904,0.019825,-0.004860
R487F,0.159038,-0.109513,0.146208,-0.024718,-0.131947,-0.081873,-0.071920,-0.047177,-0.005338,-0.006448,...,0.000096,0.029832,0.045403,-0.111501,-0.081506,-0.016406,-0.016590,0.010052,0.017690,-0.006641
K102M,0.157048,-0.113739,0.144254,-0.024531,-0.130623,-0.082974,-0.068666,-0.047273,-0.005326,-0.005729,...,0.001046,0.027864,0.045563,-0.111530,-0.080371,-0.017716,-0.016377,0.008898,0.018480,-0.007759
F314G,0.157193,-0.112016,0.145149,-0.025453,-0.131428,-0.082888,-0.072090,-0.047332,-0.003777,-0.005369,...,-0.000387,0.030322,0.044799,-0.111127,-0.079471,-0.016143,-0.015138,0.009161,0.018830,-0.006157


In [15]:
base_path = 'bxb1/'
round_file_name_1 = 'bxb1_Round1_pretrained.xlsx'
round_file_name_2 = 'bxb1_Round2_pretrained.xlsx'
round_file_name_3 = 'bxb1_Round3_pretrained.xlsx'
round_file_name_4 = 'bxb1_Round4_pretrained.xlsx'
bxb1_sequence = 'MRALVVIRLSRVTDATTSPERQLESCQQLCAQRGWDVVGVAEDLDVSGAVDPFDRKRRPNLARWLAFEEQPFDVIVAYRVDRLTRSIRHLQQLVHWAEDHKKLVVSATEAHFDTTTPFAAVVIALMGTVAQMELEAIKERNRSAAHFNIRAGKYRGSLPPWGYLPTRVDGEWRLVPDPVQRERILEVYHRVVDNHEPLHLVAHDLNRRGVLSPKDYFAQLQGREPQGREWSATALKRSMISEAMLGYATLNGKTVRDDDGAPLVRAEPILTREQLEALRAELVKTSRAKPAVSTPSLLLRVLFCAVCGEPAYKFAGGGRKHPRYRCRSMGFPKHCGNGTVAMAEWDAFCEEQVLDLLGDAERLEKVWVAGSDSAVELAEVNAELVDLTSLIGSPAYRAGSPQREALDARIAALAARQEELEGLEARPSGWEWRETGQRFGDWWREQDTAAKNTWLRSMNVRLTFDVRGGLTRTIDFGDLQEYEQHLRLGSVVERLHTGMS'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, bxb1_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, bxb1_sequence)
experimental_data_3 = read_experimental_data(base_path, round_file_name_3, bxb1_sequence)
experimental_data_4 = read_experimental_data(base_path, round_file_name_4, bxb1_sequence)
print(experimental_data_1)
print(experimental_data_2)
print(experimental_data_3)
print(experimental_data_4)
df_list = [experimental_data_1, experimental_data_2, experimental_data_3, experimental_data_4]

   Variant  fitness_raw Variant_real   fitness updated_variant
0      12K    35.217163         V12K  1.750215            V12K
1      14K    31.010153         D14K  1.541136            D14K
2      17K     2.670498         T17K  0.132718            T17K
3      45K    24.140190         D45K  1.199714            D45K
4      47K     0.040382         S47K  0.002007            S47K
5      49K    24.730773         A49K  1.229064            A49K
6     134K     0.072676        L134K  0.003612           L134K
7     137K     0.093551        I137K  0.004649           I137K
8     144K     9.514925        A144K  0.472871           A144K
9     145K     0.791633        A145K  0.039342           A145K
10    149K     0.089444        I149K  0.004445           I149K
11    199K     5.294259        H199K  0.263113           H199K
12    221K    17.470383        Q221K  0.868239           Q221K
13    226K    29.136050        N226K  1.447997           Q226K
14    229K     0.865538        E229K  0.043015         

In [16]:
iterations_four, labels_four = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_four
embeddings_pd = embeddings
labels_pd = labels_four
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [14]:
df_all
df_all.to_csv('bxb1/pretrained_round4_all_new.csv', index=False)

In [15]:
df_test

Unnamed: 0,variant,y_pred,y_actual
812,D14P,1.164368,
9338,D14T,1.094945,
9498,K102M,1.039205,
4584,K102V,1.017767,
4370,A288R,1.012354,
...,...,...,...
5422,L298P,0.372433,
6617,S47Q,0.364117,
276,I137C,0.357346,
6537,I149C,0.350295,


In [16]:
df_test.to_csv('bxb1/pretrained_round4_predictions_new.csv', index=False)

## Bxb1 round 5 (pretrained)

In [19]:
# import brenan data
dataset_name = 'bxb1_esm2_t48_15B_UR50D'
base_path = 'bxb1/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [20]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
P52L,0.155984,-0.116075,0.148573,-0.028123,-0.128543,-0.084058,-0.072859,-0.046838,-0.002415,-0.005118,...,0.000492,0.028932,0.044435,-0.114348,-0.081508,-0.014825,-0.013309,0.009477,0.016132,-0.005828
R461Y,0.156687,-0.112199,0.145999,-0.024424,-0.130263,-0.083671,-0.071455,-0.046674,-0.005362,-0.004633,...,0.001077,0.030514,0.043889,-0.111547,-0.080573,-0.015240,-0.016286,0.011181,0.015980,-0.006640
R85A,0.154052,-0.115900,0.144834,-0.026544,-0.129637,-0.081927,-0.072439,-0.048122,-0.004115,-0.006136,...,0.003807,0.027205,0.042758,-0.113761,-0.078599,-0.019909,-0.019584,0.008901,0.016665,-0.006007
A315M,0.156422,-0.111366,0.145899,-0.025250,-0.130326,-0.082279,-0.071448,-0.045753,-0.003816,-0.005938,...,0.000151,0.029420,0.046772,-0.111136,-0.079974,-0.015162,-0.016042,0.010557,0.016900,-0.007189
R79C,0.154901,-0.110541,0.142594,-0.029071,-0.130484,-0.079025,-0.078174,-0.045398,-0.005141,-0.009236,...,-0.001456,0.027167,0.048554,-0.110430,-0.082701,-0.014656,-0.014808,0.012512,0.019153,-0.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P290D,0.156653,-0.111046,0.146798,-0.025523,-0.129217,-0.082911,-0.070534,-0.045017,-0.003130,-0.007678,...,0.000111,0.029905,0.048076,-0.112167,-0.079270,-0.014585,-0.017037,0.011904,0.019825,-0.004860
R487F,0.159038,-0.109513,0.146208,-0.024718,-0.131947,-0.081873,-0.071920,-0.047177,-0.005338,-0.006448,...,0.000096,0.029832,0.045403,-0.111501,-0.081506,-0.016406,-0.016590,0.010052,0.017690,-0.006641
K102M,0.157048,-0.113739,0.144254,-0.024531,-0.130623,-0.082974,-0.068666,-0.047273,-0.005326,-0.005729,...,0.001046,0.027864,0.045563,-0.111530,-0.080371,-0.017716,-0.016377,0.008898,0.018480,-0.007759
F314G,0.157193,-0.112016,0.145149,-0.025453,-0.131428,-0.082888,-0.072090,-0.047332,-0.003777,-0.005369,...,-0.000387,0.030322,0.044799,-0.111127,-0.079471,-0.016143,-0.015138,0.009161,0.018830,-0.006157


In [26]:
base_path = 'bxb1/'
round_file_name_1 = 'bxb1_Round1_pretrained.xlsx'
round_file_name_2 = 'bxb1_Round2_pretrained.xlsx'
round_file_name_3 = 'bxb1_Round3_pretrained.xlsx'
round_file_name_5 = 'bxb1_Round5_pretrained.xlsx'
bxb1_sequence = 'MRALVVIRLSRVTDATTSPERQLESCQQLCAQRGWDVVGVAEDLDVSGAVDPFDRKRRPNLARWLAFEEQPFDVIVAYRVDRLTRSIRHLQQLVHWAEDHKKLVVSATEAHFDTTTPFAAVVIALMGTVAQMELEAIKERNRSAAHFNIRAGKYRGSLPPWGYLPTRVDGEWRLVPDPVQRERILEVYHRVVDNHEPLHLVAHDLNRRGVLSPKDYFAQLQGREPQGREWSATALKRSMISEAMLGYATLNGKTVRDDDGAPLVRAEPILTREQLEALRAELVKTSRAKPAVSTPSLLLRVLFCAVCGEPAYKFAGGGRKHPRYRCRSMGFPKHCGNGTVAMAEWDAFCEEQVLDLLGDAERLEKVWVAGSDSAVELAEVNAELVDLTSLIGSPAYRAGSPQREALDARIAALAARQEELEGLEARPSGWEWRETGQRFGDWWREQDTAAKNTWLRSMNVRLTFDVRGGLTRTIDFGDLQEYEQHLRLGSVVERLHTGMS'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, bxb1_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, bxb1_sequence)
experimental_data_3 = read_experimental_data(base_path, round_file_name_3, bxb1_sequence)
experimental_data_5 = read_experimental_data(base_path, round_file_name_5, bxb1_sequence)
print(experimental_data_1)
print(experimental_data_2)
print(experimental_data_3)
print(experimental_data_5)
df_list = [experimental_data_1, experimental_data_2, experimental_data_3, experimental_data_5]

   Variant  fitness_raw Variant_real   fitness updated_variant
0      12K    35.217163         V12K  1.750215            V12K
1      14K    31.010153         D14K  1.541136            D14K
2      17K     2.670498         T17K  0.132718            T17K
3      45K    24.140190         D45K  1.199714            D45K
4      47K     0.040382         S47K  0.002007            S47K
5      49K    24.730773         A49K  1.229064            A49K
6     134K     0.072676        L134K  0.003612           L134K
7     137K     0.093551        I137K  0.004649           I137K
8     144K     9.514925        A144K  0.472871           A144K
9     145K     0.791633        A145K  0.039342           A145K
10    149K     0.089444        I149K  0.004445           I149K
11    199K     5.294259        H199K  0.263113           H199K
12    221K    17.470383        Q221K  0.868239           Q221K
13    226K    29.136050        N226K  1.447997           Q226K
14    229K     0.865538        E229K  0.043015         

In [27]:
iterations_five, labels_five = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_five
embeddings_pd = embeddings
labels_pd = labels_five
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [28]:
df_all
df_all.to_csv('bxb1/pretrained_round5_all_new.csv', index=False)

In [29]:
df_test

Unnamed: 0,variant,y_pred,y_actual
9334,D14H,1.242673,
4460,D14V,1.161733,
3600,D14W,1.159752,
1206,D14I,1.129656,
8829,D14L,1.127311,
...,...,...,...
2602,A234Q,0.316963,
3643,I149Q,0.292655,
797,F147W,0.289239,
1796,H199Q,0.289122,


In [30]:
df_test.to_csv('bxb1/pretrained_round5_predictions_new.csv', index=False)

## Bxb1 round 1

In [39]:
# import brenan data
dataset_name = 'bxb1_esm2_t48_15B_UR50D'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/bxb1/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [40]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
P52L,0.155984,-0.116075,0.148573,-0.028123,-0.128543,-0.084058,-0.072859,-0.046838,-0.002415,-0.005118,...,0.000492,0.028932,0.044435,-0.114348,-0.081508,-0.014825,-0.013309,0.009477,0.016132,-0.005828
R461Y,0.156687,-0.112199,0.145999,-0.024424,-0.130263,-0.083671,-0.071455,-0.046674,-0.005362,-0.004633,...,0.001077,0.030514,0.043889,-0.111547,-0.080573,-0.015240,-0.016286,0.011181,0.015980,-0.006640
R85A,0.154052,-0.115900,0.144834,-0.026544,-0.129637,-0.081927,-0.072439,-0.048122,-0.004115,-0.006136,...,0.003807,0.027205,0.042758,-0.113761,-0.078599,-0.019909,-0.019584,0.008901,0.016665,-0.006007
A315M,0.156422,-0.111366,0.145899,-0.025250,-0.130326,-0.082279,-0.071448,-0.045753,-0.003816,-0.005938,...,0.000151,0.029420,0.046772,-0.111136,-0.079974,-0.015162,-0.016042,0.010557,0.016900,-0.007189
R79C,0.154901,-0.110541,0.142594,-0.029071,-0.130484,-0.079025,-0.078174,-0.045398,-0.005141,-0.009236,...,-0.001456,0.027167,0.048554,-0.110430,-0.082701,-0.014656,-0.014808,0.012512,0.019153,-0.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P290D,0.156653,-0.111046,0.146798,-0.025523,-0.129217,-0.082911,-0.070534,-0.045017,-0.003130,-0.007678,...,0.000111,0.029905,0.048076,-0.112167,-0.079270,-0.014585,-0.017037,0.011904,0.019825,-0.004860
R487F,0.159038,-0.109513,0.146208,-0.024718,-0.131947,-0.081873,-0.071920,-0.047177,-0.005338,-0.006448,...,0.000096,0.029832,0.045403,-0.111501,-0.081506,-0.016406,-0.016590,0.010052,0.017690,-0.006641
K102M,0.157048,-0.113739,0.144254,-0.024531,-0.130623,-0.082974,-0.068666,-0.047273,-0.005326,-0.005729,...,0.001046,0.027864,0.045563,-0.111530,-0.080371,-0.017716,-0.016377,0.008898,0.018480,-0.007759
F314G,0.157193,-0.112016,0.145149,-0.025453,-0.131428,-0.082888,-0.072090,-0.047332,-0.003777,-0.005369,...,-0.000387,0.030322,0.044799,-0.111127,-0.079471,-0.016143,-0.015138,0.009161,0.018830,-0.006157


In [41]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/bxb1/'
round_file_name_1 = 'bxb1_Round1.xlsx'
bxb1_sequence = 'MRALVVIRLSRVTDATTSPERQLESCQQLCAQRGWDVVGVAEDLDVSGAVDPFDRKRRPNLARWLAFEEQPFDVIVAYRVDRLTRSIRHLQQLVHWAEDHKKLVVSATEAHFDTTTPFAAVVIALMGTVAQMELEAIKERNRSAAHFNIRAGKYRGSLPPWGYLPTRVDGEWRLVPDPVQRERILEVYHRVVDNHEPLHLVAHDLNRRGVLSPKDYFAQLQGREPQGREWSATALKRSMISEAMLGYATLNGKTVRDDDGAPLVRAEPILTREQLEALRAELVKTSRAKPAVSTPSLLLRVLFCAVCGEPAYKFAGGGRKHPRYRCRSMGFPKHCGNGTVAMAEWDAFCEEQVLDLLGDAERLEKVWVAGSDSAVELAEVNAELVDLTSLIGSPAYRAGSPQREALDARIAALAARQEELEGLEARPSGWEWRETGQRFGDWWREQDTAAKNTWLRSMNVRLTFDVRGGLTRTIDFGDLQEYEQHLRLGSVVERLHTGMS'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, bxb1_sequence)
print(experimental_data_1)
df_list = [experimental_data_1]

   Variant  fitness_raw   fitness updated_variant
0      23K     2.666789  0.483352            L23K
1      58T     0.000000  0.000000            R58T
2      79R     1.860380  0.337192            R79R
3     115K     3.122669  0.565980           T115K
4     141D     0.000000  0.000000           N141D
5     182M     1.822323  0.330294           E182M
6     230R     0.815088  0.147734           W230R
7     271N     0.000000  0.000000           T271N
8     318F     0.485051  0.087915           G318F
9     345H     1.172282  0.212475           W345H
10    376Y     0.906366  0.164278           E376Y
11    422P     1.026970  0.186137           G422P
12      WT     5.517278  1.000000              WT


In [42]:
iterations_one, labels_one = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_one
embeddings_pd = embeddings
labels_pd = labels_one
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [43]:
df_all
df_all.to_csv('bxb1/round1_all_new.csv', index=False)

In [44]:
df_test

Unnamed: 0,variant,y_pred,y_actual
8096,V375K,0.552319,
5861,S25A,0.552317,
2344,S238M,0.541832,
4488,I123M,0.541639,
5199,V375C,0.540567,
...,...,...,...
9280,R58I,0.196812,
3081,R58S,0.177974,
2992,R58Q,0.159928,
591,R58V,0.154377,


In [45]:
df_test.to_csv('bxb1/round1_predictions_new.csv', index=False)

## Bxb1 round 2

In [46]:
# import brenan data
dataset_name = 'bxb1_esm2_t48_15B_UR50D'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/bxb1/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [47]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
P52L,0.155984,-0.116075,0.148573,-0.028123,-0.128543,-0.084058,-0.072859,-0.046838,-0.002415,-0.005118,...,0.000492,0.028932,0.044435,-0.114348,-0.081508,-0.014825,-0.013309,0.009477,0.016132,-0.005828
R461Y,0.156687,-0.112199,0.145999,-0.024424,-0.130263,-0.083671,-0.071455,-0.046674,-0.005362,-0.004633,...,0.001077,0.030514,0.043889,-0.111547,-0.080573,-0.015240,-0.016286,0.011181,0.015980,-0.006640
R85A,0.154052,-0.115900,0.144834,-0.026544,-0.129637,-0.081927,-0.072439,-0.048122,-0.004115,-0.006136,...,0.003807,0.027205,0.042758,-0.113761,-0.078599,-0.019909,-0.019584,0.008901,0.016665,-0.006007
A315M,0.156422,-0.111366,0.145899,-0.025250,-0.130326,-0.082279,-0.071448,-0.045753,-0.003816,-0.005938,...,0.000151,0.029420,0.046772,-0.111136,-0.079974,-0.015162,-0.016042,0.010557,0.016900,-0.007189
R79C,0.154901,-0.110541,0.142594,-0.029071,-0.130484,-0.079025,-0.078174,-0.045398,-0.005141,-0.009236,...,-0.001456,0.027167,0.048554,-0.110430,-0.082701,-0.014656,-0.014808,0.012512,0.019153,-0.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P290D,0.156653,-0.111046,0.146798,-0.025523,-0.129217,-0.082911,-0.070534,-0.045017,-0.003130,-0.007678,...,0.000111,0.029905,0.048076,-0.112167,-0.079270,-0.014585,-0.017037,0.011904,0.019825,-0.004860
R487F,0.159038,-0.109513,0.146208,-0.024718,-0.131947,-0.081873,-0.071920,-0.047177,-0.005338,-0.006448,...,0.000096,0.029832,0.045403,-0.111501,-0.081506,-0.016406,-0.016590,0.010052,0.017690,-0.006641
K102M,0.157048,-0.113739,0.144254,-0.024531,-0.130623,-0.082974,-0.068666,-0.047273,-0.005326,-0.005729,...,0.001046,0.027864,0.045563,-0.111530,-0.080371,-0.017716,-0.016377,0.008898,0.018480,-0.007759
F314G,0.157193,-0.112016,0.145149,-0.025453,-0.131428,-0.082888,-0.072090,-0.047332,-0.003777,-0.005369,...,-0.000387,0.030322,0.044799,-0.111127,-0.079471,-0.016143,-0.015138,0.009161,0.018830,-0.006157


In [48]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/bxb1/'
round_file_name_1 = 'bxb1_Round1.xlsx'
round_file_name_2 = 'bxb1_Round2.xlsx'
bxb1_sequence = 'MRALVVIRLSRVTDATTSPERQLESCQQLCAQRGWDVVGVAEDLDVSGAVDPFDRKRRPNLARWLAFEEQPFDVIVAYRVDRLTRSIRHLQQLVHWAEDHKKLVVSATEAHFDTTTPFAAVVIALMGTVAQMELEAIKERNRSAAHFNIRAGKYRGSLPPWGYLPTRVDGEWRLVPDPVQRERILEVYHRVVDNHEPLHLVAHDLNRRGVLSPKDYFAQLQGREPQGREWSATALKRSMISEAMLGYATLNGKTVRDDDGAPLVRAEPILTREQLEALRAELVKTSRAKPAVSTPSLLLRVLFCAVCGEPAYKFAGGGRKHPRYRCRSMGFPKHCGNGTVAMAEWDAFCEEQVLDLLGDAERLEKVWVAGSDSAVELAEVNAELVDLTSLIGSPAYRAGSPQREALDARIAALAARQEELEGLEARPSGWEWRETGQRFGDWWREQDTAAKNTWLRSMNVRLTFDVRGGLTRTIDFGDLQEYEQHLRLGSVVERLHTGMS'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, bxb1_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, bxb1_sequence)
print(experimental_data_1, experimental_data_2)
df_list = [experimental_data_1, experimental_data_2]

   Variant  fitness_raw   fitness updated_variant
0      23K     2.666789  0.483352            L23K
1      58T     0.000000  0.000000            R58T
2      79R     1.860380  0.337192            R79R
3     115K     3.122669  0.565980           T115K
4     141D     0.000000  0.000000           N141D
5     182M     1.822323  0.330294           E182M
6     230R     0.815088  0.147734           W230R
7     271N     0.000000  0.000000           T271N
8     318F     0.485051  0.087915           G318F
9     345H     1.172282  0.212475           W345H
10    376Y     0.906366  0.164278           E376Y
11    422P     1.026970  0.186137           G422P
12      WT     5.517278  1.000000              WT    Variant  fitness_raw   fitness updated_variant
0     123M     0.016724  0.016723           I123M
1     166C     1.511531  1.511455           T166C
2     189Q     1.780421  1.780331           H189Q
3     245A     0.794822  0.794782           L245A
4      25A     1.411378  1.411307            S25A


In [49]:
iterations_two, labels_two = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_two
embeddings_pd = embeddings
labels_pd = labels_two
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [50]:
df_all
df_all.to_csv('bxb1/round2_all_new.csv', index=False)

In [51]:
df_test

Unnamed: 0,variant,y_pred,y_actual
4619,V375Q,1.250632,
6300,V375W,1.239254,
4733,V375A,1.183530,
6332,T166Q,1.135125,
5092,V375F,1.126244,
...,...,...,...
6704,G318L,0.281939,
8728,E267M,0.280385,
9012,A405I,0.279286,
3334,A412V,0.277518,


In [52]:
df_test.to_csv('bxb1/round2_predictions_new.csv', index=False)

## Bxb1 round 3

In [41]:
# import brenan data
dataset_name = 'bxb1_esm2_t48_15B_UR50D'
base_path = 'bxb1/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [42]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
P52L,0.155984,-0.116075,0.148573,-0.028123,-0.128543,-0.084058,-0.072859,-0.046838,-0.002415,-0.005118,...,0.000492,0.028932,0.044435,-0.114348,-0.081508,-0.014825,-0.013309,0.009477,0.016132,-0.005828
R461Y,0.156687,-0.112199,0.145999,-0.024424,-0.130263,-0.083671,-0.071455,-0.046674,-0.005362,-0.004633,...,0.001077,0.030514,0.043889,-0.111547,-0.080573,-0.015240,-0.016286,0.011181,0.015980,-0.006640
R85A,0.154052,-0.115900,0.144834,-0.026544,-0.129637,-0.081927,-0.072439,-0.048122,-0.004115,-0.006136,...,0.003807,0.027205,0.042758,-0.113761,-0.078599,-0.019909,-0.019584,0.008901,0.016665,-0.006007
A315M,0.156422,-0.111366,0.145899,-0.025250,-0.130326,-0.082279,-0.071448,-0.045753,-0.003816,-0.005938,...,0.000151,0.029420,0.046772,-0.111136,-0.079974,-0.015162,-0.016042,0.010557,0.016900,-0.007189
R79C,0.154901,-0.110541,0.142594,-0.029071,-0.130484,-0.079025,-0.078174,-0.045398,-0.005141,-0.009236,...,-0.001456,0.027167,0.048554,-0.110430,-0.082701,-0.014656,-0.014808,0.012512,0.019153,-0.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P290D,0.156653,-0.111046,0.146798,-0.025523,-0.129217,-0.082911,-0.070534,-0.045017,-0.003130,-0.007678,...,0.000111,0.029905,0.048076,-0.112167,-0.079270,-0.014585,-0.017037,0.011904,0.019825,-0.004860
R487F,0.159038,-0.109513,0.146208,-0.024718,-0.131947,-0.081873,-0.071920,-0.047177,-0.005338,-0.006448,...,0.000096,0.029832,0.045403,-0.111501,-0.081506,-0.016406,-0.016590,0.010052,0.017690,-0.006641
K102M,0.157048,-0.113739,0.144254,-0.024531,-0.130623,-0.082974,-0.068666,-0.047273,-0.005326,-0.005729,...,0.001046,0.027864,0.045563,-0.111530,-0.080371,-0.017716,-0.016377,0.008898,0.018480,-0.007759
F314G,0.157193,-0.112016,0.145149,-0.025453,-0.131428,-0.082888,-0.072090,-0.047332,-0.003777,-0.005369,...,-0.000387,0.030322,0.044799,-0.111127,-0.079471,-0.016143,-0.015138,0.009161,0.018830,-0.006157


In [43]:
base_path = 'bxb1/'
round_file_name_1 = 'bxb1_Round1.xlsx'
round_file_name_2 = 'bxb1_Round2.xlsx'
round_file_name_3 = 'bxb1_Round3.xlsx'
bxb1_sequence = 'MRALVVIRLSRVTDATTSPERQLESCQQLCAQRGWDVVGVAEDLDVSGAVDPFDRKRRPNLARWLAFEEQPFDVIVAYRVDRLTRSIRHLQQLVHWAEDHKKLVVSATEAHFDTTTPFAAVVIALMGTVAQMELEAIKERNRSAAHFNIRAGKYRGSLPPWGYLPTRVDGEWRLVPDPVQRERILEVYHRVVDNHEPLHLVAHDLNRRGVLSPKDYFAQLQGREPQGREWSATALKRSMISEAMLGYATLNGKTVRDDDGAPLVRAEPILTREQLEALRAELVKTSRAKPAVSTPSLLLRVLFCAVCGEPAYKFAGGGRKHPRYRCRSMGFPKHCGNGTVAMAEWDAFCEEQVLDLLGDAERLEKVWVAGSDSAVELAEVNAELVDLTSLIGSPAYRAGSPQREALDARIAALAARQEELEGLEARPSGWEWRETGQRFGDWWREQDTAAKNTWLRSMNVRLTFDVRGGLTRTIDFGDLQEYEQHLRLGSVVERLHTGMS'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, bxb1_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, bxb1_sequence)
experimental_data_3 = read_experimental_data(base_path, round_file_name_3, bxb1_sequence)
print(experimental_data_1, experimental_data_2, experimental_data_3)
df_list = [experimental_data_1, experimental_data_2, experimental_data_3]

   Variant  fitness_raw   fitness updated_variant
0      23K     2.666789  0.483352            L23K
1      58T     0.000000  0.000000            R58T
2      79R     1.860380  0.337192            R79R
3     115K     3.122669  0.565980           T115K
4     141D     0.000000  0.000000           N141D
5     182M     1.822323  0.330294           E182M
6     230R     0.815088  0.147734           W230R
7     271N     0.000000  0.000000           T271N
8     318F     0.485051  0.087915           G318F
9     345H     1.172282  0.212475           W345H
10    376Y     0.906366  0.164278           E376Y
11    422P     1.026970  0.186137           G422P
12      WT     5.517278  1.000000              WT    Variant  fitness_raw   fitness updated_variant
0     123M     0.016724  0.016723           I123M
1     166C     1.511531  1.511455           T166C
2     189Q     1.780421  1.780331           H189Q
3     245A     0.794822  0.794782           L245A
4      25A     1.411378  1.411307            S25A


In [44]:
iterations_three, labels_three = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_three
embeddings_pd = embeddings
labels_pd = labels_three
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [45]:
df_all
df_all.to_csv('bxb1/round3_all_new.csv', index=False)

In [46]:
df_test

Unnamed: 0,variant,y_pred,y_actual
7366,H189G,1.349092,
9282,Q22A,1.328185,
8844,R79K,1.289187,
3464,P71G,1.240311,
3720,Y247V,1.234329,
...,...,...,...
4993,G318M,0.363230,
2268,R58A,0.355748,
4103,Q32T,0.354464,
1641,D99T,0.350117,


In [47]:
df_test.to_csv('bxb1/round3_predictions_new.csv', index=False)

## Bxb1 round 4

In [48]:
# import brenan data
dataset_name = 'bxb1_esm2_t48_15B_UR50D'
base_path = 'bxb1/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [49]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
P52L,0.155984,-0.116075,0.148573,-0.028123,-0.128543,-0.084058,-0.072859,-0.046838,-0.002415,-0.005118,...,0.000492,0.028932,0.044435,-0.114348,-0.081508,-0.014825,-0.013309,0.009477,0.016132,-0.005828
R461Y,0.156687,-0.112199,0.145999,-0.024424,-0.130263,-0.083671,-0.071455,-0.046674,-0.005362,-0.004633,...,0.001077,0.030514,0.043889,-0.111547,-0.080573,-0.015240,-0.016286,0.011181,0.015980,-0.006640
R85A,0.154052,-0.115900,0.144834,-0.026544,-0.129637,-0.081927,-0.072439,-0.048122,-0.004115,-0.006136,...,0.003807,0.027205,0.042758,-0.113761,-0.078599,-0.019909,-0.019584,0.008901,0.016665,-0.006007
A315M,0.156422,-0.111366,0.145899,-0.025250,-0.130326,-0.082279,-0.071448,-0.045753,-0.003816,-0.005938,...,0.000151,0.029420,0.046772,-0.111136,-0.079974,-0.015162,-0.016042,0.010557,0.016900,-0.007189
R79C,0.154901,-0.110541,0.142594,-0.029071,-0.130484,-0.079025,-0.078174,-0.045398,-0.005141,-0.009236,...,-0.001456,0.027167,0.048554,-0.110430,-0.082701,-0.014656,-0.014808,0.012512,0.019153,-0.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P290D,0.156653,-0.111046,0.146798,-0.025523,-0.129217,-0.082911,-0.070534,-0.045017,-0.003130,-0.007678,...,0.000111,0.029905,0.048076,-0.112167,-0.079270,-0.014585,-0.017037,0.011904,0.019825,-0.004860
R487F,0.159038,-0.109513,0.146208,-0.024718,-0.131947,-0.081873,-0.071920,-0.047177,-0.005338,-0.006448,...,0.000096,0.029832,0.045403,-0.111501,-0.081506,-0.016406,-0.016590,0.010052,0.017690,-0.006641
K102M,0.157048,-0.113739,0.144254,-0.024531,-0.130623,-0.082974,-0.068666,-0.047273,-0.005326,-0.005729,...,0.001046,0.027864,0.045563,-0.111530,-0.080371,-0.017716,-0.016377,0.008898,0.018480,-0.007759
F314G,0.157193,-0.112016,0.145149,-0.025453,-0.131428,-0.082888,-0.072090,-0.047332,-0.003777,-0.005369,...,-0.000387,0.030322,0.044799,-0.111127,-0.079471,-0.016143,-0.015138,0.009161,0.018830,-0.006157


In [50]:
base_path = 'bxb1/'
round_file_name_1 = 'bxb1_Round1.xlsx'
round_file_name_2 = 'bxb1_Round2.xlsx'
round_file_name_4 = 'bxb1_Round4.xlsx'
bxb1_sequence = 'MRALVVIRLSRVTDATTSPERQLESCQQLCAQRGWDVVGVAEDLDVSGAVDPFDRKRRPNLARWLAFEEQPFDVIVAYRVDRLTRSIRHLQQLVHWAEDHKKLVVSATEAHFDTTTPFAAVVIALMGTVAQMELEAIKERNRSAAHFNIRAGKYRGSLPPWGYLPTRVDGEWRLVPDPVQRERILEVYHRVVDNHEPLHLVAHDLNRRGVLSPKDYFAQLQGREPQGREWSATALKRSMISEAMLGYATLNGKTVRDDDGAPLVRAEPILTREQLEALRAELVKTSRAKPAVSTPSLLLRVLFCAVCGEPAYKFAGGGRKHPRYRCRSMGFPKHCGNGTVAMAEWDAFCEEQVLDLLGDAERLEKVWVAGSDSAVELAEVNAELVDLTSLIGSPAYRAGSPQREALDARIAALAARQEELEGLEARPSGWEWRETGQRFGDWWREQDTAAKNTWLRSMNVRLTFDVRGGLTRTIDFGDLQEYEQHLRLGSVVERLHTGMS'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, bxb1_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, bxb1_sequence)
experimental_data_4 = read_experimental_data(base_path, round_file_name_4, bxb1_sequence)
print(experimental_data_1, experimental_data_2, experimental_data_4)
df_list = [experimental_data_1, experimental_data_2, experimental_data_4]

   Variant  fitness_raw   fitness updated_variant
0      23K     2.666789  0.483352            L23K
1      58T     0.000000  0.000000            R58T
2      79R     1.860380  0.337192            R79R
3     115K     3.122669  0.565980           T115K
4     141D     0.000000  0.000000           N141D
5     182M     1.822323  0.330294           E182M
6     230R     0.815088  0.147734           W230R
7     271N     0.000000  0.000000           T271N
8     318F     0.485051  0.087915           G318F
9     345H     1.172282  0.212475           W345H
10    376Y     0.906366  0.164278           E376Y
11    422P     1.026970  0.186137           G422P
12      WT     5.517278  1.000000              WT    Variant  fitness_raw   fitness updated_variant
0     123M     0.016724  0.016723           I123M
1     166C     1.511531  1.511455           T166C
2     189Q     1.780421  1.780331           H189Q
3     245A     0.794822  0.794782           L245A
4      25A     1.411378  1.411307            S25A


In [51]:
iterations_four, labels_four = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_four
embeddings_pd = embeddings
labels_pd = labels_four
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [52]:
df_all
df_all.to_csv('bxb1/round4_all_new.csv', index=False)

In [53]:
df_test

Unnamed: 0,variant,y_pred,y_actual
477,V375G,1.306310,
6300,V375W,1.276257,
4733,V375A,1.177830,
4619,V375Q,1.167516,
5092,V375F,1.141182,
...,...,...,...
208,G318Y,0.342003,
23,A347H,0.341996,
4993,G318M,0.339205,
6704,G318L,0.332986,


In [54]:
df_test.to_csv('bxb1/round4_predictions_new.csv', index=False)

## Bxb1 round 6

In [4]:
# import brenan data
dataset_name = 'bxb1_esm2_t48_15B_UR50D'
base_path = 'bxb1/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
P52L,0.155984,-0.116075,0.148573,-0.028123,-0.128543,-0.084058,-0.072859,-0.046838,-0.002415,-0.005118,...,0.000492,0.028932,0.044435,-0.114348,-0.081508,-0.014825,-0.013309,0.009477,0.016132,-0.005828
R461Y,0.156687,-0.112199,0.145999,-0.024424,-0.130263,-0.083671,-0.071455,-0.046674,-0.005362,-0.004633,...,0.001077,0.030514,0.043889,-0.111547,-0.080573,-0.015240,-0.016286,0.011181,0.015980,-0.006640
R85A,0.154052,-0.115900,0.144834,-0.026544,-0.129637,-0.081927,-0.072439,-0.048122,-0.004115,-0.006136,...,0.003807,0.027205,0.042758,-0.113761,-0.078599,-0.019909,-0.019584,0.008901,0.016665,-0.006007
A315M,0.156422,-0.111366,0.145899,-0.025250,-0.130326,-0.082279,-0.071448,-0.045753,-0.003816,-0.005938,...,0.000151,0.029420,0.046772,-0.111136,-0.079974,-0.015162,-0.016042,0.010557,0.016900,-0.007189
R79C,0.154901,-0.110541,0.142594,-0.029071,-0.130484,-0.079025,-0.078174,-0.045398,-0.005141,-0.009236,...,-0.001456,0.027167,0.048554,-0.110430,-0.082701,-0.014656,-0.014808,0.012512,0.019153,-0.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P290D,0.156653,-0.111046,0.146798,-0.025523,-0.129217,-0.082911,-0.070534,-0.045017,-0.003130,-0.007678,...,0.000111,0.029905,0.048076,-0.112167,-0.079270,-0.014585,-0.017037,0.011904,0.019825,-0.004860
R487F,0.159038,-0.109513,0.146208,-0.024718,-0.131947,-0.081873,-0.071920,-0.047177,-0.005338,-0.006448,...,0.000096,0.029832,0.045403,-0.111501,-0.081506,-0.016406,-0.016590,0.010052,0.017690,-0.006641
K102M,0.157048,-0.113739,0.144254,-0.024531,-0.130623,-0.082974,-0.068666,-0.047273,-0.005326,-0.005729,...,0.001046,0.027864,0.045563,-0.111530,-0.080371,-0.017716,-0.016377,0.008898,0.018480,-0.007759
F314G,0.157193,-0.112016,0.145149,-0.025453,-0.131428,-0.082888,-0.072090,-0.047332,-0.003777,-0.005369,...,-0.000387,0.030322,0.044799,-0.111127,-0.079471,-0.016143,-0.015138,0.009161,0.018830,-0.006157


In [7]:
base_path = 'bxb1/'
round_file_name_1 = 'bxb1_Round1.xlsx'
round_file_name_4 = 'bxb1_Round4.xlsx'
round_file_name_5 = 'bxb1_Round5.xlsx'
round_file_name_6 = 'bxb1_Round6.xlsx'
bxb1_sequence = 'MRALVVIRLSRVTDATTSPERQLESCQQLCAQRGWDVVGVAEDLDVSGAVDPFDRKRRPNLARWLAFEEQPFDVIVAYRVDRLTRSIRHLQQLVHWAEDHKKLVVSATEAHFDTTTPFAAVVIALMGTVAQMELEAIKERNRSAAHFNIRAGKYRGSLPPWGYLPTRVDGEWRLVPDPVQRERILEVYHRVVDNHEPLHLVAHDLNRRGVLSPKDYFAQLQGREPQGREWSATALKRSMISEAMLGYATLNGKTVRDDDGAPLVRAEPILTREQLEALRAELVKTSRAKPAVSTPSLLLRVLFCAVCGEPAYKFAGGGRKHPRYRCRSMGFPKHCGNGTVAMAEWDAFCEEQVLDLLGDAERLEKVWVAGSDSAVELAEVNAELVDLTSLIGSPAYRAGSPQREALDARIAALAARQEELEGLEARPSGWEWRETGQRFGDWWREQDTAAKNTWLRSMNVRLTFDVRGGLTRTIDFGDLQEYEQHLRLGSVVERLHTGMS'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, bxb1_sequence)
experimental_data_4 = read_experimental_data(base_path, round_file_name_4, bxb1_sequence)
experimental_data_5 = read_experimental_data(base_path, round_file_name_5, bxb1_sequence)
experimental_data_6 = read_experimental_data(base_path, round_file_name_6, bxb1_sequence)

print(experimental_data_1, experimental_data_4, experimental_data_5, experimental_data_6)
df_list = [experimental_data_1, experimental_data_4, experimental_data_5, experimental_data_6]

   Variant  fitness_raw   fitness updated_variant
0      23K     2.666789  0.483352            L23K
1      58T     0.000000  0.000000            R58T
2      79R     1.860380  0.337192            R79R
3     115K     3.122669  0.565980           T115K
4     141D     0.000000  0.000000           N141D
5     182M     1.822323  0.330294           E182M
6     230R     0.815088  0.147734           W230R
7     271N     0.000000  0.000000           T271N
8     318F     0.485051  0.087915           G318F
9     345H     1.172282  0.212475           W345H
10    376Y     0.906366  0.164278           E376Y
11    422P     1.026970  0.186137           G422P
12      WT     5.517278  1.000000              WT    Variant   fitness updated_variant
0     219W  0.343482           Q219W
1     189G  1.306588           H189G
2      22V  0.850706            Q22V
3     247V  0.117336           Y247V
4     275A  1.371898           L275A
5     439C  1.737664           F439C
6     459G  1.418401           N459G
7   

In [9]:
iterations_six, labels_six = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_six
embeddings_pd = embeddings
labels_pd = labels_six
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [10]:
df_all
df_all.to_csv('bxb1/round6_all_new.csv', index=False)

In [11]:
df_test

Unnamed: 0,variant,y_pred,y_actual
5199,V375C,0.977173,
6388,H146A,0.942886,
2882,I240V,0.936289,
4213,V385A,0.933075,
7928,Y482L,0.931592,
...,...,...,...
9094,A374I,0.285251,
7315,A415I,0.280957,
7086,A343I,0.278023,
9007,A378I,0.275057,


In [12]:
df_test.to_csv('bxb1/round6_predictions_new.csv', index=False)

## Bxb1 round 7

In [4]:
# import brenan data
dataset_name = 'bxb1_esm2_t48_15B_UR50D'
base_path = 'bxb1/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
P52L,0.155984,-0.116075,0.148573,-0.028123,-0.128543,-0.084058,-0.072859,-0.046838,-0.002415,-0.005118,...,0.000492,0.028932,0.044435,-0.114348,-0.081508,-0.014825,-0.013309,0.009477,0.016132,-0.005828
R461Y,0.156687,-0.112199,0.145999,-0.024424,-0.130263,-0.083671,-0.071455,-0.046674,-0.005362,-0.004633,...,0.001077,0.030514,0.043889,-0.111547,-0.080573,-0.015240,-0.016286,0.011181,0.015980,-0.006640
R85A,0.154052,-0.115900,0.144834,-0.026544,-0.129637,-0.081927,-0.072439,-0.048122,-0.004115,-0.006136,...,0.003807,0.027205,0.042758,-0.113761,-0.078599,-0.019909,-0.019584,0.008901,0.016665,-0.006007
A315M,0.156422,-0.111366,0.145899,-0.025250,-0.130326,-0.082279,-0.071448,-0.045753,-0.003816,-0.005938,...,0.000151,0.029420,0.046772,-0.111136,-0.079974,-0.015162,-0.016042,0.010557,0.016900,-0.007189
R79C,0.154901,-0.110541,0.142594,-0.029071,-0.130484,-0.079025,-0.078174,-0.045398,-0.005141,-0.009236,...,-0.001456,0.027167,0.048554,-0.110430,-0.082701,-0.014656,-0.014808,0.012512,0.019153,-0.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P290D,0.156653,-0.111046,0.146798,-0.025523,-0.129217,-0.082911,-0.070534,-0.045017,-0.003130,-0.007678,...,0.000111,0.029905,0.048076,-0.112167,-0.079270,-0.014585,-0.017037,0.011904,0.019825,-0.004860
R487F,0.159038,-0.109513,0.146208,-0.024718,-0.131947,-0.081873,-0.071920,-0.047177,-0.005338,-0.006448,...,0.000096,0.029832,0.045403,-0.111501,-0.081506,-0.016406,-0.016590,0.010052,0.017690,-0.006641
K102M,0.157048,-0.113739,0.144254,-0.024531,-0.130623,-0.082974,-0.068666,-0.047273,-0.005326,-0.005729,...,0.001046,0.027864,0.045563,-0.111530,-0.080371,-0.017716,-0.016377,0.008898,0.018480,-0.007759
F314G,0.157193,-0.112016,0.145149,-0.025453,-0.131428,-0.082888,-0.072090,-0.047332,-0.003777,-0.005369,...,-0.000387,0.030322,0.044799,-0.111127,-0.079471,-0.016143,-0.015138,0.009161,0.018830,-0.006157


In [6]:
base_path = 'bxb1/'
round_file_name_1 = 'bxb1_Round1.xlsx'
round_file_name_4 = 'bxb1_Round4.xlsx'
round_file_name_5 = 'bxb1_Round5.xlsx'
round_file_name_6 = 'bxb1_Round6.xlsx'
round_file_name_7 = 'bxb1_Round7.xlsx'

bxb1_sequence = 'MRALVVIRLSRVTDATTSPERQLESCQQLCAQRGWDVVGVAEDLDVSGAVDPFDRKRRPNLARWLAFEEQPFDVIVAYRVDRLTRSIRHLQQLVHWAEDHKKLVVSATEAHFDTTTPFAAVVIALMGTVAQMELEAIKERNRSAAHFNIRAGKYRGSLPPWGYLPTRVDGEWRLVPDPVQRERILEVYHRVVDNHEPLHLVAHDLNRRGVLSPKDYFAQLQGREPQGREWSATALKRSMISEAMLGYATLNGKTVRDDDGAPLVRAEPILTREQLEALRAELVKTSRAKPAVSTPSLLLRVLFCAVCGEPAYKFAGGGRKHPRYRCRSMGFPKHCGNGTVAMAEWDAFCEEQVLDLLGDAERLEKVWVAGSDSAVELAEVNAELVDLTSLIGSPAYRAGSPQREALDARIAALAARQEELEGLEARPSGWEWRETGQRFGDWWREQDTAAKNTWLRSMNVRLTFDVRGGLTRTIDFGDLQEYEQHLRLGSVVERLHTGMS'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, bxb1_sequence)
experimental_data_4 = read_experimental_data(base_path, round_file_name_4, bxb1_sequence)
experimental_data_5 = read_experimental_data(base_path, round_file_name_5, bxb1_sequence)
experimental_data_6 = read_experimental_data(base_path, round_file_name_6, bxb1_sequence)
experimental_data_7 = read_experimental_data(base_path, round_file_name_7, bxb1_sequence)

print(experimental_data_1, experimental_data_4, experimental_data_5, experimental_data_6, experimental_data_7)
df_list = [experimental_data_1, experimental_data_4, experimental_data_5, experimental_data_6, experimental_data_7]

   Variant  fitness_raw   fitness updated_variant
0      23K     2.666789  0.483352            L23K
1      58T     0.000000  0.000000            R58T
2      79R     1.860380  0.337192            R79R
3     115K     3.122669  0.565980           T115K
4     141D     0.000000  0.000000           N141D
5     182M     1.822323  0.330294           E182M
6     230R     0.815088  0.147734           W230R
7     271N     0.000000  0.000000           T271N
8     318F     0.485051  0.087915           G318F
9     345H     1.172282  0.212475           W345H
10    376Y     0.906366  0.164278           E376Y
11    422P     1.026970  0.186137           G422P
12      WT     5.517278  1.000000              WT    Variant   fitness updated_variant
0     219W  0.343482           Q219W
1     189G  1.306588           H189G
2      22V  0.850706            Q22V
3     247V  0.117336           Y247V
4     275A  1.371898           L275A
5     439C  1.737664           F439C
6     459G  1.418401           N459G
7   

In [7]:
iterations_seven, labels_seven = create_dataframes(df_list, embeddings.index)

# Assuming 'labels_five' is your DataFrame
filtered_labels_seven = labels_seven.dropna(subset=['fitness'])

# Remove the character before the numeric in each variant
filtered_labels_seven['variant'] = filtered_labels_seven['variant'].str.replace(r'\D+(\d+)', r'\1', regex=True)

# Write the modified DataFrame to an Excel file
filtered_labels_seven.to_excel('bxb1_variants_seven.xlsx', index=False)

In [8]:
iterations_seven, labels_seven = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_seven
embeddings_pd = embeddings
labels_pd = labels_seven
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [9]:
df_all
df_all.to_csv('bxb1/round7_all_new.csv', index=False)

In [10]:
df_test

Unnamed: 0,variant,y_pred,y_actual
6252,V292Q,0.978724,
4195,N194W,0.976197,
678,V375E,0.959653,
7859,L275C,0.943778,
1145,I240L,0.939623,
...,...,...,...
7227,R58Y,0.269532,
9280,R58I,0.257290,
3081,R58S,0.247031,
3655,R58F,0.238728,


In [11]:
df_test.to_csv('bxb1/round7_predictions_new.csv', index=False)

## Bxb1 round 8

In [5]:
# import brenan data
dataset_name = 'bxb1_esm2_t48_15B_UR50D'
base_path = 'bxb1/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [6]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
P52L,0.155984,-0.116075,0.148573,-0.028123,-0.128543,-0.084058,-0.072859,-0.046838,-0.002415,-0.005118,...,0.000492,0.028932,0.044435,-0.114348,-0.081508,-0.014825,-0.013309,0.009477,0.016132,-0.005828
R461Y,0.156687,-0.112199,0.145999,-0.024424,-0.130263,-0.083671,-0.071455,-0.046674,-0.005362,-0.004633,...,0.001077,0.030514,0.043889,-0.111547,-0.080573,-0.015240,-0.016286,0.011181,0.015980,-0.006640
R85A,0.154052,-0.115900,0.144834,-0.026544,-0.129637,-0.081927,-0.072439,-0.048122,-0.004115,-0.006136,...,0.003807,0.027205,0.042758,-0.113761,-0.078599,-0.019909,-0.019584,0.008901,0.016665,-0.006007
A315M,0.156422,-0.111366,0.145899,-0.025250,-0.130326,-0.082279,-0.071448,-0.045753,-0.003816,-0.005938,...,0.000151,0.029420,0.046772,-0.111136,-0.079974,-0.015162,-0.016042,0.010557,0.016900,-0.007189
R79C,0.154901,-0.110541,0.142594,-0.029071,-0.130484,-0.079025,-0.078174,-0.045398,-0.005141,-0.009236,...,-0.001456,0.027167,0.048554,-0.110430,-0.082701,-0.014656,-0.014808,0.012512,0.019153,-0.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P290D,0.156653,-0.111046,0.146798,-0.025523,-0.129217,-0.082911,-0.070534,-0.045017,-0.003130,-0.007678,...,0.000111,0.029905,0.048076,-0.112167,-0.079270,-0.014585,-0.017037,0.011904,0.019825,-0.004860
R487F,0.159038,-0.109513,0.146208,-0.024718,-0.131947,-0.081873,-0.071920,-0.047177,-0.005338,-0.006448,...,0.000096,0.029832,0.045403,-0.111501,-0.081506,-0.016406,-0.016590,0.010052,0.017690,-0.006641
K102M,0.157048,-0.113739,0.144254,-0.024531,-0.130623,-0.082974,-0.068666,-0.047273,-0.005326,-0.005729,...,0.001046,0.027864,0.045563,-0.111530,-0.080371,-0.017716,-0.016377,0.008898,0.018480,-0.007759
F314G,0.157193,-0.112016,0.145149,-0.025453,-0.131428,-0.082888,-0.072090,-0.047332,-0.003777,-0.005369,...,-0.000387,0.030322,0.044799,-0.111127,-0.079471,-0.016143,-0.015138,0.009161,0.018830,-0.006157


In [10]:
base_path = 'bxb1/'
round_file_name_1 = 'bxb1_Round1.xlsx'
round_file_name_4 = 'bxb1_Round4.xlsx'
round_file_name_5 = 'bxb1_Round5.xlsx'
round_file_name_6 = 'bxb1_Round6.xlsx'
round_file_name_7 = 'bxb1_Round7.xlsx'
round_file_name_8 = 'bxb1_Round8.xlsx'

bxb1_sequence = 'MRALVVIRLSRVTDATTSPERQLESCQQLCAQRGWDVVGVAEDLDVSGAVDPFDRKRRPNLARWLAFEEQPFDVIVAYRVDRLTRSIRHLQQLVHWAEDHKKLVVSATEAHFDTTTPFAAVVIALMGTVAQMELEAIKERNRSAAHFNIRAGKYRGSLPPWGYLPTRVDGEWRLVPDPVQRERILEVYHRVVDNHEPLHLVAHDLNRRGVLSPKDYFAQLQGREPQGREWSATALKRSMISEAMLGYATLNGKTVRDDDGAPLVRAEPILTREQLEALRAELVKTSRAKPAVSTPSLLLRVLFCAVCGEPAYKFAGGGRKHPRYRCRSMGFPKHCGNGTVAMAEWDAFCEEQVLDLLGDAERLEKVWVAGSDSAVELAEVNAELVDLTSLIGSPAYRAGSPQREALDARIAALAARQEELEGLEARPSGWEWRETGQRFGDWWREQDTAAKNTWLRSMNVRLTFDVRGGLTRTIDFGDLQEYEQHLRLGSVVERLHTGMS'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, bxb1_sequence)
experimental_data_4 = read_experimental_data(base_path, round_file_name_4, bxb1_sequence)
experimental_data_5 = read_experimental_data(base_path, round_file_name_5, bxb1_sequence)
experimental_data_6 = read_experimental_data(base_path, round_file_name_6, bxb1_sequence)
experimental_data_7 = read_experimental_data(base_path, round_file_name_7, bxb1_sequence)
experimental_data_8 = read_experimental_data(base_path, round_file_name_8, bxb1_sequence)

print(experimental_data_1, experimental_data_4, experimental_data_5, experimental_data_6, experimental_data_7, experimental_data_8)
df_list = [experimental_data_1, experimental_data_4, experimental_data_5, experimental_data_6, experimental_data_7, experimental_data_8]

   Variant  fitness_raw   fitness updated_variant
0      23K     2.666789  0.483352            L23K
1      58T     0.000000  0.000000            R58T
2      79R     1.860380  0.337192            R79R
3     115K     3.122669  0.565980           T115K
4     141D     0.000000  0.000000           N141D
5     182M     1.822323  0.330294           E182M
6     230R     0.815088  0.147734           W230R
7     271N     0.000000  0.000000           T271N
8     318F     0.485051  0.087915           G318F
9     345H     1.172282  0.212475           W345H
10    376Y     0.906366  0.164278           E376Y
11    422P     1.026970  0.186137           G422P
12      WT     5.517278  1.000000              WT    Variant   fitness updated_variant
0     219W  0.343482           Q219W
1     189G  1.306588           H189G
2      22V  0.850706            Q22V
3     247V  0.117336           Y247V
4     275A  1.371898           L275A
5     439C  1.737664           F439C
6     459G  1.418401           N459G
7   

In [15]:
iterations_eight, labels_eight = create_dataframes(df_list, embeddings.index)

# Assuming 'labels_five' is your DataFrame
filtered_labels_eight = labels_eight.dropna(subset=['fitness'])

# Remove the character before the numeric in each variant
filtered_labels_eight['variant'] = filtered_labels_eight['variant'].str.replace(r'\D+(\d+)', r'\1', regex=True)

# Write the modified DataFrame to an Excel file
filtered_labels_eight.to_excel('bxb1_variants_eight.xlsx', index=False)

In [11]:
iterations_eight, labels_eight = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_eight
embeddings_pd = embeddings
labels_pd = labels_eight
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [12]:
df_all
df_all.to_csv('bxb1/round8_all_new.csv', index=False)

In [13]:
df_test

Unnamed: 0,variant,y_pred,y_actual
1318,L275D,1.295496,
2937,L275P,1.247459,
5103,L275Q,1.243852,
5963,L275Y,1.235787,
5406,L275H,1.229444,
...,...,...,...
3081,R58S,0.309109,
44,G252D,0.308620,
8313,R63T,0.300745,
3516,A382I,0.296308,


In [14]:
df_test.to_csv('bxb1/round8_predictions_new.csv', index=False)

## Bxb1 round 9

In [4]:
# import brenan data
dataset_name = 'bxb1_esm2_t48_15B_UR50D'
base_path = 'bxb1/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
P52L,0.155984,-0.116075,0.148573,-0.028123,-0.128543,-0.084058,-0.072859,-0.046838,-0.002415,-0.005118,...,0.000492,0.028932,0.044435,-0.114348,-0.081508,-0.014825,-0.013309,0.009477,0.016132,-0.005828
R461Y,0.156687,-0.112199,0.145999,-0.024424,-0.130263,-0.083671,-0.071455,-0.046674,-0.005362,-0.004633,...,0.001077,0.030514,0.043889,-0.111547,-0.080573,-0.015240,-0.016286,0.011181,0.015980,-0.006640
R85A,0.154052,-0.115900,0.144834,-0.026544,-0.129637,-0.081927,-0.072439,-0.048122,-0.004115,-0.006136,...,0.003807,0.027205,0.042758,-0.113761,-0.078599,-0.019909,-0.019584,0.008901,0.016665,-0.006007
A315M,0.156422,-0.111366,0.145899,-0.025250,-0.130326,-0.082279,-0.071448,-0.045753,-0.003816,-0.005938,...,0.000151,0.029420,0.046772,-0.111136,-0.079974,-0.015162,-0.016042,0.010557,0.016900,-0.007189
R79C,0.154901,-0.110541,0.142594,-0.029071,-0.130484,-0.079025,-0.078174,-0.045398,-0.005141,-0.009236,...,-0.001456,0.027167,0.048554,-0.110430,-0.082701,-0.014656,-0.014808,0.012512,0.019153,-0.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P290D,0.156653,-0.111046,0.146798,-0.025523,-0.129217,-0.082911,-0.070534,-0.045017,-0.003130,-0.007678,...,0.000111,0.029905,0.048076,-0.112167,-0.079270,-0.014585,-0.017037,0.011904,0.019825,-0.004860
R487F,0.159038,-0.109513,0.146208,-0.024718,-0.131947,-0.081873,-0.071920,-0.047177,-0.005338,-0.006448,...,0.000096,0.029832,0.045403,-0.111501,-0.081506,-0.016406,-0.016590,0.010052,0.017690,-0.006641
K102M,0.157048,-0.113739,0.144254,-0.024531,-0.130623,-0.082974,-0.068666,-0.047273,-0.005326,-0.005729,...,0.001046,0.027864,0.045563,-0.111530,-0.080371,-0.017716,-0.016377,0.008898,0.018480,-0.007759
F314G,0.157193,-0.112016,0.145149,-0.025453,-0.131428,-0.082888,-0.072090,-0.047332,-0.003777,-0.005369,...,-0.000387,0.030322,0.044799,-0.111127,-0.079471,-0.016143,-0.015138,0.009161,0.018830,-0.006157


In [6]:
base_path = 'bxb1/'
round_file_name_1 = 'bxb1_Round1.xlsx'
round_file_name_4 = 'bxb1_Round4.xlsx'
round_file_name_5 = 'bxb1_Round5.xlsx'
round_file_name_6 = 'bxb1_Round6.xlsx'
round_file_name_7 = 'bxb1_Round7.xlsx'
round_file_name_8 = 'bxb1_Round8.xlsx'
round_file_name_9 = 'bxb1_Round9.xlsx'

bxb1_sequence = 'MRALVVIRLSRVTDATTSPERQLESCQQLCAQRGWDVVGVAEDLDVSGAVDPFDRKRRPNLARWLAFEEQPFDVIVAYRVDRLTRSIRHLQQLVHWAEDHKKLVVSATEAHFDTTTPFAAVVIALMGTVAQMELEAIKERNRSAAHFNIRAGKYRGSLPPWGYLPTRVDGEWRLVPDPVQRERILEVYHRVVDNHEPLHLVAHDLNRRGVLSPKDYFAQLQGREPQGREWSATALKRSMISEAMLGYATLNGKTVRDDDGAPLVRAEPILTREQLEALRAELVKTSRAKPAVSTPSLLLRVLFCAVCGEPAYKFAGGGRKHPRYRCRSMGFPKHCGNGTVAMAEWDAFCEEQVLDLLGDAERLEKVWVAGSDSAVELAEVNAELVDLTSLIGSPAYRAGSPQREALDARIAALAARQEELEGLEARPSGWEWRETGQRFGDWWREQDTAAKNTWLRSMNVRLTFDVRGGLTRTIDFGDLQEYEQHLRLGSVVERLHTGMS'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, bxb1_sequence)
experimental_data_4 = read_experimental_data(base_path, round_file_name_4, bxb1_sequence)
experimental_data_5 = read_experimental_data(base_path, round_file_name_5, bxb1_sequence)
experimental_data_6 = read_experimental_data(base_path, round_file_name_6, bxb1_sequence)
experimental_data_7 = read_experimental_data(base_path, round_file_name_7, bxb1_sequence)
experimental_data_8 = read_experimental_data(base_path, round_file_name_8, bxb1_sequence)
experimental_data_9 = read_experimental_data(base_path, round_file_name_9, bxb1_sequence)

print(experimental_data_1, experimental_data_4, experimental_data_5, experimental_data_6, experimental_data_7, experimental_data_8, experimental_data_9)
df_list = [experimental_data_1, experimental_data_4, experimental_data_5, experimental_data_6, experimental_data_7, experimental_data_8, experimental_data_9]

   Variant  fitness_raw   fitness updated_variant
0      23K     2.666789  0.483352            L23K
1      58T     0.000000  0.000000            R58T
2      79R     1.860380  0.337192            R79R
3     115K     3.122669  0.565980           T115K
4     141D     0.000000  0.000000           N141D
5     182M     1.822323  0.330294           E182M
6     230R     0.815088  0.147734           W230R
7     271N     0.000000  0.000000           T271N
8     318F     0.485051  0.087915           G318F
9     345H     1.172282  0.212475           W345H
10    376Y     0.906366  0.164278           E376Y
11    422P     1.026970  0.186137           G422P
12      WT     5.517278  1.000000              WT    Variant   fitness updated_variant
0     219W  0.343482           Q219W
1     189G  1.306588           H189G
2      22V  0.850706            Q22V
3     247V  0.117336           Y247V
4     275A  1.371898           L275A
5     439C  1.737664           F439C
6     459G  1.418401           N459G
7   

In [7]:
iterations_nine, labels_nine = create_dataframes(df_list, embeddings.index)

# Assuming 'labels_five' is your DataFrame
filtered_labels_nine = labels_nine.dropna(subset=['fitness'])

# Remove the character before the numeric in each variant
filtered_labels_nine['variant'] = filtered_labels_nine['variant'].str.replace(r'\D+(\d+)', r'\1', regex=True)

# Write the modified DataFrame to an Excel file
filtered_labels_nine.to_excel('bxb1_variants_nine.xlsx', index=False)

In [8]:
iterations_nine, labels_nine = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_nine
embeddings_pd = embeddings
labels_pd = labels_nine
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [9]:
df_all
df_all.to_csv('bxb1/round9_all_new.csv', index=False)

In [10]:
df_test

Unnamed: 0,variant,y_pred,y_actual
6,I240G,1.190016,
8026,I240S,1.098214,
8800,N194F,1.096443,
809,A110N,1.084744,
8355,V353K,1.084483,
...,...,...,...
470,R467I,0.301910,
5470,E376D,0.300791,
9427,V340T,0.289793,
6399,E376V,0.283712,


In [11]:
df_test.to_csv('bxb1/round9_predictions_new.csv', index=False)

## Bxb1 round 10

In [4]:
# import brenan data
dataset_name = 'bxb1_esm2_t48_15B_UR50D'
base_path = 'bxb1/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
P52L,0.155984,-0.116075,0.148573,-0.028123,-0.128543,-0.084058,-0.072859,-0.046838,-0.002415,-0.005118,...,0.000492,0.028932,0.044435,-0.114348,-0.081508,-0.014825,-0.013309,0.009477,0.016132,-0.005828
R461Y,0.156687,-0.112199,0.145999,-0.024424,-0.130263,-0.083671,-0.071455,-0.046674,-0.005362,-0.004633,...,0.001077,0.030514,0.043889,-0.111547,-0.080573,-0.015240,-0.016286,0.011181,0.015980,-0.006640
R85A,0.154052,-0.115900,0.144834,-0.026544,-0.129637,-0.081927,-0.072439,-0.048122,-0.004115,-0.006136,...,0.003807,0.027205,0.042758,-0.113761,-0.078599,-0.019909,-0.019584,0.008901,0.016665,-0.006007
A315M,0.156422,-0.111366,0.145899,-0.025250,-0.130326,-0.082279,-0.071448,-0.045753,-0.003816,-0.005938,...,0.000151,0.029420,0.046772,-0.111136,-0.079974,-0.015162,-0.016042,0.010557,0.016900,-0.007189
R79C,0.154901,-0.110541,0.142594,-0.029071,-0.130484,-0.079025,-0.078174,-0.045398,-0.005141,-0.009236,...,-0.001456,0.027167,0.048554,-0.110430,-0.082701,-0.014656,-0.014808,0.012512,0.019153,-0.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P290D,0.156653,-0.111046,0.146798,-0.025523,-0.129217,-0.082911,-0.070534,-0.045017,-0.003130,-0.007678,...,0.000111,0.029905,0.048076,-0.112167,-0.079270,-0.014585,-0.017037,0.011904,0.019825,-0.004860
R487F,0.159038,-0.109513,0.146208,-0.024718,-0.131947,-0.081873,-0.071920,-0.047177,-0.005338,-0.006448,...,0.000096,0.029832,0.045403,-0.111501,-0.081506,-0.016406,-0.016590,0.010052,0.017690,-0.006641
K102M,0.157048,-0.113739,0.144254,-0.024531,-0.130623,-0.082974,-0.068666,-0.047273,-0.005326,-0.005729,...,0.001046,0.027864,0.045563,-0.111530,-0.080371,-0.017716,-0.016377,0.008898,0.018480,-0.007759
F314G,0.157193,-0.112016,0.145149,-0.025453,-0.131428,-0.082888,-0.072090,-0.047332,-0.003777,-0.005369,...,-0.000387,0.030322,0.044799,-0.111127,-0.079471,-0.016143,-0.015138,0.009161,0.018830,-0.006157


In [6]:
base_path = 'bxb1/'
round_file_name_1 = 'bxb1_Round1.xlsx'
round_file_name_4 = 'bxb1_Round4.xlsx'
round_file_name_5 = 'bxb1_Round5.xlsx'
round_file_name_6 = 'bxb1_Round6.xlsx'
round_file_name_7 = 'bxb1_Round7.xlsx'
round_file_name_8 = 'bxb1_Round8.xlsx'
round_file_name_9 = 'bxb1_Round9.xlsx'
round_file_name_10 = 'bxb1_Round10.xlsx'

bxb1_sequence = 'MRALVVIRLSRVTDATTSPERQLESCQQLCAQRGWDVVGVAEDLDVSGAVDPFDRKRRPNLARWLAFEEQPFDVIVAYRVDRLTRSIRHLQQLVHWAEDHKKLVVSATEAHFDTTTPFAAVVIALMGTVAQMELEAIKERNRSAAHFNIRAGKYRGSLPPWGYLPTRVDGEWRLVPDPVQRERILEVYHRVVDNHEPLHLVAHDLNRRGVLSPKDYFAQLQGREPQGREWSATALKRSMISEAMLGYATLNGKTVRDDDGAPLVRAEPILTREQLEALRAELVKTSRAKPAVSTPSLLLRVLFCAVCGEPAYKFAGGGRKHPRYRCRSMGFPKHCGNGTVAMAEWDAFCEEQVLDLLGDAERLEKVWVAGSDSAVELAEVNAELVDLTSLIGSPAYRAGSPQREALDARIAALAARQEELEGLEARPSGWEWRETGQRFGDWWREQDTAAKNTWLRSMNVRLTFDVRGGLTRTIDFGDLQEYEQHLRLGSVVERLHTGMS'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, bxb1_sequence)
experimental_data_4 = read_experimental_data(base_path, round_file_name_4, bxb1_sequence)
experimental_data_5 = read_experimental_data(base_path, round_file_name_5, bxb1_sequence)
experimental_data_6 = read_experimental_data(base_path, round_file_name_6, bxb1_sequence)
experimental_data_7 = read_experimental_data(base_path, round_file_name_7, bxb1_sequence)
experimental_data_8 = read_experimental_data(base_path, round_file_name_8, bxb1_sequence)
experimental_data_9 = read_experimental_data(base_path, round_file_name_9, bxb1_sequence)
experimental_data_10 = read_experimental_data(base_path, round_file_name_10, bxb1_sequence)

print(experimental_data_1, experimental_data_4, experimental_data_5, experimental_data_6, experimental_data_7, experimental_data_8, experimental_data_9, experimental_data_10)
df_list = [experimental_data_1, experimental_data_4, experimental_data_5, experimental_data_6, experimental_data_7, experimental_data_8, experimental_data_9, experimental_data_10]

   Variant  fitness_raw   fitness updated_variant
0      23K     2.666789  0.483352            L23K
1      58T     0.000000  0.000000            R58T
2      79R     1.860380  0.337192            R79R
3     115K     3.122669  0.565980           T115K
4     141D     0.000000  0.000000           N141D
5     182M     1.822323  0.330294           E182M
6     230R     0.815088  0.147734           W230R
7     271N     0.000000  0.000000           T271N
8     318F     0.485051  0.087915           G318F
9     345H     1.172282  0.212475           W345H
10    376Y     0.906366  0.164278           E376Y
11    422P     1.026970  0.186137           G422P
12      WT     5.517278  1.000000              WT    Variant   fitness updated_variant
0     219W  0.343482           Q219W
1     189G  1.306588           H189G
2      22V  0.850706            Q22V
3     247V  0.117336           Y247V
4     275A  1.371898           L275A
5     439C  1.737664           F439C
6     459G  1.418401           N459G
7   

In [7]:
iterations_ten, labels_ten = create_dataframes(df_list, embeddings.index)

# Assuming 'labels_five' is your DataFrame
filtered_labels_ten = labels_ten.dropna(subset=['fitness'])

# Remove the character before the numeric in each variant
filtered_labels_ten['variant'] = filtered_labels_ten['variant'].str.replace(r'\D+(\d+)', r'\1', regex=True)

# Write the modified DataFrame to an Excel file
filtered_labels_ten.to_excel('bxb1_variants_ten.xlsx', index=False)

In [8]:
iterations_ten, labels_ten = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_ten
embeddings_pd = embeddings
labels_pd = labels_ten
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [9]:
df_all
df_all.to_csv('bxb1/round10_all_new.csv', index=False)

In [10]:
df_test

Unnamed: 0,variant,y_pred,y_actual
2778,I240R,1.342555,
5160,T166C,1.311896,
3144,T166D,1.178455,
8543,L250C,1.170270,
8429,I240K,1.168252,
...,...,...,...
1093,A412I,0.332996,
3967,S371I,0.326322,
5192,R444I,0.314146,
7432,R287I,0.305132,


In [11]:
df_test.to_csv('bxb1/round10_predictions_new.csv', index=False)

## CA round 1

In [64]:
# import brenan data
dataset_name = 'ca_esm2_t48_15B_UR50D'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/ca/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [65]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
D13F,0.085663,-0.070125,0.037689,0.019694,0.002905,-0.083943,-0.017667,0.060394,0.145624,-0.067333,...,-0.097398,-0.021168,-0.020748,-0.093693,-0.010967,0.057353,-0.071873,-0.056516,-0.071533,-0.130958
M45W,0.077280,-0.075316,0.040967,0.021198,0.007825,-0.089681,-0.018298,0.055321,0.142251,-0.056774,...,-0.091781,-0.016596,-0.017690,-0.089796,-0.006738,0.059956,-0.073939,-0.062558,-0.071477,-0.132031
A173E,0.087915,-0.072260,0.040313,0.022687,0.002946,-0.083454,-0.018720,0.055359,0.143977,-0.063539,...,-0.099815,-0.018082,-0.011997,-0.095527,-0.011362,0.054954,-0.070879,-0.061164,-0.070656,-0.126392
G47F,0.088420,-0.077352,0.038010,0.023851,0.006484,-0.085765,-0.022136,0.052470,0.145836,-0.062145,...,-0.098187,-0.017449,-0.015975,-0.092171,-0.010464,0.046052,-0.076304,-0.060668,-0.069655,-0.134855
M93T,0.069239,-0.085450,0.033526,0.028181,-0.001162,-0.094387,-0.023597,0.058478,0.140190,-0.059581,...,-0.080886,-0.020508,-0.030550,-0.093876,-0.003380,0.052741,-0.064846,-0.052142,-0.074726,-0.130032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R170P,0.089024,-0.074984,0.043466,0.023683,0.002117,-0.085876,-0.012908,0.058917,0.142977,-0.063109,...,-0.092769,-0.017586,-0.019851,-0.097972,-0.008383,0.060735,-0.074547,-0.066768,-0.060328,-0.132468
L85P,0.091548,-0.075719,0.036663,0.019273,0.007470,-0.089777,-0.016545,0.057242,0.143495,-0.063106,...,-0.097158,-0.013028,-0.011680,-0.093433,-0.007895,0.054404,-0.078479,-0.058570,-0.063366,-0.135473
I56F,0.080827,-0.080963,0.041483,0.024030,0.011549,-0.092821,-0.023545,0.052908,0.136979,-0.068835,...,-0.091350,-0.013346,-0.022677,-0.098219,-0.003037,0.052579,-0.067524,-0.053649,-0.059933,-0.138120
L8G,0.079088,-0.077801,0.041376,0.020790,0.001375,-0.087771,-0.015298,0.057495,0.144516,-0.063691,...,-0.087798,-0.016448,-0.021661,-0.092828,-0.005738,0.057070,-0.075719,-0.053168,-0.061291,-0.126112


In [66]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/ca/'
round_file_name_1 = 'ca_Round1.xlsx'
ca_sequence = 'MTVTDDYLANNVDYASGFKGPLPMPPSKHIAIVACMDARLDVYRMLGIKEGEAHVIRNAGCVVTDDVIRSLAISQRLLGTREIILLHHTDCGMLTFTDDDFKRAIQDETGIRPTWSPESYPDAVEDVRQSLRRIEVNPFVTKHTSLRGFVFDVATGKLNEVTPAAALEARKEAELAAATAEQ'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, ca_sequence)
print(experimental_data_1)
df_list = [experimental_data_1]

   Variant   fitness updated_variant
0      13A  0.369211            D13A
1      35S  0.947054            C35S
2      26E  1.540771            P26E
3      22I  0.748429            L22I
4      15G  0.631504            A15G
5      20E  0.800809            G20E
6      25R  0.874041            P25R
7      30V  0.664379            I30V
8      40T  1.177277            L40T
9      55M  0.763146            V55M
10     65Y  0.711245            D65Y
11      WT  1.000000              WT


In [67]:
iterations_one, labels_one = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_one
embeddings_pd = embeddings
labels_pd = labels_one
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [68]:
df_all
df_all.to_csv('ca/round1_all_new.csv', index=False)

In [69]:
df_test

Unnamed: 0,variant,y_pred,y_actual
3253,P26D,1.068809,
179,H87L,1.005771,
1300,K49G,1.003750,
312,K49S,0.989169,
1431,K49C,0.989132,
...,...,...,...
2680,D13T,0.759582,
161,D65E,0.758149,
3394,D107M,0.756664,
3440,D13L,0.755272,


In [70]:
df_test.to_csv('ca/round1_predictions_new.csv', index=False)

## MMFnuc Round 1

In [5]:
# import brenan data
dataset_name = 'mmfnuc_esm2_t48_15B_UR50D'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/mmfnuc/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [6]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
F251W,0.058680,-0.101877,-0.049105,0.043003,-0.129526,0.029136,-0.176315,0.014073,-0.099785,0.001786,...,-0.176302,-0.013019,0.040493,-0.126933,0.003337,-0.095834,-0.029323,0.022122,-0.108958,0.085466
S156P,0.057480,-0.102873,-0.054044,0.042988,-0.129897,0.027450,-0.177223,0.018766,-0.098628,0.000715,...,-0.173447,-0.020796,0.035839,-0.127225,0.005545,-0.095132,-0.033339,0.025640,-0.103198,0.087120
S180F,0.058544,-0.104004,-0.048321,0.038753,-0.127392,0.027275,-0.174910,0.015151,-0.099420,0.005351,...,-0.173295,-0.019856,0.035694,-0.125969,-0.002306,-0.092605,-0.033154,0.023564,-0.104630,0.087606
A419C,0.058057,-0.106263,-0.049922,0.041353,-0.129665,0.027440,-0.172086,0.018737,-0.098932,-0.000535,...,-0.174147,-0.017247,0.036708,-0.127276,0.001815,-0.096420,-0.030465,0.025425,-0.100106,0.087800
P299Y,0.057292,-0.106125,-0.052144,0.042111,-0.129207,0.026602,-0.171750,0.018487,-0.100901,-0.003204,...,-0.174812,-0.016793,0.037009,-0.124868,0.002369,-0.095742,-0.033622,0.024722,-0.098355,0.087069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F22H,0.058223,-0.102549,-0.053289,0.043090,-0.130043,0.030082,-0.174506,0.020153,-0.096832,-0.006312,...,-0.173114,-0.019959,0.037668,-0.126781,0.000861,-0.096356,-0.031056,0.025027,-0.103469,0.088209
S232P,0.054540,-0.103079,-0.051447,0.041830,-0.129405,0.027504,-0.173162,0.018984,-0.098990,-0.002792,...,-0.172950,-0.018743,0.036896,-0.127574,0.005526,-0.096237,-0.031903,0.023376,-0.107867,0.085076
K469Y,0.057801,-0.104625,-0.050990,0.041145,-0.129001,0.027085,-0.175023,0.018681,-0.098913,-0.000384,...,-0.170491,-0.017970,0.038323,-0.126963,0.002823,-0.097188,-0.031914,0.024631,-0.103980,0.084921
C431Y,0.058150,-0.104249,-0.050383,0.039418,-0.127600,0.026382,-0.177062,0.017804,-0.098297,-0.000005,...,-0.171520,-0.018806,0.039649,-0.128669,0.002352,-0.095632,-0.031668,0.026439,-0.103850,0.086796


In [7]:

base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/mmfnuc/'
round_file_name_1 = 'mmfnuc_Round1.xlsx'
mmfnuc_sequence = 'MKRKREQMTLWKAAFVNGQETFKSWIDKARMLELNCDVSSASSTHYSDLNLKTKCAKMEDKFMCTFSVGIRPTSKQKRTLNQMLKVSNHAYNWCNYLVKEKDFKPKQFDLQRVVTKTNSTDVPAEYRLPGDDWFFDNKMSSIKLTACKNFCTMYKSAQTNQKKTKVDLRNKDIAMLREGSFEVQKKYVRLLTEKDIPDERIRQSRIALMADNFSKSKKDWKERFLRLSKNVSKIPPLSHDMKVCKRPNGKFVLQIPCDPIYTRQIQVHTSDSICSIDPGGRTFATCYDPSNIKAFQIGPEADKKEVIHKYHEKIDYVHRLLAYAQKKKQTQAVQDRIGQLKKLHLKLKTYVDDVHLKLCSYLVKNYKLVVLGKISVSSIVRKDRPNHLAKKANRDLLCWQHYRFRQRLLHRVRGTDCEAIAQDERYTSKTCGNCGVKNNKLGGKETFICESCNYKTHRDVNGARNILCKYLGLFPFAA'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, mmfnuc_sequence)
print(experimental_data_1)
df_list = [experimental_data_1]

  Variant  fitness_raw   fitness updated_variant
0     35S     1.147238  1.147235            N35S
1     22I     1.310796  1.310793            F22I
2     25R     0.573165  0.573164            W25R
3     40T     0.818348  0.818347            S40T
4     50K     1.350812  1.350809            N50K
5     55M     0.908122  0.908120            C55M
6      WT     1.000002  1.000000              WT


In [8]:
iterations_one, labels_one = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_one
embeddings_pd = embeddings
labels_pd = labels_one
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [9]:
df_all
df_all.to_csv('mmfnuc/round1_all_new.csv', index=False)


In [10]:
df_test

Unnamed: 0,variant,y_pred,y_actual
2992,A14I,1.110493,
3950,F22V,1.108385,
1327,N50S,1.108227,
2844,L49K,1.108054,
1687,N50W,1.107230,
...,...,...,...
789,W25G,0.859596,
4074,W25Q,0.859500,
8697,W25E,0.858582,
2102,W25N,0.854021,


In [11]:
df_test.to_csv('mmfnuc/round1_predictions_new.csv', index=False)

## MMFnuc Round 2

In [12]:
# import brenan data
dataset_name = 'mmfnuc_esm2_t48_15B_UR50D'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/mmfnuc/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [13]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
F251W,0.058680,-0.101877,-0.049105,0.043003,-0.129526,0.029136,-0.176315,0.014073,-0.099785,0.001786,...,-0.176302,-0.013019,0.040493,-0.126933,0.003337,-0.095834,-0.029323,0.022122,-0.108958,0.085466
S156P,0.057480,-0.102873,-0.054044,0.042988,-0.129897,0.027450,-0.177223,0.018766,-0.098628,0.000715,...,-0.173447,-0.020796,0.035839,-0.127225,0.005545,-0.095132,-0.033339,0.025640,-0.103198,0.087120
S180F,0.058544,-0.104004,-0.048321,0.038753,-0.127392,0.027275,-0.174910,0.015151,-0.099420,0.005351,...,-0.173295,-0.019856,0.035694,-0.125969,-0.002306,-0.092605,-0.033154,0.023564,-0.104630,0.087606
A419C,0.058057,-0.106263,-0.049922,0.041353,-0.129665,0.027440,-0.172086,0.018737,-0.098932,-0.000535,...,-0.174147,-0.017247,0.036708,-0.127276,0.001815,-0.096420,-0.030465,0.025425,-0.100106,0.087800
P299Y,0.057292,-0.106125,-0.052144,0.042111,-0.129207,0.026602,-0.171750,0.018487,-0.100901,-0.003204,...,-0.174812,-0.016793,0.037009,-0.124868,0.002369,-0.095742,-0.033622,0.024722,-0.098355,0.087069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F22H,0.058223,-0.102549,-0.053289,0.043090,-0.130043,0.030082,-0.174506,0.020153,-0.096832,-0.006312,...,-0.173114,-0.019959,0.037668,-0.126781,0.000861,-0.096356,-0.031056,0.025027,-0.103469,0.088209
S232P,0.054540,-0.103079,-0.051447,0.041830,-0.129405,0.027504,-0.173162,0.018984,-0.098990,-0.002792,...,-0.172950,-0.018743,0.036896,-0.127574,0.005526,-0.096237,-0.031903,0.023376,-0.107867,0.085076
K469Y,0.057801,-0.104625,-0.050990,0.041145,-0.129001,0.027085,-0.175023,0.018681,-0.098913,-0.000384,...,-0.170491,-0.017970,0.038323,-0.126963,0.002823,-0.097188,-0.031914,0.024631,-0.103980,0.084921
C431Y,0.058150,-0.104249,-0.050383,0.039418,-0.127600,0.026382,-0.177062,0.017804,-0.098297,-0.000005,...,-0.171520,-0.018806,0.039649,-0.128669,0.002352,-0.095632,-0.031668,0.026439,-0.103850,0.086796


In [14]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/mmfnuc/'
round_file_name_1 = 'mmfnuc_Round1.xlsx'
round_file_name_2 = 'mmfnuc_Round2.xlsx'
mmfnuc_sequence = 'MKRKREQMTLWKAAFVNGQETFKSWIDKARMLELNCDVSSASSTHYSDLNLKTKCAKMEDKFMCTFSVGIRPTSKQKRTLNQMLKVSNHAYNWCNYLVKEKDFKPKQFDLQRVVTKTNSTDVPAEYRLPGDDWFFDNKMSSIKLTACKNFCTMYKSAQTNQKKTKVDLRNKDIAMLREGSFEVQKKYVRLLTEKDIPDERIRQSRIALMADNFSKSKKDWKERFLRLSKNVSKIPPLSHDMKVCKRPNGKFVLQIPCDPIYTRQIQVHTSDSICSIDPGGRTFATCYDPSNIKAFQIGPEADKKEVIHKYHEKIDYVHRLLAYAQKKKQTQAVQDRIGQLKKLHLKLKTYVDDVHLKLCSYLVKNYKLVVLGKISVSSIVRKDRPNHLAKKANRDLLCWQHYRFRQRLLHRVRGTDCEAIAQDERYTSKTCGNCGVKNNKLGGKETFICESCNYKTHRDVNGARNILCKYLGLFPFAA'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, mmfnuc_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, mmfnuc_sequence)
print(experimental_data_1, experimental_data_2)
df_list = [experimental_data_1, experimental_data_2]

  Variant  fitness_raw   fitness updated_variant
0     35S     1.147238  1.147235            N35S
1     22I     1.310796  1.310793            F22I
2     25R     0.573165  0.573164            W25R
3     40T     0.818348  0.818347            S40T
4     50K     1.350812  1.350809            N50K
5     55M     0.908122  0.908120            C55M
6      WT     1.000002  1.000000              WT    Variant  fitness_raw   fitness updated_variant
0      65Y     0.191288  0.191288            T65Y
1      15G     0.226811  0.226811            F15G
2      26E     0.438868  0.438868            I26E
3      14I     1.679332  1.679331            A14I
4      22V     1.636589  1.636589            F22V
5      50W     1.963885  1.963885            N50W
6      46I     1.105226  1.105225            Y46I
7      22L     1.140359  1.140358            F22L
8      49E     0.473967  0.473967            L49E
9      50A     1.025887  1.025887            N50A
10     50S     0.192993  0.192993            N50S
11      

In [15]:
iterations_two, labels_two = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_two
embeddings_pd = embeddings
labels_pd = labels_two
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [16]:
df_all
df_all.to_csv('mmfnuc/round2_all_new.csv', index=False)


In [17]:
df_test

Unnamed: 0,variant,y_pred,y_actual
8449,T53I,1.267518,
8590,T53L,1.264053,
3647,S43I,1.260009,
6997,S42I,1.247397,
3417,E33W,1.216781,
...,...,...,...
4553,L34G,0.631872,
3465,I26K,0.627114,
878,L34N,0.620678,
843,L34D,0.605848,


In [18]:
df_test.to_csv('mmfnuc/round2_predictions_new.csv', index=False)

## R2

In [4]:
# import brenan data
dataset_name = 'r2_esm2_t48_15B_UR50D'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/r2/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
G91K,0.125095,-0.086747,0.007329,0.066044,-0.088842,-0.018201,-0.123120,0.048890,-0.063238,-0.024220,...,-0.061586,-0.006344,0.057469,-0.050334,-0.042349,-0.016509,-0.068606,0.012836,-0.129086,0.022432
N927I,0.122180,-0.083631,0.008685,0.064573,-0.088776,-0.018910,-0.124121,0.050447,-0.062081,-0.025686,...,-0.062493,-0.004403,0.056985,-0.052900,-0.042592,-0.014429,-0.069569,0.015393,-0.128585,0.021870
I824V,0.121158,-0.084260,0.007549,0.064432,-0.089038,-0.019331,-0.123460,0.049893,-0.061687,-0.025476,...,-0.062508,-0.004760,0.057204,-0.052061,-0.043694,-0.014540,-0.068647,0.014702,-0.128467,0.022632
I257T,0.121469,-0.084672,0.007914,0.064696,-0.088860,-0.020061,-0.125147,0.049035,-0.062149,-0.025158,...,-0.062353,-0.005594,0.056296,-0.052189,-0.044065,-0.014480,-0.068425,0.014220,-0.128542,0.020876
V56A,0.121873,-0.084203,0.008424,0.064599,-0.088758,-0.020137,-0.123685,0.050389,-0.061743,-0.025748,...,-0.062738,-0.005220,0.057436,-0.051883,-0.043081,-0.014980,-0.067994,0.014596,-0.128305,0.022357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K1019H,0.121816,-0.084847,0.007153,0.065182,-0.088532,-0.019749,-0.123572,0.050393,-0.062215,-0.025013,...,-0.062319,-0.004719,0.056696,-0.052044,-0.043212,-0.014929,-0.069097,0.015089,-0.128408,0.022167
D812L,0.120452,-0.084344,0.007394,0.063507,-0.087286,-0.018784,-0.124111,0.047359,-0.061184,-0.024530,...,-0.063294,-0.005696,0.057122,-0.053182,-0.041089,-0.014853,-0.069275,0.015093,-0.128790,0.019908
F260L,0.121224,-0.084671,0.007690,0.064884,-0.089026,-0.019367,-0.124186,0.050523,-0.061800,-0.026249,...,-0.062903,-0.004626,0.057484,-0.052013,-0.042299,-0.014930,-0.067630,0.014060,-0.128773,0.022168
R135N,0.120981,-0.083602,0.005954,0.064894,-0.088813,-0.018985,-0.124834,0.050413,-0.061735,-0.025136,...,-0.063572,-0.004989,0.057836,-0.051759,-0.040987,-0.014490,-0.069512,0.014465,-0.127477,0.021782


In [6]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/r2/'
round_file_name_1 = 'r2_Round1.xlsx'
mmfunc_sequence = 'VKVTVPDKNPPCPCCSTRLNSVLALIDHLKGSHGKRRVCFRCAKCGRENFNHHSTVCHFAKCKGPSEEKPPVGEWICEVCGRDFTTKIGLGQHKRLAHPMVRNQERIDASQPKETSNRGAHKKCWTKEEEELLARLEVQFEGHKNINKLIAEHITTKTNKQISDKRRQMTRKDKGEGGAAGKLGPDTGRGNHSQAKVGNNGLGGNQLPGGPAATKDKAGCHLDKEEGNRIAISQQKKGRLQGRYHKEIKRRLEEGVINTFTKAFKQLLECQEVQPLINKTAQDCFGLLESACHIRTALRGKNKKETQEKPTGGQCLKWMKKRAVKKGNYLRFQRLFHLDRGKLARIILDDIECLSCDIAPSEIYSVFKARWETPGQFAGLGNFKSTGKADNKAFSDLITAKEIKKNVQEMSKGSAPGPDGIAIGDIKGMDPGYSRTAELFNLWLTSGEIPDMVRGCRTVLIPKSTQPERLKDINNWRPITIGSILLRLFSRIITARMTKACPLNPRQRGFIRAAGCSENLKLLQTIIRTAKSEHRPLGVVFVDIAKAFDTVSHQHILHVLQQRGVDPHIIGLVSNMYKDISTFVTTKKDTHTDKIQIRVGVKQGDPLSPLLFNLAMDPLLCKLEESGNGFHRGGHTITAMAFADDLVLLSDSWENMEKNIEILEAFCDLTGLKTQGQKCHGFYIKPTKDSYTVNNCAAWTIYGTPLNMINPGDSEKYLGLQIDPWTGIARSNISSKLDSWLERINQAPLKPLQKLDILKTYTIPRLTYMVDHSEMKAGALEALDLQIRSAVKDWLHLPSCTCDAILYVSTKDGGLGVTKLAGLIPSIQARRLHRIAQSPDETMKAFLDKEQMEKQYAKLWVQAGGKREKIPSIWDALPTPVLLTTSDTLSEWEAPNPKSKYPRPCNWRRKEFEKWTKLQCQGRGIQNFKGDVISNNWIQNYRRIPHRKLLTAVQLRANVYPTREFLGRGRGDDCVKFCRHCEVDLETCGHIISYCPVTKEARIKRHNRICERLIEEAEKKDWVVFKEPHIRDAVKELFKPDLIFVKEDRALVVDVTVRFEATTTSLEEAAIEKVDKYKRLETEVRSLTNAKDVLFMGFPLGARGKWYQGNFKLLDMLGLSESRQVTVAKTLSTDALISSVDIVHMFASKARKMNLVTV'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, mmfunc_sequence)
print(experimental_data_1)
df_list = [experimental_data_1]

  Variant   fitness updated_variant
0     23K  1.161144            L23K
1     79R  0.913773            V79R
2    115K  1.245699           T115K
3    141D  0.875473           E141D
4    271N  0.785357           Q271N
5    318F  1.262595           W318F
6    345H  0.820908           R345H
7    422P  1.115601           A422P
8      WT  1.000000              WT


In [7]:
iterations_one, labels_one = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_one
embeddings_pd = embeddings
labels_pd = labels_one
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [8]:
df_all
df_all.to_csv('r2/round1_all_new.csv', index=False)

In [9]:
df_test

Unnamed: 0,variant,y_pred,y_actual
11359,W318V,1.147273,
1121,W318I,1.134502,
15462,W318Y,1.132448,
14588,W318L,1.132188,
16561,W318S,1.126092,
...,...,...,...
4346,K144V,0.935839,
10158,E305H,0.935395,
10704,Q307H,0.934809,
13833,K531T,0.934760,


In [10]:
df_test.to_csv('r2/round1_predictions_new.csv', index=False)

## R2 round 2

In [4]:
# import brenan data
dataset_name = 'r2_esm2_t48_15B_UR50D'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/r2/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
G91K,0.125095,-0.086747,0.007329,0.066044,-0.088842,-0.018201,-0.123120,0.048890,-0.063238,-0.024220,...,-0.061586,-0.006344,0.057469,-0.050334,-0.042349,-0.016509,-0.068606,0.012836,-0.129086,0.022432
N927I,0.122180,-0.083631,0.008685,0.064573,-0.088776,-0.018910,-0.124121,0.050447,-0.062081,-0.025686,...,-0.062493,-0.004403,0.056985,-0.052900,-0.042592,-0.014429,-0.069569,0.015393,-0.128585,0.021870
I824V,0.121158,-0.084260,0.007549,0.064432,-0.089038,-0.019331,-0.123460,0.049893,-0.061687,-0.025476,...,-0.062508,-0.004760,0.057204,-0.052061,-0.043694,-0.014540,-0.068647,0.014702,-0.128467,0.022632
I257T,0.121469,-0.084672,0.007914,0.064696,-0.088860,-0.020061,-0.125147,0.049035,-0.062149,-0.025158,...,-0.062353,-0.005594,0.056296,-0.052189,-0.044065,-0.014480,-0.068425,0.014220,-0.128542,0.020876
V56A,0.121873,-0.084203,0.008424,0.064599,-0.088758,-0.020137,-0.123685,0.050389,-0.061743,-0.025748,...,-0.062738,-0.005220,0.057436,-0.051883,-0.043081,-0.014980,-0.067994,0.014596,-0.128305,0.022357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K1019H,0.121816,-0.084847,0.007153,0.065182,-0.088532,-0.019749,-0.123572,0.050393,-0.062215,-0.025013,...,-0.062319,-0.004719,0.056696,-0.052044,-0.043212,-0.014929,-0.069097,0.015089,-0.128408,0.022167
D812L,0.120452,-0.084344,0.007394,0.063507,-0.087286,-0.018784,-0.124111,0.047359,-0.061184,-0.024530,...,-0.063294,-0.005696,0.057122,-0.053182,-0.041089,-0.014853,-0.069275,0.015093,-0.128790,0.019908
F260L,0.121224,-0.084671,0.007690,0.064884,-0.089026,-0.019367,-0.124186,0.050523,-0.061800,-0.026249,...,-0.062903,-0.004626,0.057484,-0.052013,-0.042299,-0.014930,-0.067630,0.014060,-0.128773,0.022168
R135N,0.120981,-0.083602,0.005954,0.064894,-0.088813,-0.018985,-0.124834,0.050413,-0.061735,-0.025136,...,-0.063572,-0.004989,0.057836,-0.051759,-0.040987,-0.014490,-0.069512,0.014465,-0.127477,0.021782


In [12]:
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/r2/'
round_file_name_1 = 'r2_Round1.xlsx'
round_file_name_2 = 'r2_Round2.xlsx'
mmfunc_sequence = 'VKVTVPDKNPPCPCCSTRLNSVLALIDHLKGSHGKRRVCFRCAKCGRENFNHHSTVCHFAKCKGPSEEKPPVGEWICEVCGRDFTTKIGLGQHKRLAHPMVRNQERIDASQPKETSNRGAHKKCWTKEEEELLARLEVQFEGHKNINKLIAEHITTKTNKQISDKRRQMTRKDKGEGGAAGKLGPDTGRGNHSQAKVGNNGLGGNQLPGGPAATKDKAGCHLDKEEGNRIAISQQKKGRLQGRYHKEIKRRLEEGVINTFTKAFKQLLECQEVQPLINKTAQDCFGLLESACHIRTALRGKNKKETQEKPTGGQCLKWMKKRAVKKGNYLRFQRLFHLDRGKLARIILDDIECLSCDIAPSEIYSVFKARWETPGQFAGLGNFKSTGKADNKAFSDLITAKEIKKNVQEMSKGSAPGPDGIAIGDIKGMDPGYSRTAELFNLWLTSGEIPDMVRGCRTVLIPKSTQPERLKDINNWRPITIGSILLRLFSRIITARMTKACPLNPRQRGFIRAAGCSENLKLLQTIIRTAKSEHRPLGVVFVDIAKAFDTVSHQHILHVLQQRGVDPHIIGLVSNMYKDISTFVTTKKDTHTDKIQIRVGVKQGDPLSPLLFNLAMDPLLCKLEESGNGFHRGGHTITAMAFADDLVLLSDSWENMEKNIEILEAFCDLTGLKTQGQKCHGFYIKPTKDSYTVNNCAAWTIYGTPLNMINPGDSEKYLGLQIDPWTGIARSNISSKLDSWLERINQAPLKPLQKLDILKTYTIPRLTYMVDHSEMKAGALEALDLQIRSAVKDWLHLPSCTCDAILYVSTKDGGLGVTKLAGLIPSIQARRLHRIAQSPDETMKAFLDKEQMEKQYAKLWVQAGGKREKIPSIWDALPTPVLLTTSDTLSEWEAPNPKSKYPRPCNWRRKEFEKWTKLQCQGRGIQNFKGDVISNNWIQNYRRIPHRKLLTAVQLRANVYPTREFLGRGRGDDCVKFCRHCEVDLETCGHIISYCPVTKEARIKRHNRICERLIEEAEKKDWVVFKEPHIRDAVKELFKPDLIFVKEDRALVVDVTVRFEATTTSLEEAAIEKVDKYKRLETEVRSLTNAKDVLFMGFPLGARGKWYQGNFKLLDMLGLSESRQVTVAKTLSTDALISSVDIVHMFASKARKMNLVTV'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, mmfunc_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, mmfunc_sequence)
print(experimental_data_1, experimental_data_2)
df_list = [experimental_data_1, experimental_data_2]

  Variant   fitness updated_variant
0     23K  1.161144            L23K
1     79R  0.913773            V79R
2    115K  1.245699           T115K
3    141D  0.875473           E141D
4    271N  0.785357           Q271N
5    318F  1.262595           W318F
6    345H  0.820908           R345H
7    422P  1.115601           A422P
8      WT  1.000000              WT   Variant   fitness updated_variant
0      WT  1.000000              WT
1    318I  0.110506           W318I
2    318L  2.482788           W318L
3    318S  2.538343           W318S
4    318P  2.207212           W318P
5    318E  2.126198           W318E
6    318M  1.898696           W318M
7    318K  2.627626           W318K
8    318Q  2.346451           W318Q
9    318A  1.538110           W318A


In [13]:
iterations_two, labels_two = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_two
embeddings_pd = embeddings
labels_pd = labels_two
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [14]:
df_all
df_all.to_csv('r2/round2_all_new.csv', index=False)

In [15]:
df_test

Unnamed: 0,variant,y_pred,y_actual
6219,W318T,2.129203,
5463,W318R,2.116551,
11932,W318D,2.097647,
826,W318N,2.066087,
9693,W318G,1.868245,
...,...,...,...
21649,R535K,0.936748,
18983,K249V,0.934365,
4719,G300A,0.931492,
6841,K2Y,0.931119,


In [16]:
df_test.to_csv('r2/round2_predictions_new.csv', index=False)

## R2 round 3 alternate

In [4]:
# import brenan data
dataset_name = 'r2_esm2_t48_15B_UR50D'
base_path = 'r2/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
G91K,0.125095,-0.086747,0.007329,0.066044,-0.088842,-0.018201,-0.123120,0.048890,-0.063238,-0.024220,...,-0.061586,-0.006344,0.057469,-0.050334,-0.042349,-0.016509,-0.068606,0.012836,-0.129086,0.022432
N927I,0.122180,-0.083631,0.008685,0.064573,-0.088776,-0.018910,-0.124121,0.050447,-0.062081,-0.025686,...,-0.062493,-0.004403,0.056985,-0.052900,-0.042592,-0.014429,-0.069569,0.015393,-0.128585,0.021870
I824V,0.121158,-0.084260,0.007549,0.064432,-0.089038,-0.019331,-0.123460,0.049893,-0.061687,-0.025476,...,-0.062508,-0.004760,0.057204,-0.052061,-0.043694,-0.014540,-0.068647,0.014702,-0.128467,0.022632
I257T,0.121469,-0.084672,0.007914,0.064696,-0.088860,-0.020061,-0.125147,0.049035,-0.062149,-0.025158,...,-0.062353,-0.005594,0.056296,-0.052189,-0.044065,-0.014480,-0.068425,0.014220,-0.128542,0.020876
V56A,0.121873,-0.084203,0.008424,0.064599,-0.088758,-0.020137,-0.123685,0.050389,-0.061743,-0.025748,...,-0.062738,-0.005220,0.057436,-0.051883,-0.043081,-0.014980,-0.067994,0.014596,-0.128305,0.022357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K1019H,0.121816,-0.084847,0.007153,0.065182,-0.088532,-0.019749,-0.123572,0.050393,-0.062215,-0.025013,...,-0.062319,-0.004719,0.056696,-0.052044,-0.043212,-0.014929,-0.069097,0.015089,-0.128408,0.022167
D812L,0.120452,-0.084344,0.007394,0.063507,-0.087286,-0.018784,-0.124111,0.047359,-0.061184,-0.024530,...,-0.063294,-0.005696,0.057122,-0.053182,-0.041089,-0.014853,-0.069275,0.015093,-0.128790,0.019908
F260L,0.121224,-0.084671,0.007690,0.064884,-0.089026,-0.019367,-0.124186,0.050523,-0.061800,-0.026249,...,-0.062903,-0.004626,0.057484,-0.052013,-0.042299,-0.014930,-0.067630,0.014060,-0.128773,0.022168
R135N,0.120981,-0.083602,0.005954,0.064894,-0.088813,-0.018985,-0.124834,0.050413,-0.061735,-0.025136,...,-0.063572,-0.004989,0.057836,-0.051759,-0.040987,-0.014490,-0.069512,0.014465,-0.127477,0.021782


In [6]:
base_path = 'r2/'
round_file_name_1 = 'r2_Round1.xlsx'
round_file_name_2 = 'r2_Round2.xlsx'
round_file_name_3 = 'r2_Round3_alternate.xlsx'
r2_sequence = 'VKVTVPDKNPPCPCCSTRLNSVLALIDHLKGSHGKRRVCFRCAKCGRENFNHHSTVCHFAKCKGPSEEKPPVGEWICEVCGRDFTTKIGLGQHKRLAHPMVRNQERIDASQPKETSNRGAHKKCWTKEEEELLARLEVQFEGHKNINKLIAEHITTKTNKQISDKRRQMTRKDKGEGGAAGKLGPDTGRGNHSQAKVGNNGLGGNQLPGGPAATKDKAGCHLDKEEGNRIAISQQKKGRLQGRYHKEIKRRLEEGVINTFTKAFKQLLECQEVQPLINKTAQDCFGLLESACHIRTALRGKNKKETQEKPTGGQCLKWMKKRAVKKGNYLRFQRLFHLDRGKLARIILDDIECLSCDIAPSEIYSVFKARWETPGQFAGLGNFKSTGKADNKAFSDLITAKEIKKNVQEMSKGSAPGPDGIAIGDIKGMDPGYSRTAELFNLWLTSGEIPDMVRGCRTVLIPKSTQPERLKDINNWRPITIGSILLRLFSRIITARMTKACPLNPRQRGFIRAAGCSENLKLLQTIIRTAKSEHRPLGVVFVDIAKAFDTVSHQHILHVLQQRGVDPHIIGLVSNMYKDISTFVTTKKDTHTDKIQIRVGVKQGDPLSPLLFNLAMDPLLCKLEESGNGFHRGGHTITAMAFADDLVLLSDSWENMEKNIEILEAFCDLTGLKTQGQKCHGFYIKPTKDSYTVNNCAAWTIYGTPLNMINPGDSEKYLGLQIDPWTGIARSNISSKLDSWLERINQAPLKPLQKLDILKTYTIPRLTYMVDHSEMKAGALEALDLQIRSAVKDWLHLPSCTCDAILYVSTKDGGLGVTKLAGLIPSIQARRLHRIAQSPDETMKAFLDKEQMEKQYAKLWVQAGGKREKIPSIWDALPTPVLLTTSDTLSEWEAPNPKSKYPRPCNWRRKEFEKWTKLQCQGRGIQNFKGDVISNNWIQNYRRIPHRKLLTAVQLRANVYPTREFLGRGRGDDCVKFCRHCEVDLETCGHIISYCPVTKEARIKRHNRICERLIEEAEKKDWVVFKEPHIRDAVKELFKPDLIFVKEDRALVVDVTVRFEATTTSLEEAAIEKVDKYKRLETEVRSLTNAKDVLFMGFPLGARGKWYQGNFKLLDMLGLSESRQVTVAKTLSTDALISSVDIVHMFASKARKMNLVTV'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, r2_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, r2_sequence)
experimental_data_3 = read_experimental_data(base_path, round_file_name_3, r2_sequence)
print(experimental_data_1, experimental_data_2, experimental_data_3)
df_list = [experimental_data_1, experimental_data_2, experimental_data_3]

  Variant   fitness updated_variant
0     23K  1.161144            L23K
1     79R  0.913773            V79R
2    115K  1.245699           T115K
3    141D  0.875473           E141D
4    271N  0.785357           Q271N
5    318F  1.262595           W318F
6    345H  0.820908           R345H
7    422P  1.115601           A422P
8      WT  1.000000              WT   Variant   fitness updated_variant
0      WT  1.000000              WT
1    318I  0.110506           W318I
2    318L  2.482788           W318L
3    318S  2.538343           W318S
4    318P  2.207212           W318P
5    318E  2.126198           W318E
6    318M  1.898696           W318M
7    318K  2.627626           W318K
8    318Q  2.346451           W318Q
9    318A  1.538110           W318A    Variant   fitness updated_variant
0       WT  1.000000              WT
1     318T  0.826489           W318T
2     318R  0.784169           W318R
3     678Q  1.155724           K678Q
4     318D  0.717124           W318D
5     318N  0.904255  

In [7]:
iterations_three, labels_three = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_three
embeddings_pd = embeddings
labels_pd = labels_three
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [8]:
df_all
df_all.to_csv('r2/round3_alternate_all_new.csv', index=False)

In [9]:
df_test

Unnamed: 0,variant,y_pred,y_actual
20824,L646C,1.586598,
20558,T155G,1.560069,
68,I162C,1.494450,
12754,W740V,1.486333,
15000,C45V,1.481305,
...,...,...,...
7129,R345N,0.905186,
16097,M410H,0.900743,
9346,L252T,0.897272,
9685,E225T,0.897167,


In [10]:
df_test.to_csv('r2/round3_alternate_predictions_new.csv', index=False)

## R2 round 3

In [4]:
# import brenan data
dataset_name = 'r2_esm2_t48_15B_UR50D'
base_path = 'r2/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
G91K,0.125095,-0.086747,0.007329,0.066044,-0.088842,-0.018201,-0.123120,0.048890,-0.063238,-0.024220,...,-0.061586,-0.006344,0.057469,-0.050334,-0.042349,-0.016509,-0.068606,0.012836,-0.129086,0.022432
N927I,0.122180,-0.083631,0.008685,0.064573,-0.088776,-0.018910,-0.124121,0.050447,-0.062081,-0.025686,...,-0.062493,-0.004403,0.056985,-0.052900,-0.042592,-0.014429,-0.069569,0.015393,-0.128585,0.021870
I824V,0.121158,-0.084260,0.007549,0.064432,-0.089038,-0.019331,-0.123460,0.049893,-0.061687,-0.025476,...,-0.062508,-0.004760,0.057204,-0.052061,-0.043694,-0.014540,-0.068647,0.014702,-0.128467,0.022632
I257T,0.121469,-0.084672,0.007914,0.064696,-0.088860,-0.020061,-0.125147,0.049035,-0.062149,-0.025158,...,-0.062353,-0.005594,0.056296,-0.052189,-0.044065,-0.014480,-0.068425,0.014220,-0.128542,0.020876
V56A,0.121873,-0.084203,0.008424,0.064599,-0.088758,-0.020137,-0.123685,0.050389,-0.061743,-0.025748,...,-0.062738,-0.005220,0.057436,-0.051883,-0.043081,-0.014980,-0.067994,0.014596,-0.128305,0.022357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K1019H,0.121816,-0.084847,0.007153,0.065182,-0.088532,-0.019749,-0.123572,0.050393,-0.062215,-0.025013,...,-0.062319,-0.004719,0.056696,-0.052044,-0.043212,-0.014929,-0.069097,0.015089,-0.128408,0.022167
D812L,0.120452,-0.084344,0.007394,0.063507,-0.087286,-0.018784,-0.124111,0.047359,-0.061184,-0.024530,...,-0.063294,-0.005696,0.057122,-0.053182,-0.041089,-0.014853,-0.069275,0.015093,-0.128790,0.019908
F260L,0.121224,-0.084671,0.007690,0.064884,-0.089026,-0.019367,-0.124186,0.050523,-0.061800,-0.026249,...,-0.062903,-0.004626,0.057484,-0.052013,-0.042299,-0.014930,-0.067630,0.014060,-0.128773,0.022168
R135N,0.120981,-0.083602,0.005954,0.064894,-0.088813,-0.018985,-0.124834,0.050413,-0.061735,-0.025136,...,-0.063572,-0.004989,0.057836,-0.051759,-0.040987,-0.014490,-0.069512,0.014465,-0.127477,0.021782


In [6]:
base_path = 'r2/'
round_file_name_1 = 'r2_Round1.xlsx'
round_file_name_2 = 'r2_Round2.xlsx'
round_file_name_3 = 'r2_Round3.xlsx'
r2_sequence = 'VKVTVPDKNPPCPCCSTRLNSVLALIDHLKGSHGKRRVCFRCAKCGRENFNHHSTVCHFAKCKGPSEEKPPVGEWICEVCGRDFTTKIGLGQHKRLAHPMVRNQERIDASQPKETSNRGAHKKCWTKEEEELLARLEVQFEGHKNINKLIAEHITTKTNKQISDKRRQMTRKDKGEGGAAGKLGPDTGRGNHSQAKVGNNGLGGNQLPGGPAATKDKAGCHLDKEEGNRIAISQQKKGRLQGRYHKEIKRRLEEGVINTFTKAFKQLLECQEVQPLINKTAQDCFGLLESACHIRTALRGKNKKETQEKPTGGQCLKWMKKRAVKKGNYLRFQRLFHLDRGKLARIILDDIECLSCDIAPSEIYSVFKARWETPGQFAGLGNFKSTGKADNKAFSDLITAKEIKKNVQEMSKGSAPGPDGIAIGDIKGMDPGYSRTAELFNLWLTSGEIPDMVRGCRTVLIPKSTQPERLKDINNWRPITIGSILLRLFSRIITARMTKACPLNPRQRGFIRAAGCSENLKLLQTIIRTAKSEHRPLGVVFVDIAKAFDTVSHQHILHVLQQRGVDPHIIGLVSNMYKDISTFVTTKKDTHTDKIQIRVGVKQGDPLSPLLFNLAMDPLLCKLEESGNGFHRGGHTITAMAFADDLVLLSDSWENMEKNIEILEAFCDLTGLKTQGQKCHGFYIKPTKDSYTVNNCAAWTIYGTPLNMINPGDSEKYLGLQIDPWTGIARSNISSKLDSWLERINQAPLKPLQKLDILKTYTIPRLTYMVDHSEMKAGALEALDLQIRSAVKDWLHLPSCTCDAILYVSTKDGGLGVTKLAGLIPSIQARRLHRIAQSPDETMKAFLDKEQMEKQYAKLWVQAGGKREKIPSIWDALPTPVLLTTSDTLSEWEAPNPKSKYPRPCNWRRKEFEKWTKLQCQGRGIQNFKGDVISNNWIQNYRRIPHRKLLTAVQLRANVYPTREFLGRGRGDDCVKFCRHCEVDLETCGHIISYCPVTKEARIKRHNRICERLIEEAEKKDWVVFKEPHIRDAVKELFKPDLIFVKEDRALVVDVTVRFEATTTSLEEAAIEKVDKYKRLETEVRSLTNAKDVLFMGFPLGARGKWYQGNFKLLDMLGLSESRQVTVAKTLSTDALISSVDIVHMFASKARKMNLVTV'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, r2_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, r2_sequence)
experimental_data_3 = read_experimental_data(base_path, round_file_name_3, r2_sequence)
print(experimental_data_1, experimental_data_2, experimental_data_3)
df_list = [experimental_data_1, experimental_data_2, experimental_data_3]

  Variant   fitness updated_variant
0     23K  1.161144            L23K
1     79R  0.913773            V79R
2    115K  1.245699           T115K
3    141D  0.875473           E141D
4    271N  0.785357           Q271N
5    318F  1.262595           W318F
6    345H  0.820908           R345H
7    422P  1.115601           A422P
8      WT  1.000000              WT   Variant   fitness updated_variant
0      WT  1.000000              WT
1    318I  0.110506           W318I
2    318L  2.482788           W318L
3    318S  2.538343           W318S
4    318P  2.207212           W318P
5    318E  2.126198           W318E
6    318M  1.898696           W318M
7    318K  2.627626           W318K
8    318Q  2.346451           W318Q
9    318A  1.538110           W318A    Variant   fitness updated_variant
0       WT  1.000000              WT
1     318T  0.995178           W318T
2     318R  0.856420           W318R
3     678Q  1.158733           K678Q
4     318D  0.594624           W318D
5     318N  0.769030  

In [7]:
iterations_three, labels_three = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_three
embeddings_pd = embeddings
labels_pd = labels_three
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [8]:
df_all
df_all.to_csv('r2/round3_all_new.csv', index=False)

In [9]:
df_test

Unnamed: 0,variant,y_pred,y_actual
20558,T155G,1.491538,
7409,C45E,1.485510,
68,I162C,1.470536,
20824,L646C,1.454224,
15164,C45A,1.452734,
...,...,...,...
1898,P1099D,0.897408,
11900,M319T,0.896799,
7129,R345N,0.894031,
9092,R331D,0.890328,


In [10]:
df_test.to_csv('r2/round3_predictions_new.csv', index=False)

## MLV Round 1

In [70]:
# import brenan data
dataset_name = 'mlv_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [71]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
M320I,0.026940,0.037588,0.066551,0.099030,0.021716,0.014472,-0.131924,0.094308,-0.057852,-0.113178,...,-0.089854,0.024963,0.008938,-0.106676,-0.015563,0.057679,0.034707,0.010038,-0.153705,-0.081625
E282D,0.026505,0.037227,0.065662,0.098670,0.023132,0.015304,-0.130645,0.094127,-0.056908,-0.112610,...,-0.089899,0.027284,0.008688,-0.107788,-0.015923,0.059264,0.034138,0.012846,-0.151624,-0.081127
Q383G,0.026118,0.038315,0.065061,0.098543,0.023923,0.014943,-0.129731,0.092198,-0.055923,-0.111561,...,-0.089921,0.024331,0.006300,-0.106282,-0.016044,0.057277,0.035095,0.012304,-0.155163,-0.080189
E372S,0.022405,0.040278,0.064326,0.098013,0.021698,0.016157,-0.132202,0.093713,-0.056204,-0.113769,...,-0.089035,0.027543,0.006830,-0.105350,-0.015514,0.056943,0.034686,0.010381,-0.154470,-0.077426
I672M,0.025102,0.038173,0.065741,0.098679,0.022948,0.014640,-0.131515,0.092997,-0.055775,-0.113919,...,-0.088714,0.028256,0.006979,-0.106508,-0.015773,0.058601,0.034031,0.012295,-0.154616,-0.081792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L395V,0.026370,0.035806,0.065681,0.098617,0.022640,0.015231,-0.131188,0.093099,-0.056186,-0.113474,...,-0.088710,0.028330,0.009087,-0.106295,-0.013857,0.057777,0.035376,0.013376,-0.154173,-0.082881
V122A,0.027030,0.039869,0.063317,0.098480,0.023198,0.014039,-0.130743,0.095194,-0.055028,-0.112952,...,-0.090134,0.026029,0.008298,-0.105948,-0.015715,0.058151,0.033629,0.011017,-0.154426,-0.080128
A536T,0.026795,0.037209,0.065507,0.097628,0.024206,0.015874,-0.132245,0.094248,-0.057104,-0.113739,...,-0.089233,0.026587,0.007957,-0.107605,-0.015971,0.057966,0.034426,0.012013,-0.152568,-0.081039
D371W,0.023449,0.037792,0.064348,0.098553,0.021913,0.014635,-0.130733,0.093426,-0.055363,-0.112917,...,-0.088215,0.026495,0.006595,-0.106479,-0.013371,0.058535,0.033883,0.013393,-0.156771,-0.084479


In [72]:
base_path = 'mlv/'
round_file_name_1 = 'mlv_Round1.xlsx'
mlv_sequence = 'TLNIEDEYRLHETSKEPDVSLGSTWLSDFPQAWAETGGMGLAVRQAPLIIPLKATSTPVSIKQYPMSQEARLGIKPHIQRLLDQGILVPCQSPWNTPLLPVKKPGTNDYRPVQDLREVNKRVEDIHPTVPNPYNLLSGLPPSHQWYTVLDLKDAFFCLRLHPTSQPLFAFEWRDPEMGISGQLTWTRLPQGFKNSPTLFNEALHRDLADFRIQHPDLILLQYVDDLLLAATSELDCQQGTRALLQTLGNLGYRASAKKAQICQKQVKYLGYLLKEGQRWLTEARKETVMGQPTPKTPRQLREFLGKAGFCRLFIPGFAEMAAPLYPLTKPGTLFNWGPDQQKAYQEIKQALLTAPALGLPDLTKPFELFVDEKQGYAKGVLTQKLGPWRRPVAYLSKKLDPVAAGWPPCLRMVAAIAVLTKDAGKLTMGQPLVILAPHAVEALVKQPPDRWLSNARMTHYQALLLDTDRVQFGPVVALNPATLLPLPEEGLQHNCLDILAEAHGTRPDLTDQPLPDADHTWYTDGSSLLQEGQRKAGAAVTTETEVIWAKALPAGTSAQRAELIALTQALKMAEGKKLNVYTDSRYAFATAHIHGEIYRRRGWLTSEGKEIKNKDEILALLKALFLPKRLSIIHCPGHQKGHSAEARGNRMADQAARKAAITETPDTSTLLIENSSP'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, mlv_sequence)
print(experimental_data_1)
df_list = [experimental_data_1]

  Variant  fitness_raw   fitness updated_variant
0     24K     0.790631  0.871999            T24K
1     59T     0.848395  0.935708            V59T
2    116K     0.447482  0.493534           R116K
3    183M     0.806511  0.889514           L183M
4    272N     0.647005  0.713592           L272N
5    319F     0.632159  0.697218           E319F
6    346H     0.789644  0.870911           E346H
7    377Y     0.440696  0.486051           A377Y
8    423P     0.169476  0.186918           A423P
9      WT     0.906688  1.000000              WT


In [73]:
iterations_one, labels_one = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_one
embeddings_pd = embeddings
labels_pd = labels_one
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [74]:
df_all
df_all.to_csv('mlv/round1_all_new.csv', index=False)

In [75]:
df_test

Unnamed: 0,variant,y_pred,y_actual
5352,L52M,0.801629,
12400,E501D,0.800033,
718,L362V,0.799894,
4993,E346T,0.798471,
10526,A549C,0.796840,
...,...,...,...
5160,A423L,0.559473,
6405,F588W,0.559018,
2151,A46G,0.544397,
4978,A423W,0.532271,


In [76]:
df_test.to_csv('mlv/round1_predictions_new.csv', index=False)

## MLV Round 2

In [6]:
# import brenan data
dataset_name = 'mlv_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [7]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
M320I,0.026940,0.037588,0.066551,0.099030,0.021716,0.014472,-0.131924,0.094308,-0.057852,-0.113178,...,-0.089854,0.024963,0.008938,-0.106676,-0.015563,0.057679,0.034707,0.010038,-0.153705,-0.081625
E282D,0.026505,0.037227,0.065662,0.098670,0.023132,0.015304,-0.130645,0.094127,-0.056908,-0.112610,...,-0.089899,0.027284,0.008688,-0.107788,-0.015923,0.059264,0.034138,0.012846,-0.151624,-0.081127
Q383G,0.026118,0.038315,0.065061,0.098543,0.023923,0.014943,-0.129731,0.092198,-0.055923,-0.111561,...,-0.089921,0.024331,0.006300,-0.106282,-0.016044,0.057277,0.035095,0.012304,-0.155163,-0.080189
E372S,0.022405,0.040278,0.064326,0.098013,0.021698,0.016157,-0.132202,0.093713,-0.056204,-0.113769,...,-0.089035,0.027543,0.006830,-0.105350,-0.015514,0.056943,0.034686,0.010381,-0.154470,-0.077426
I672M,0.025102,0.038173,0.065741,0.098679,0.022948,0.014640,-0.131515,0.092997,-0.055775,-0.113919,...,-0.088714,0.028256,0.006979,-0.106508,-0.015773,0.058601,0.034031,0.012295,-0.154616,-0.081792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L395V,0.026370,0.035806,0.065681,0.098617,0.022640,0.015231,-0.131188,0.093099,-0.056186,-0.113474,...,-0.088710,0.028330,0.009087,-0.106295,-0.013857,0.057777,0.035376,0.013376,-0.154173,-0.082881
V122A,0.027030,0.039869,0.063317,0.098480,0.023198,0.014039,-0.130743,0.095194,-0.055028,-0.112952,...,-0.090134,0.026029,0.008298,-0.105948,-0.015715,0.058151,0.033629,0.011017,-0.154426,-0.080128
A536T,0.026795,0.037209,0.065507,0.097628,0.024206,0.015874,-0.132245,0.094248,-0.057104,-0.113739,...,-0.089233,0.026587,0.007957,-0.107605,-0.015971,0.057966,0.034426,0.012013,-0.152568,-0.081039
D371W,0.023449,0.037792,0.064348,0.098553,0.021913,0.014635,-0.130733,0.093426,-0.055363,-0.112917,...,-0.088215,0.026495,0.006595,-0.106479,-0.013371,0.058535,0.033883,0.013393,-0.156771,-0.084479


In [8]:
base_path = 'mlv/'
round_file_name_1 = 'mlv_Round1.xlsx'
round_file_name_2 = 'mlv_Round2.xlsx'
mlv_sequence = 'TLNIEDEYRLHETSKEPDVSLGSTWLSDFPQAWAETGGMGLAVRQAPLIIPLKATSTPVSIKQYPMSQEARLGIKPHIQRLLDQGILVPCQSPWNTPLLPVKKPGTNDYRPVQDLREVNKRVEDIHPTVPNPYNLLSGLPPSHQWYTVLDLKDAFFCLRLHPTSQPLFAFEWRDPEMGISGQLTWTRLPQGFKNSPTLFNEALHRDLADFRIQHPDLILLQYVDDLLLAATSELDCQQGTRALLQTLGNLGYRASAKKAQICQKQVKYLGYLLKEGQRWLTEARKETVMGQPTPKTPRQLREFLGKAGFCRLFIPGFAEMAAPLYPLTKPGTLFNWGPDQQKAYQEIKQALLTAPALGLPDLTKPFELFVDEKQGYAKGVLTQKLGPWRRPVAYLSKKLDPVAAGWPPCLRMVAAIAVLTKDAGKLTMGQPLVILAPHAVEALVKQPPDRWLSNARMTHYQALLLDTDRVQFGPVVALNPATLLPLPEEGLQHNCLDILAEAHGTRPDLTDQPLPDADHTWYTDGSSLLQEGQRKAGAAVTTETEVIWAKALPAGTSAQRAELIALTQALKMAEGKKLNVYTDSRYAFATAHIHGEIYRRRGWLTSEGKEIKNKDEILALLKALFLPKRLSIIHCPGHQKGHSAEARGNRMADQAARKAAITETPDTSTLLIENSSP'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, mlv_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, mlv_sequence)
print(experimental_data_1, experimental_data_2)
df_list = [experimental_data_1, experimental_data_2]

  Variant  fitness_raw   fitness updated_variant
0     24K     0.790631  0.871999            T24K
1     59T     0.848395  0.935708            V59T
2    116K     0.447482  0.493534           R116K
3    183M     0.806511  0.889514           L183M
4    272N     0.647005  0.713592           L272N
5    319F     0.632159  0.697218           E319F
6    346H     0.789644  0.870911           E346H
7    377Y     0.440696  0.486051           A377Y
8    423P     0.169476  0.186918           A423P
9      WT     0.906688  1.000000              WT    Variant   fitness updated_variant
0      52M  0.814861            L52M
1     501D  0.882929           E501D
2     362V  0.762152           L362V
3     346T  0.861033           E346T
4      13M  0.961874            T13M
5     419M  1.000497           L419M
6     528M  0.913440           L528M
7     333M  0.943176           L333M
8     491H  0.932670           L491H
9     352M  0.898531           L352M
10    671T  0.999982           L671T
11      WT  1.000

In [9]:
iterations_two, labels_two = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_two
embeddings_pd = embeddings
labels_pd = labels_two
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [10]:
df_all
df_all.to_csv('mlv/round2_all_new.csv', index=False)

In [11]:
df_test

Unnamed: 0,variant,y_pred,y_actual
431,L670I,0.922071,
8916,S675T,0.918968,
10652,L671I,0.918870,
6982,S668T,0.913477,
12270,L670V,0.910197,
...,...,...,...
6767,R411Q,0.585869,
4978,A423W,0.583864,
7645,R585T,0.583376,
3139,R456Q,0.578803,


In [12]:
df_test.to_csv('mlv/round2_predictions_new.csv', index=False)

## MLV Round 3

In [4]:
# import brenan data
dataset_name = 'mlv_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
M320I,0.026940,0.037588,0.066551,0.099030,0.021716,0.014472,-0.131924,0.094308,-0.057852,-0.113178,...,-0.089854,0.024963,0.008938,-0.106676,-0.015563,0.057679,0.034707,0.010038,-0.153705,-0.081625
E282D,0.026505,0.037227,0.065662,0.098670,0.023132,0.015304,-0.130645,0.094127,-0.056908,-0.112610,...,-0.089899,0.027284,0.008688,-0.107788,-0.015923,0.059264,0.034138,0.012846,-0.151624,-0.081127
Q383G,0.026118,0.038315,0.065061,0.098543,0.023923,0.014943,-0.129731,0.092198,-0.055923,-0.111561,...,-0.089921,0.024331,0.006300,-0.106282,-0.016044,0.057277,0.035095,0.012304,-0.155163,-0.080189
E372S,0.022405,0.040278,0.064326,0.098013,0.021698,0.016157,-0.132202,0.093713,-0.056204,-0.113769,...,-0.089035,0.027543,0.006830,-0.105350,-0.015514,0.056943,0.034686,0.010381,-0.154470,-0.077426
I672M,0.025102,0.038173,0.065741,0.098679,0.022948,0.014640,-0.131515,0.092997,-0.055775,-0.113919,...,-0.088714,0.028256,0.006979,-0.106508,-0.015773,0.058601,0.034031,0.012295,-0.154616,-0.081792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L395V,0.026370,0.035806,0.065681,0.098617,0.022640,0.015231,-0.131188,0.093099,-0.056186,-0.113474,...,-0.088710,0.028330,0.009087,-0.106295,-0.013857,0.057777,0.035376,0.013376,-0.154173,-0.082881
V122A,0.027030,0.039869,0.063317,0.098480,0.023198,0.014039,-0.130743,0.095194,-0.055028,-0.112952,...,-0.090134,0.026029,0.008298,-0.105948,-0.015715,0.058151,0.033629,0.011017,-0.154426,-0.080128
A536T,0.026795,0.037209,0.065507,0.097628,0.024206,0.015874,-0.132245,0.094248,-0.057104,-0.113739,...,-0.089233,0.026587,0.007957,-0.107605,-0.015971,0.057966,0.034426,0.012013,-0.152568,-0.081039
D371W,0.023449,0.037792,0.064348,0.098553,0.021913,0.014635,-0.130733,0.093426,-0.055363,-0.112917,...,-0.088215,0.026495,0.006595,-0.106479,-0.013371,0.058535,0.033883,0.013393,-0.156771,-0.084479


In [7]:
base_path = 'mlv/'
round_file_name_1 = 'mlv_Round1.xlsx'
round_file_name_2 = 'mlv_Round2.xlsx'
round_file_name_3 = 'mlv_Round3.xlsx'
mlv_sequence = 'TLNIEDEYRLHETSKEPDVSLGSTWLSDFPQAWAETGGMGLAVRQAPLIIPLKATSTPVSIKQYPMSQEARLGIKPHIQRLLDQGILVPCQSPWNTPLLPVKKPGTNDYRPVQDLREVNKRVEDIHPTVPNPYNLLSGLPPSHQWYTVLDLKDAFFCLRLHPTSQPLFAFEWRDPEMGISGQLTWTRLPQGFKNSPTLFNEALHRDLADFRIQHPDLILLQYVDDLLLAATSELDCQQGTRALLQTLGNLGYRASAKKAQICQKQVKYLGYLLKEGQRWLTEARKETVMGQPTPKTPRQLREFLGKAGFCRLFIPGFAEMAAPLYPLTKPGTLFNWGPDQQKAYQEIKQALLTAPALGLPDLTKPFELFVDEKQGYAKGVLTQKLGPWRRPVAYLSKKLDPVAAGWPPCLRMVAAIAVLTKDAGKLTMGQPLVILAPHAVEALVKQPPDRWLSNARMTHYQALLLDTDRVQFGPVVALNPATLLPLPEEGLQHNCLDILAEAHGTRPDLTDQPLPDADHTWYTDGSSLLQEGQRKAGAAVTTETEVIWAKALPAGTSAQRAELIALTQALKMAEGKKLNVYTDSRYAFATAHIHGEIYRRRGWLTSEGKEIKNKDEILALLKALFLPKRLSIIHCPGHQKGHSAEARGNRMADQAARKAAITETPDTSTLLIENSSP'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, mlv_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, mlv_sequence)
experimental_data_3 = read_experimental_data(base_path, round_file_name_3, mlv_sequence)
print(experimental_data_1, experimental_data_2, experimental_data_3)
df_list = [experimental_data_1, experimental_data_2, experimental_data_3]

  Variant  fitness_raw   fitness updated_variant
0     24K     0.790631  0.871999            T24K
1     59T     0.848395  0.935708            V59T
2    116K     0.447482  0.493534           R116K
3    183M     0.806511  0.889514           L183M
4    272N     0.647005  0.713592           L272N
5    319F     0.632159  0.697218           E319F
6    346H     0.789644  0.870911           E346H
7    377Y     0.440696  0.486051           A377Y
8    423P     0.169476  0.186918           A423P
9      WT     0.906688  1.000000              WT    Variant   fitness updated_variant
0      52M  0.814861            L52M
1     501D  0.882929           E501D
2     362V  0.762152           L362V
3     346T  0.861033           E346T
4      13M  0.961874            T13M
5     419M  1.000497           L419M
6     528M  0.913440           L528M
7     333M  0.943176           L333M
8     491H  0.932670           L491H
9     352M  0.898531           L352M
10    671T  0.999982           L671T
11      WT  1.000

In [8]:
iterations_three, labels_three = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_three
embeddings_pd = embeddings
labels_pd = labels_three
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [9]:
df_all
df_all.to_csv('mlv/round3_all_new.csv', index=False)

In [10]:
df_test

Unnamed: 0,variant,y_pred,y_actual
511,L670K,1.104138,
2624,L670N,1.099864,
6966,L671R,1.089488,
5177,L333R,1.080575,
5469,L671V,1.073896,
...,...,...,...
11365,R311H,0.689304,
3468,R456C,0.682787,
2223,R116Q,0.680474,
5160,A423L,0.678149,


In [11]:
df_test.to_csv('mlv/round3_predictions_new.csv', index=False)

## MLV Round 4

In [4]:
# import brenan data
dataset_name = 'mlv_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
M320I,0.026940,0.037588,0.066551,0.099030,0.021716,0.014472,-0.131924,0.094308,-0.057852,-0.113178,...,-0.089854,0.024963,0.008938,-0.106676,-0.015563,0.057679,0.034707,0.010038,-0.153705,-0.081625
E282D,0.026505,0.037227,0.065662,0.098670,0.023132,0.015304,-0.130645,0.094127,-0.056908,-0.112610,...,-0.089899,0.027284,0.008688,-0.107788,-0.015923,0.059264,0.034138,0.012846,-0.151624,-0.081127
Q383G,0.026118,0.038315,0.065061,0.098543,0.023923,0.014943,-0.129731,0.092198,-0.055923,-0.111561,...,-0.089921,0.024331,0.006300,-0.106282,-0.016044,0.057277,0.035095,0.012304,-0.155163,-0.080189
E372S,0.022405,0.040278,0.064326,0.098013,0.021698,0.016157,-0.132202,0.093713,-0.056204,-0.113769,...,-0.089035,0.027543,0.006830,-0.105350,-0.015514,0.056943,0.034686,0.010381,-0.154470,-0.077426
I672M,0.025102,0.038173,0.065741,0.098679,0.022948,0.014640,-0.131515,0.092997,-0.055775,-0.113919,...,-0.088714,0.028256,0.006979,-0.106508,-0.015773,0.058601,0.034031,0.012295,-0.154616,-0.081792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L395V,0.026370,0.035806,0.065681,0.098617,0.022640,0.015231,-0.131188,0.093099,-0.056186,-0.113474,...,-0.088710,0.028330,0.009087,-0.106295,-0.013857,0.057777,0.035376,0.013376,-0.154173,-0.082881
V122A,0.027030,0.039869,0.063317,0.098480,0.023198,0.014039,-0.130743,0.095194,-0.055028,-0.112952,...,-0.090134,0.026029,0.008298,-0.105948,-0.015715,0.058151,0.033629,0.011017,-0.154426,-0.080128
A536T,0.026795,0.037209,0.065507,0.097628,0.024206,0.015874,-0.132245,0.094248,-0.057104,-0.113739,...,-0.089233,0.026587,0.007957,-0.107605,-0.015971,0.057966,0.034426,0.012013,-0.152568,-0.081039
D371W,0.023449,0.037792,0.064348,0.098553,0.021913,0.014635,-0.130733,0.093426,-0.055363,-0.112917,...,-0.088215,0.026495,0.006595,-0.106479,-0.013371,0.058535,0.033883,0.013393,-0.156771,-0.084479


In [7]:
base_path = 'mlv/'
round_file_name_all = 'mlv_Round4_all.xlsx'
mlv_sequence = 'TLNIEDEYRLHETSKEPDVSLGSTWLSDFPQAWAETGGMGLAVRQAPLIIPLKATSTPVSIKQYPMSQEARLGIKPHIQRLLDQGILVPCQSPWNTPLLPVKKPGTNDYRPVQDLREVNKRVEDIHPTVPNPYNLLSGLPPSHQWYTVLDLKDAFFCLRLHPTSQPLFAFEWRDPEMGISGQLTWTRLPQGFKNSPTLFNEALHRDLADFRIQHPDLILLQYVDDLLLAATSELDCQQGTRALLQTLGNLGYRASAKKAQICQKQVKYLGYLLKEGQRWLTEARKETVMGQPTPKTPRQLREFLGKAGFCRLFIPGFAEMAAPLYPLTKPGTLFNWGPDQQKAYQEIKQALLTAPALGLPDLTKPFELFVDEKQGYAKGVLTQKLGPWRRPVAYLSKKLDPVAAGWPPCLRMVAAIAVLTKDAGKLTMGQPLVILAPHAVEALVKQPPDRWLSNARMTHYQALLLDTDRVQFGPVVALNPATLLPLPEEGLQHNCLDILAEAHGTRPDLTDQPLPDADHTWYTDGSSLLQEGQRKAGAAVTTETEVIWAKALPAGTSAQRAELIALTQALKMAEGKKLNVYTDSRYAFATAHIHGEIYRRRGWLTSEGKEIKNKDEILALLKALFLPKRLSIIHCPGHQKGHSAEARGNRMADQAARKAAITETPDTSTLLIENSSP'
experimental_data_all = read_experimental_data(base_path, round_file_name_all, mlv_sequence)
print(experimental_data_all)
df_list = [experimental_data_all]

  Variant  fitness_raw   fitness updated_variant
0     24K     0.790631  0.871999            T24K
1     59T     0.848395  0.935708            V59T
2    116K     0.447482  0.493534           R116K
3    183M     0.806511  0.889514           L183M
4    272N     0.647005  0.713592           L272N
5    319F     0.632159  0.697218           E319F
6    346H     0.789644  0.870911           E346H
7    377Y     0.440696  0.486051           A377Y
8    423P     0.169476  0.186918           A423P
9      WT     0.906688  1.000000              WT    Variant   fitness updated_variant
0      52M  0.814861            L52M
1     501D  0.882929           E501D
2     362V  0.762152           L362V
3     346T  0.861033           E346T
4      13M  0.961874            T13M
5     419M  1.000497           L419M
6     528M  0.913440           L528M
7     333M  0.943176           L333M
8     491H  0.932670           L491H
9     352M  0.898531           L352M
10    671T  0.999982           L671T
11      WT  1.000

In [8]:
iterations_four, labels_four = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_four
embeddings_pd = embeddings
labels_pd = labels_four
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [9]:
df_all
df_all.to_csv('mlv/round4_all_new.csv', index=False)

In [10]:
df_test

Unnamed: 0,variant,y_pred,y_actual
511,L670K,1.104138,
2624,L670N,1.099864,
6966,L671R,1.089488,
5177,L333R,1.080575,
5469,L671V,1.073896,
...,...,...,...
11365,R311H,0.689304,
3468,R456C,0.682787,
2223,R116Q,0.680474,
5160,A423L,0.678149,


In [11]:
df_test.to_csv('mlv/round4_predictions_new.csv', index=False)

## MLV Round 5

In [13]:
# import brenan data
dataset_name = 'mlv_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [14]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
M320I,0.026940,0.037588,0.066551,0.099030,0.021716,0.014472,-0.131924,0.094308,-0.057852,-0.113178,...,-0.089854,0.024963,0.008938,-0.106676,-0.015563,0.057679,0.034707,0.010038,-0.153705,-0.081625
E282D,0.026505,0.037227,0.065662,0.098670,0.023132,0.015304,-0.130645,0.094127,-0.056908,-0.112610,...,-0.089899,0.027284,0.008688,-0.107788,-0.015923,0.059264,0.034138,0.012846,-0.151624,-0.081127
Q383G,0.026118,0.038315,0.065061,0.098543,0.023923,0.014943,-0.129731,0.092198,-0.055923,-0.111561,...,-0.089921,0.024331,0.006300,-0.106282,-0.016044,0.057277,0.035095,0.012304,-0.155163,-0.080189
E372S,0.022405,0.040278,0.064326,0.098013,0.021698,0.016157,-0.132202,0.093713,-0.056204,-0.113769,...,-0.089035,0.027543,0.006830,-0.105350,-0.015514,0.056943,0.034686,0.010381,-0.154470,-0.077426
I672M,0.025102,0.038173,0.065741,0.098679,0.022948,0.014640,-0.131515,0.092997,-0.055775,-0.113919,...,-0.088714,0.028256,0.006979,-0.106508,-0.015773,0.058601,0.034031,0.012295,-0.154616,-0.081792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L395V,0.026370,0.035806,0.065681,0.098617,0.022640,0.015231,-0.131188,0.093099,-0.056186,-0.113474,...,-0.088710,0.028330,0.009087,-0.106295,-0.013857,0.057777,0.035376,0.013376,-0.154173,-0.082881
V122A,0.027030,0.039869,0.063317,0.098480,0.023198,0.014039,-0.130743,0.095194,-0.055028,-0.112952,...,-0.090134,0.026029,0.008298,-0.105948,-0.015715,0.058151,0.033629,0.011017,-0.154426,-0.080128
A536T,0.026795,0.037209,0.065507,0.097628,0.024206,0.015874,-0.132245,0.094248,-0.057104,-0.113739,...,-0.089233,0.026587,0.007957,-0.107605,-0.015971,0.057966,0.034426,0.012013,-0.152568,-0.081039
D371W,0.023449,0.037792,0.064348,0.098553,0.021913,0.014635,-0.130733,0.093426,-0.055363,-0.112917,...,-0.088215,0.026495,0.006595,-0.106479,-0.013371,0.058535,0.033883,0.013393,-0.156771,-0.084479


In [15]:
base_path = 'mlv/'
round_file_name_all = 'mlv_Round4_all.xlsx'
mlv_sequence = 'TLNIEDEYRLHETSKEPDVSLGSTWLSDFPQAWAETGGMGLAVRQAPLIIPLKATSTPVSIKQYPMSQEARLGIKPHIQRLLDQGILVPCQSPWNTPLLPVKKPGTNDYRPVQDLREVNKRVEDIHPTVPNPYNLLSGLPPSHQWYTVLDLKDAFFCLRLHPTSQPLFAFEWRDPEMGISGQLTWTRLPQGFKNSPTLFNEALHRDLADFRIQHPDLILLQYVDDLLLAATSELDCQQGTRALLQTLGNLGYRASAKKAQICQKQVKYLGYLLKEGQRWLTEARKETVMGQPTPKTPRQLREFLGKAGFCRLFIPGFAEMAAPLYPLTKPGTLFNWGPDQQKAYQEIKQALLTAPALGLPDLTKPFELFVDEKQGYAKGVLTQKLGPWRRPVAYLSKKLDPVAAGWPPCLRMVAAIAVLTKDAGKLTMGQPLVILAPHAVEALVKQPPDRWLSNARMTHYQALLLDTDRVQFGPVVALNPATLLPLPEEGLQHNCLDILAEAHGTRPDLTDQPLPDADHTWYTDGSSLLQEGQRKAGAAVTTETEVIWAKALPAGTSAQRAELIALTQALKMAEGKKLNVYTDSRYAFATAHIHGEIYRRRGWLTSEGKEIKNKDEILALLKALFLPKRLSIIHCPGHQKGHSAEARGNRMADQAARKAAITETPDTSTLLIENSSP'
experimental_data_all = read_experimental_data(base_path, round_file_name_all, mlv_sequence)
experimental_data_5 = read_experimental_data(base_path, round_file_name_5, mlv_sequence)
print(experimental_data_all, experimental_data_5)
df_list = [experimental_data_all, experimental_data_5]

   Variant   fitness  Round  updated_variant
0       WT  1.000000       1              WT
1      23K  0.866590       1            S23K
2      58T  0.963272       1            P58T
3      79R  0.594365       1            Q79R
4     115K  0.614792       1           L115K
5     182M  0.931950       1           Q182M
6     271N  0.801496       1           Y271N
7     318F  0.925166       1           A318F
8     345H  0.953547       1           Q345H
9     376Y  0.202246       1           Y376Y
10    422P  0.333216       1           D422P
11     52M  0.581691       2            L52M
12    501D  0.587692       2           E501D
13    362V  0.514356       2           L362V
14    346T  0.690671       2           E346T
15     13M  0.942222       2            T13M
16    419M  1.018405       2           L419M
17    528M  0.880965       2           L528M
18    333M  0.780678       2           L333M
19    491H  0.911881       2           L491H
20    352M  0.848586       2           L352M
21    671T

In [16]:
iterations_five, labels_five = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_five
embeddings_pd = embeddings
labels_pd = labels_five
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [17]:
df_all
df_all.to_csv('mlv/round5_all_new.csv', index=False)

In [18]:
df_test

Unnamed: 0,variant,y_pred,y_actual
259,L670G,1.281129,
2994,L670W,1.237405,
4782,L671K,1.199501,
3886,L670Q,1.177436,
6464,L670E,1.156969,
...,...,...,...
12452,E596S,0.710999,
9608,F313H,0.700558,
6491,R585H,0.685650,
4473,E596P,0.678846,


In [19]:
df_test.to_csv('mlv/round5_predictions_new.csv', index=False)

## MLV Round 6

In [5]:
# import brenan data
dataset_name = 'mlv_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [6]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
M320I,0.026940,0.037588,0.066551,0.099030,0.021716,0.014472,-0.131924,0.094308,-0.057852,-0.113178,...,-0.089854,0.024963,0.008938,-0.106676,-0.015563,0.057679,0.034707,0.010038,-0.153705,-0.081625
E282D,0.026505,0.037227,0.065662,0.098670,0.023132,0.015304,-0.130645,0.094127,-0.056908,-0.112610,...,-0.089899,0.027284,0.008688,-0.107788,-0.015923,0.059264,0.034138,0.012846,-0.151624,-0.081127
Q383G,0.026118,0.038315,0.065061,0.098543,0.023923,0.014943,-0.129731,0.092198,-0.055923,-0.111561,...,-0.089921,0.024331,0.006300,-0.106282,-0.016044,0.057277,0.035095,0.012304,-0.155163,-0.080189
E372S,0.022405,0.040278,0.064326,0.098013,0.021698,0.016157,-0.132202,0.093713,-0.056204,-0.113769,...,-0.089035,0.027543,0.006830,-0.105350,-0.015514,0.056943,0.034686,0.010381,-0.154470,-0.077426
I672M,0.025102,0.038173,0.065741,0.098679,0.022948,0.014640,-0.131515,0.092997,-0.055775,-0.113919,...,-0.088714,0.028256,0.006979,-0.106508,-0.015773,0.058601,0.034031,0.012295,-0.154616,-0.081792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L395V,0.026370,0.035806,0.065681,0.098617,0.022640,0.015231,-0.131188,0.093099,-0.056186,-0.113474,...,-0.088710,0.028330,0.009087,-0.106295,-0.013857,0.057777,0.035376,0.013376,-0.154173,-0.082881
V122A,0.027030,0.039869,0.063317,0.098480,0.023198,0.014039,-0.130743,0.095194,-0.055028,-0.112952,...,-0.090134,0.026029,0.008298,-0.105948,-0.015715,0.058151,0.033629,0.011017,-0.154426,-0.080128
A536T,0.026795,0.037209,0.065507,0.097628,0.024206,0.015874,-0.132245,0.094248,-0.057104,-0.113739,...,-0.089233,0.026587,0.007957,-0.107605,-0.015971,0.057966,0.034426,0.012013,-0.152568,-0.081039
D371W,0.023449,0.037792,0.064348,0.098553,0.021913,0.014635,-0.130733,0.093426,-0.055363,-0.112917,...,-0.088215,0.026495,0.006595,-0.106479,-0.013371,0.058535,0.033883,0.013393,-0.156771,-0.084479


In [7]:
base_path = 'mlv/'
round_file_name_all = 'mlv_Round4_all.xlsx'
round_file_name_5 = 'mlv_Round5.xlsx'
round_file_name_6 = 'mlv_Round6.xlsx'
mlv_sequence = 'TLNIEDEYRLHETSKEPDVSLGSTWLSDFPQAWAETGGMGLAVRQAPLIIPLKATSTPVSIKQYPMSQEARLGIKPHIQRLLDQGILVPCQSPWNTPLLPVKKPGTNDYRPVQDLREVNKRVEDIHPTVPNPYNLLSGLPPSHQWYTVLDLKDAFFCLRLHPTSQPLFAFEWRDPEMGISGQLTWTRLPQGFKNSPTLFNEALHRDLADFRIQHPDLILLQYVDDLLLAATSELDCQQGTRALLQTLGNLGYRASAKKAQICQKQVKYLGYLLKEGQRWLTEARKETVMGQPTPKTPRQLREFLGKAGFCRLFIPGFAEMAAPLYPLTKPGTLFNWGPDQQKAYQEIKQALLTAPALGLPDLTKPFELFVDEKQGYAKGVLTQKLGPWRRPVAYLSKKLDPVAAGWPPCLRMVAAIAVLTKDAGKLTMGQPLVILAPHAVEALVKQPPDRWLSNARMTHYQALLLDTDRVQFGPVVALNPATLLPLPEEGLQHNCLDILAEAHGTRPDLTDQPLPDADHTWYTDGSSLLQEGQRKAGAAVTTETEVIWAKALPAGTSAQRAELIALTQALKMAEGKKLNVYTDSRYAFATAHIHGEIYRRRGWLTSEGKEIKNKDEILALLKALFLPKRLSIIHCPGHQKGHSAEARGNRMADQAARKAAITETPDTSTLLIENSSP'
experimental_data_all = read_experimental_data(base_path, round_file_name_all, mlv_sequence)
experimental_data_5 = read_experimental_data(base_path, round_file_name_5, mlv_sequence)
experimental_data_6 = read_experimental_data(base_path, round_file_name_6, mlv_sequence)
print(experimental_data_all, experimental_data_5, experimental_data_6)
df_list = [experimental_data_all, experimental_data_5, experimental_data_6]

   Variant   fitness  Round  updated_variant
0       WT  1.000000       1              WT
1      23K  0.866590       1            S23K
2      58T  0.963272       1            P58T
3      79R  0.594365       1            Q79R
4     115K  0.614792       1           L115K
5     182M  0.931950       1           Q182M
6     271N  0.801496       1           Y271N
7     318F  0.925166       1           A318F
8     345H  0.953547       1           Q345H
9     376Y  0.202246       1           Y376Y
10    422P  0.333216       1           D422P
11     52M  0.581691       2            L52M
12    501D  0.587692       2           E501D
13    362V  0.514356       2           L362V
14    346T  0.690671       2           E346T
15     13M  0.942222       2            T13M
16    419M  1.018405       2           L419M
17    528M  0.880965       2           L528M
18    333M  0.780678       2           L333M
19    491H  0.911881       2           L491H
20    352M  0.848586       2           L352M
21    671T

In [8]:
iterations_six, labels_six = create_dataframes(df_list, embeddings.index)

# Assuming 'labels_five' is your DataFrame
filtered_labels_six = labels_six.dropna(subset=['fitness'])

# Remove the character before the numeric in each variant
filtered_labels_six['variant'] = filtered_labels_six['variant'].str.replace(r'\D+(\d+)', r'\1', regex=True)

# Write the modified DataFrame to an Excel file
filtered_labels_six.to_excel('mlv_variants_six.xlsx', index=False)

In [36]:
iterations_six, labels_six = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_six
embeddings_pd = embeddings
labels_pd = labels_six
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [37]:
df_all
df_all.to_csv('mlv/round6_all_new.csv', index=False)

In [38]:
df_test

Unnamed: 0,variant,y_pred,y_actual
910,L514D,1.400019,
8995,L514S,1.322503,
6582,K193Q,1.295518,
4939,L514G,1.289387,
4383,A659L,1.270535,
...,...,...,...
10619,D422G,0.641581,
1268,A538E,0.628683,
8702,A538P,0.625711,
7945,A538N,0.606969,


In [39]:
df_test.to_csv('mlv/round6_predictions_new.csv', index=False)

## MLV Round 7

In [4]:
# import brenan data
dataset_name = 'mlv_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
M320I,0.026940,0.037588,0.066551,0.099030,0.021716,0.014472,-0.131924,0.094308,-0.057852,-0.113178,...,-0.089854,0.024963,0.008938,-0.106676,-0.015563,0.057679,0.034707,0.010038,-0.153705,-0.081625
E282D,0.026505,0.037227,0.065662,0.098670,0.023132,0.015304,-0.130645,0.094127,-0.056908,-0.112610,...,-0.089899,0.027284,0.008688,-0.107788,-0.015923,0.059264,0.034138,0.012846,-0.151624,-0.081127
Q383G,0.026118,0.038315,0.065061,0.098543,0.023923,0.014943,-0.129731,0.092198,-0.055923,-0.111561,...,-0.089921,0.024331,0.006300,-0.106282,-0.016044,0.057277,0.035095,0.012304,-0.155163,-0.080189
E372S,0.022405,0.040278,0.064326,0.098013,0.021698,0.016157,-0.132202,0.093713,-0.056204,-0.113769,...,-0.089035,0.027543,0.006830,-0.105350,-0.015514,0.056943,0.034686,0.010381,-0.154470,-0.077426
I672M,0.025102,0.038173,0.065741,0.098679,0.022948,0.014640,-0.131515,0.092997,-0.055775,-0.113919,...,-0.088714,0.028256,0.006979,-0.106508,-0.015773,0.058601,0.034031,0.012295,-0.154616,-0.081792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L395V,0.026370,0.035806,0.065681,0.098617,0.022640,0.015231,-0.131188,0.093099,-0.056186,-0.113474,...,-0.088710,0.028330,0.009087,-0.106295,-0.013857,0.057777,0.035376,0.013376,-0.154173,-0.082881
V122A,0.027030,0.039869,0.063317,0.098480,0.023198,0.014039,-0.130743,0.095194,-0.055028,-0.112952,...,-0.090134,0.026029,0.008298,-0.105948,-0.015715,0.058151,0.033629,0.011017,-0.154426,-0.080128
A536T,0.026795,0.037209,0.065507,0.097628,0.024206,0.015874,-0.132245,0.094248,-0.057104,-0.113739,...,-0.089233,0.026587,0.007957,-0.107605,-0.015971,0.057966,0.034426,0.012013,-0.152568,-0.081039
D371W,0.023449,0.037792,0.064348,0.098553,0.021913,0.014635,-0.130733,0.093426,-0.055363,-0.112917,...,-0.088215,0.026495,0.006595,-0.106479,-0.013371,0.058535,0.033883,0.013393,-0.156771,-0.084479


In [7]:
base_path = 'mlv/'
round_file_name_all = 'mlv_Round7_all.xlsx'
mlv_sequence = 'TLNIEDEYRLHETSKEPDVSLGSTWLSDFPQAWAETGGMGLAVRQAPLIIPLKATSTPVSIKQYPMSQEARLGIKPHIQRLLDQGILVPCQSPWNTPLLPVKKPGTNDYRPVQDLREVNKRVEDIHPTVPNPYNLLSGLPPSHQWYTVLDLKDAFFCLRLHPTSQPLFAFEWRDPEMGISGQLTWTRLPQGFKNSPTLFNEALHRDLADFRIQHPDLILLQYVDDLLLAATSELDCQQGTRALLQTLGNLGYRASAKKAQICQKQVKYLGYLLKEGQRWLTEARKETVMGQPTPKTPRQLREFLGKAGFCRLFIPGFAEMAAPLYPLTKPGTLFNWGPDQQKAYQEIKQALLTAPALGLPDLTKPFELFVDEKQGYAKGVLTQKLGPWRRPVAYLSKKLDPVAAGWPPCLRMVAAIAVLTKDAGKLTMGQPLVILAPHAVEALVKQPPDRWLSNARMTHYQALLLDTDRVQFGPVVALNPATLLPLPEEGLQHNCLDILAEAHGTRPDLTDQPLPDADHTWYTDGSSLLQEGQRKAGAAVTTETEVIWAKALPAGTSAQRAELIALTQALKMAEGKKLNVYTDSRYAFATAHIHGEIYRRRGWLTSEGKEIKNKDEILALLKALFLPKRLSIIHCPGHQKGHSAEARGNRMADQAARKAAITETPDTSTLLIENSSP'
experimental_data_all = read_experimental_data(base_path, round_file_name_all, mlv_sequence)
print(experimental_data_all)
df_list = [experimental_data_all]

   Variant   fitness updated_variant
0      23K  0.866590            S23K
1      58T  0.963272            P58T
2      79R  0.594365            Q79R
3     115K  0.614792           L115K
4     182M  0.931950           Q182M
..     ...       ...             ...
76    660S  1.694228           A660S
77    514G  1.597760           L514G
78    514Q  1.612595           L514Q
79    660W  1.553562           A660W
80      WT  1.000000              WT

[81 rows x 3 columns]


In [8]:
iterations_seven, labels_seven = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_seven
embeddings_pd = embeddings
labels_pd = labels_seven
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [9]:
df_all
df_all.to_csv('mlv/round7_all_new.csv', index=False)

In [10]:
df_test

Unnamed: 0,variant,y_pred,y_actual
8238,L514R,1.262839,
3415,P513G,1.238767,
8995,L514S,1.232648,
1074,A660K,1.223723,
534,A660Y,1.218121,
...,...,...,...
12551,V444S,0.731534,
3974,Y598H,0.729129,
9608,F313H,0.711431,
5270,I212P,0.687774,


In [11]:
df_test.to_csv('mlv/round7_predictions_new.csv', index=False)

## MLV Round Multi 1

In [20]:
# import brenan data
dataset_name = 'mlv_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
M320I,0.026940,0.037588,0.066551,0.099030,0.021716,0.014472,-0.131924,0.094308,-0.057852,-0.113178,...,-0.089854,0.024963,0.008938,-0.106676,-0.015563,0.057679,0.034707,0.010038,-0.153705,-0.081625
E282D,0.026505,0.037227,0.065662,0.098670,0.023132,0.015304,-0.130645,0.094127,-0.056908,-0.112610,...,-0.089899,0.027284,0.008688,-0.107788,-0.015923,0.059264,0.034138,0.012846,-0.151624,-0.081127
Q383G,0.026118,0.038315,0.065061,0.098543,0.023923,0.014943,-0.129731,0.092198,-0.055923,-0.111561,...,-0.089921,0.024331,0.006300,-0.106282,-0.016044,0.057277,0.035095,0.012304,-0.155163,-0.080189
E372S,0.022405,0.040278,0.064326,0.098013,0.021698,0.016157,-0.132202,0.093713,-0.056204,-0.113769,...,-0.089035,0.027543,0.006830,-0.105350,-0.015514,0.056943,0.034686,0.010381,-0.154470,-0.077426
I672M,0.025102,0.038173,0.065741,0.098679,0.022948,0.014640,-0.131515,0.092997,-0.055775,-0.113919,...,-0.088714,0.028256,0.006979,-0.106508,-0.015773,0.058601,0.034031,0.012295,-0.154616,-0.081792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L395V,0.026370,0.035806,0.065681,0.098617,0.022640,0.015231,-0.131188,0.093099,-0.056186,-0.113474,...,-0.088710,0.028330,0.009087,-0.106295,-0.013857,0.057777,0.035376,0.013376,-0.154173,-0.082881
V122A,0.027030,0.039869,0.063317,0.098480,0.023198,0.014039,-0.130743,0.095194,-0.055028,-0.112952,...,-0.090134,0.026029,0.008298,-0.105948,-0.015715,0.058151,0.033629,0.011017,-0.154426,-0.080128
A536T,0.026795,0.037209,0.065507,0.097628,0.024206,0.015874,-0.132245,0.094248,-0.057104,-0.113739,...,-0.089233,0.026587,0.007957,-0.107605,-0.015971,0.057966,0.034426,0.012013,-0.152568,-0.081039
D371W,0.023449,0.037792,0.064348,0.098553,0.021913,0.014635,-0.130733,0.093426,-0.055363,-0.112917,...,-0.088215,0.026495,0.006595,-0.106479,-0.013371,0.058535,0.033883,0.013393,-0.156771,-0.084479


In [21]:
dataset_name = 'mlv_2nd_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings_2nd = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
embeddings_2nd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT Wild-type sequence,0.026001,0.037897,0.066144,0.098851,0.022965,0.015054,-0.132152,0.093454,-0.056323,-0.11346,...,-0.089845,0.026988,0.007822,-0.106862,-0.014972,0.058078,0.034002,0.011993,-0.153669,-0.081689
L419M_S675T,0.025847,0.037717,0.064906,0.097908,0.021871,0.014418,-0.132551,0.093725,-0.055711,-0.113142,...,-0.089841,0.026318,0.006344,-0.105601,-0.016114,0.058138,0.033815,0.013106,-0.154374,-0.080386
L419M_L671R,0.02587,0.037201,0.065918,0.098843,0.020897,0.014412,-0.133209,0.09308,-0.055894,-0.112526,...,-0.090686,0.027531,0.008045,-0.106092,-0.015871,0.056697,0.033848,0.012675,-0.155744,-0.080776
L419M_L491M,0.025416,0.037745,0.065904,0.09813,0.021897,0.014936,-0.130882,0.092876,-0.054528,-0.113754,...,-0.090225,0.026753,0.007899,-0.105969,-0.015015,0.057167,0.03454,0.01445,-0.153632,-0.080218
L419M_L333I,0.025982,0.037646,0.065845,0.098334,0.02177,0.015081,-0.132362,0.093285,-0.056159,-0.11312,...,-0.090413,0.026604,0.005812,-0.105464,-0.014759,0.057922,0.033617,0.012584,-0.15467,-0.079327
L419M_L514M,0.025798,0.037083,0.064738,0.098669,0.0219,0.014714,-0.132058,0.093561,-0.054708,-0.113041,...,-0.090422,0.028051,0.00662,-0.106047,-0.015275,0.058235,0.034655,0.013468,-0.154771,-0.080684
L419M_L670K,0.02601,0.036224,0.065357,0.099069,0.021127,0.014284,-0.133128,0.092272,-0.055584,-0.112423,...,-0.091713,0.026523,0.007422,-0.105259,-0.014692,0.057442,0.03423,0.012965,-0.156422,-0.08029
L419M_P485V,0.025755,0.03854,0.065753,0.098505,0.021759,0.016041,-0.131912,0.094041,-0.05498,-0.112566,...,-0.089284,0.027252,0.008939,-0.106461,-0.015337,0.05871,0.034242,0.013361,-0.154478,-0.080639
L419M_Q492R,0.026145,0.03656,0.06575,0.098417,0.021248,0.014498,-0.132465,0.093251,-0.056038,-0.110748,...,-0.090389,0.026382,0.006661,-0.104973,-0.016836,0.056282,0.035272,0.013869,-0.156775,-0.080093
L419M_S668G,0.02579,0.037056,0.065716,0.099086,0.022323,0.014679,-0.131006,0.094164,-0.056533,-0.112654,...,-0.089773,0.028229,0.007913,-0.106107,-0.015596,0.057568,0.033906,0.013226,-0.155147,-0.079036


In [22]:
dataset_name = 'mlv_3rd_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings_3rd = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
embeddings_3rd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT Wild-type sequence,0.026001,0.037897,0.066144,0.098851,0.022965,0.015054,-0.132152,0.093454,-0.056323,-0.113460,...,-0.089845,0.026988,0.007822,-0.106862,-0.014972,0.058078,0.034002,0.011993,-0.153669,-0.081689
L419M_S675T_L671R,0.025695,0.037402,0.065491,0.098337,0.020982,0.014274,-0.133485,0.093568,-0.056353,-0.112716,...,-0.090498,0.026493,0.007715,-0.105473,-0.016324,0.056843,0.033331,0.012736,-0.156047,-0.080904
L419M_S675T_L491M,0.025397,0.037759,0.065368,0.097646,0.021961,0.014504,-0.131147,0.093297,-0.054955,-0.114004,...,-0.089837,0.025768,0.007613,-0.105478,-0.015674,0.057222,0.034036,0.014289,-0.153925,-0.079935
L419M_S675T_L333I,0.025973,0.037670,0.065315,0.097843,0.021822,0.014648,-0.132619,0.093712,-0.056595,-0.113379,...,-0.090026,0.025629,0.005532,-0.104967,-0.015434,0.057996,0.033098,0.012431,-0.154956,-0.079039
L419M_S675T_L514M,0.025777,0.037106,0.064211,0.098178,0.021966,0.014281,-0.132314,0.093989,-0.055138,-0.113298,...,-0.090029,0.027068,0.006330,-0.105556,-0.015938,0.058296,0.034142,0.013309,-0.155055,-0.080403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L514M_Q492R_S668G,0.026128,0.035556,0.065986,0.099842,0.023073,0.014371,-0.130813,0.094452,-0.057697,-0.111268,...,-0.089745,0.027792,0.009146,-0.105679,-0.016313,0.055916,0.034901,0.012775,-0.158208,-0.079475
L670K_P485V_Q492R,0.026329,0.036097,0.066691,0.099740,0.021767,0.015496,-0.132475,0.093095,-0.057147,-0.110373,...,-0.090753,0.025105,0.010941,-0.105086,-0.015713,0.056371,0.034827,0.012481,-0.159202,-0.080515
L670K_P485V_S668G,0.025909,0.036470,0.066401,0.100431,0.022513,0.015988,-0.131301,0.094217,-0.057348,-0.112265,...,-0.089545,0.027329,0.012205,-0.106784,-0.014317,0.057574,0.033496,0.011568,-0.156975,-0.079450
L670K_Q492R_S668G,0.026269,0.034463,0.066371,0.100405,0.021993,0.014387,-0.131872,0.093459,-0.058462,-0.110503,...,-0.090701,0.026422,0.009934,-0.105285,-0.015822,0.055089,0.034560,0.012073,-0.159267,-0.078892


In [23]:
dataset_name = 'mlv_4th_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings_4th = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
embeddings_4th

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT Wild-type sequence,0.026001,0.037897,0.066144,0.098851,0.022965,0.015054,-0.132152,0.093454,-0.056323,-0.113460,...,-0.089845,0.026988,0.007822,-0.106862,-0.014972,0.058078,0.034002,0.011993,-0.153669,-0.081689
L419M_S675T_L671R_L491M,0.025252,0.037441,0.065946,0.098083,0.021065,0.014357,-0.132077,0.093136,-0.055597,-0.113572,...,-0.090500,0.025950,0.008987,-0.105346,-0.015883,0.055925,0.033558,0.013907,-0.155601,-0.080448
L419M_S675T_L671R_L333I,0.025838,0.037346,0.065905,0.098264,0.020937,0.014510,-0.133553,0.093552,-0.057248,-0.112951,...,-0.090685,0.025800,0.006913,-0.104841,-0.015643,0.056702,0.032612,0.012066,-0.156620,-0.079544
L419M_S675T_L671R_L514M,0.025648,0.036793,0.064787,0.098600,0.021080,0.014152,-0.133243,0.093822,-0.055774,-0.112863,...,-0.090686,0.027264,0.007709,-0.105437,-0.016155,0.057008,0.033662,0.012933,-0.156721,-0.080907
L419M_S675T_L671R_L670K,0.026019,0.035959,0.065572,0.098631,0.021143,0.013803,-0.134524,0.092408,-0.057241,-0.112832,...,-0.091621,0.026142,0.008511,-0.104633,-0.016341,0.057012,0.033753,0.012447,-0.157973,-0.080144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L514M_L670K_P485V_Q492R,0.026274,0.035501,0.065913,0.100053,0.021915,0.015364,-0.132202,0.093261,-0.056484,-0.110546,...,-0.090935,0.025944,0.010813,-0.105008,-0.015476,0.056582,0.035194,0.012705,-0.159890,-0.080587
L514M_L670K_P485V_S668G,0.025875,0.035864,0.065641,0.100729,0.022644,0.015853,-0.131048,0.094437,-0.056772,-0.112429,...,-0.089718,0.028175,0.012087,-0.106724,-0.014096,0.057791,0.033849,0.011782,-0.157667,-0.079484
L514M_L670K_Q492R_S668G,0.026206,0.033894,0.065590,0.100681,0.022123,0.014248,-0.131608,0.093653,-0.057776,-0.110654,...,-0.090921,0.027225,0.009889,-0.105202,-0.015625,0.055272,0.034918,0.012303,-0.159991,-0.078946
L514M_P485V_Q492R_S668G,0.026061,0.036369,0.066329,0.100068,0.023116,0.015684,-0.130080,0.095155,-0.057447,-0.110831,...,-0.088992,0.027644,0.011362,-0.105880,-0.016371,0.056732,0.034907,0.012900,-0.158627,-0.079315


In [24]:
dataset_name = 'mlv_5th_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings_5th = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
embeddings_5th

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT Wild-type sequence,0.026001,0.037897,0.066144,0.098851,0.022965,0.015054,-0.132152,0.093454,-0.056323,-0.113460,...,-0.089845,0.026988,0.007822,-0.106862,-0.014972,0.058078,0.034002,0.011993,-0.153669,-0.081689
L419M_S675T_L671R_L491M_L333I,0.025402,0.037387,0.066366,0.098005,0.021014,0.014589,-0.132137,0.093114,-0.056490,-0.113803,...,-0.090691,0.025251,0.008189,-0.104713,-0.015202,0.055791,0.032835,0.013237,-0.156173,-0.079086
L419M_S675T_L671R_L491M_L514M,0.025193,0.036807,0.065219,0.098347,0.021164,0.014230,-0.131806,0.093377,-0.055059,-0.113738,...,-0.090700,0.026757,0.008963,-0.105267,-0.015705,0.056073,0.033869,0.014126,-0.156260,-0.080430
L419M_S675T_L671R_L491M_L670K,0.025584,0.036006,0.066017,0.098382,0.021218,0.013884,-0.133114,0.091964,-0.056481,-0.113679,...,-0.091627,0.025592,0.009779,-0.104501,-0.015889,0.056093,0.033980,0.013622,-0.157532,-0.079695
L419M_S675T_L671R_L491M_P485V,0.025143,0.038308,0.066079,0.098181,0.021017,0.015547,-0.131650,0.093747,-0.055424,-0.113323,...,-0.089461,0.026039,0.011199,-0.105614,-0.015680,0.056727,0.033233,0.013816,-0.156284,-0.080411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L333I_L514M_L670K_P485V_S668G,0.026013,0.035809,0.066028,0.100676,0.022613,0.016065,-0.131099,0.094389,-0.057645,-0.112674,...,-0.089897,0.027461,0.011283,-0.106067,-0.013372,0.057628,0.033164,0.011050,-0.158263,-0.078131
L333I_L514M_L670K_Q492R_S668G,0.026355,0.033842,0.066014,0.100623,0.022104,0.014491,-0.131666,0.093608,-0.058646,-0.110886,...,-0.091122,0.026517,0.009104,-0.104550,-0.014925,0.055105,0.034215,0.011605,-0.160562,-0.077589
L333I_L514M_P485V_Q492R_S668G,0.026211,0.036328,0.066721,0.100031,0.023086,0.015910,-0.130138,0.095117,-0.058331,-0.111074,...,-0.089168,0.026941,0.010574,-0.105223,-0.015651,0.056577,0.034210,0.012195,-0.159224,-0.077964
L333I_L670K_P485V_Q492R_S668G,0.026325,0.035227,0.067092,0.100538,0.021974,0.015949,-0.131222,0.094126,-0.059065,-0.110247,...,-0.090137,0.025527,0.011428,-0.104838,-0.015189,0.055715,0.033849,0.011487,-0.160337,-0.077391


In [25]:
dataset_name = 'mlv_6th_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings_6th = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
embeddings_6th

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT Wild-type sequence,0.026001,0.037897,0.066144,0.098851,0.022965,0.015054,-0.132152,0.093454,-0.056323,-0.113460,...,-0.089845,0.026988,0.007822,-0.106862,-0.014972,0.058078,0.034002,0.011993,-0.153669,-0.081689
L419M_S675T_L671R_L491M_L333I_L514M,0.025362,0.036754,0.065645,0.098268,0.021115,0.014456,-0.131864,0.093352,-0.055956,-0.113968,...,-0.090895,0.026051,0.008174,-0.104625,-0.015025,0.055937,0.033152,0.013456,-0.156827,-0.079067
L419M_S675T_L671R_L491M_L333I_L670K,0.025741,0.035942,0.066431,0.098292,0.021167,0.014119,-0.133166,0.091933,-0.057365,-0.113907,...,-0.091835,0.024881,0.008976,-0.103871,-0.015212,0.055965,0.033272,0.012944,-0.158091,-0.078332
L419M_S675T_L671R_L491M_L333I_P485V,0.025295,0.038254,0.066471,0.098113,0.020957,0.015762,-0.131708,0.093714,-0.056321,-0.113557,...,-0.089644,0.025327,0.010394,-0.104971,-0.014985,0.056593,0.032524,0.013115,-0.156874,-0.079048
L419M_S675T_L671R_L491M_L333I_Q492R,0.025517,0.036343,0.066525,0.098003,0.020509,0.014152,-0.132513,0.092974,-0.057199,-0.112068,...,-0.090714,0.024647,0.008120,-0.103525,-0.016343,0.053909,0.033667,0.013646,-0.158928,-0.078455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L491M_L333I_L514M_L670K_Q492R_S668G,0.025758,0.033949,0.066238,0.100344,0.022205,0.014469,-0.130432,0.093100,-0.057834,-0.112150,...,-0.091008,0.026335,0.010245,-0.104333,-0.014195,0.054077,0.034297,0.012606,-0.160199,-0.077071
L491M_L333I_L514M_P485V_Q492R_S668G,0.025622,0.036461,0.066744,0.099714,0.023183,0.015910,-0.128866,0.094519,-0.057659,-0.112331,...,-0.088992,0.026825,0.011616,-0.104943,-0.014904,0.055768,0.034165,0.013000,-0.159125,-0.077371
L491M_L333I_L670K_P485V_Q492R_S668G,0.025748,0.035390,0.067138,0.100226,0.022072,0.015959,-0.129967,0.093534,-0.058355,-0.111482,...,-0.089953,0.025400,0.012473,-0.104601,-0.014448,0.054909,0.033815,0.012276,-0.160246,-0.076849
L491M_L514M_L670K_P485V_Q492R_S668G,0.025550,0.034816,0.065912,0.100578,0.022255,0.015579,-0.129635,0.093741,-0.056850,-0.111438,...,-0.089950,0.026992,0.013133,-0.105113,-0.014931,0.055275,0.034872,0.013237,-0.160348,-0.078247


In [26]:
dataset_name = 'mlv_7th_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings_7th = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
embeddings_7th

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT Wild-type sequence,0.026001,0.037897,0.066144,0.098851,0.022965,0.015054,-0.132152,0.093454,-0.056323,-0.113460,...,-0.089845,0.026988,0.007822,-0.106862,-0.014972,0.058078,0.034002,0.011993,-0.153669,-0.081689
L419M_S675T_L671R_L491M_L333I_L514M_L670K,0.025706,0.035312,0.065683,0.098574,0.021264,0.013980,-0.132882,0.092158,-0.056804,-0.114075,...,-0.092041,0.025677,0.008932,-0.103759,-0.015019,0.056116,0.033607,0.013169,-0.158763,-0.078329
L419M_S675T_L671R_L491M_L333I_L514M_P485V,0.025268,0.037601,0.065721,0.098393,0.021084,0.015625,-0.131448,0.093914,-0.055804,-0.113749,...,-0.089808,0.026181,0.010309,-0.104872,-0.014767,0.056787,0.032851,0.013317,-0.157501,-0.079055
L419M_S675T_L671R_L491M_L333I_L514M_Q492R,0.025453,0.035718,0.065762,0.098267,0.020615,0.014007,-0.132236,0.093150,-0.056581,-0.112240,...,-0.090928,0.025452,0.008099,-0.103397,-0.016149,0.054070,0.034011,0.013863,-0.159600,-0.078480
L419M_S675T_L671R_L491M_L333I_L514M_S668G,0.025278,0.036179,0.066149,0.098803,0.021517,0.014397,-0.130506,0.093993,-0.057320,-0.113992,...,-0.090334,0.027054,0.009495,-0.104708,-0.015086,0.055343,0.033017,0.013504,-0.157636,-0.077267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L671R_L491M_L333I_L514M_P485V_Q492R_S668G,0.025565,0.036053,0.067455,0.099867,0.022083,0.015656,-0.129711,0.094034,-0.058402,-0.112261,...,-0.089354,0.027152,0.013062,-0.104988,-0.015206,0.054363,0.034038,0.012441,-0.160453,-0.077405
L671R_L491M_L333I_L670K_P485V_Q492R_S668G,0.025652,0.035175,0.068161,0.099983,0.021991,0.015543,-0.130931,0.093088,-0.059850,-0.112147,...,-0.090100,0.026274,0.013736,-0.104629,-0.015424,0.054434,0.034095,0.011590,-0.161676,-0.076515
L671R_L491M_L514M_L670K_P485V_Q492R_S668G,0.025450,0.034612,0.066935,0.100333,0.022172,0.015169,-0.130591,0.093292,-0.058343,-0.112114,...,-0.090086,0.027864,0.014395,-0.105151,-0.015906,0.054801,0.035162,0.012543,-0.161780,-0.077915
L671R_L333I_L514M_L670K_P485V_Q492R_S668G,0.026188,0.034412,0.067319,0.100581,0.022057,0.015400,-0.131911,0.093841,-0.059896,-0.111090,...,-0.090441,0.027264,0.012569,-0.104778,-0.015950,0.055474,0.034505,0.011030,-0.162459,-0.077119


In [27]:
# convert the indices of embeddings_2nd, embeddings_3rd, embeddings_4th to strings
embeddings.columns = embeddings.columns.astype(str)
embeddings_2nd.columns = embeddings_2nd.columns.astype(str)
embeddings_3rd.columns = embeddings_3rd.columns.astype(str)
embeddings_4th.columns = embeddings_4th.columns.astype(str)
embeddings_5th.columns = embeddings_5th.columns.astype(str)
embeddings_6th.columns = embeddings_6th.columns.astype(str)
embeddings_7th.columns = embeddings_7th.columns.astype(str)

# remove row that is WT Wild-type sequence
embeddings_2nd = embeddings_2nd.drop('WT Wild-type sequence')
embeddings_3rd = embeddings_3rd.drop('WT Wild-type sequence')
embeddings_4th = embeddings_4th.drop('WT Wild-type sequence')
embeddings_5th = embeddings_5th.drop('WT Wild-type sequence')
embeddings_6th = embeddings_6th.drop('WT Wild-type sequence')
embeddings_7th = embeddings_7th.drop('WT Wild-type sequence')

# Concatenate along rows
embeddings_full = pd.concat([embeddings, embeddings_2nd, embeddings_4th, embeddings_5th, embeddings_6th, embeddings_7th], axis=0)
embeddings_full

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
M320I,0.026940,0.037588,0.066551,0.099030,0.021716,0.014472,-0.131924,0.094308,-0.057852,-0.113178,...,-0.089854,0.024963,0.008938,-0.106676,-0.015563,0.057679,0.034707,0.010038,-0.153705,-0.081625
E282D,0.026505,0.037227,0.065662,0.098670,0.023132,0.015304,-0.130645,0.094127,-0.056908,-0.112610,...,-0.089899,0.027284,0.008688,-0.107788,-0.015923,0.059264,0.034138,0.012846,-0.151624,-0.081127
Q383G,0.026118,0.038315,0.065061,0.098543,0.023923,0.014943,-0.129731,0.092198,-0.055923,-0.111561,...,-0.089921,0.024331,0.006300,-0.106282,-0.016044,0.057277,0.035095,0.012304,-0.155163,-0.080189
E372S,0.022405,0.040278,0.064326,0.098013,0.021698,0.016157,-0.132202,0.093713,-0.056204,-0.113769,...,-0.089035,0.027543,0.006830,-0.105350,-0.015514,0.056943,0.034686,0.010381,-0.154470,-0.077426
I672M,0.025102,0.038173,0.065741,0.098679,0.022948,0.014640,-0.131515,0.092997,-0.055775,-0.113919,...,-0.088714,0.028256,0.006979,-0.106508,-0.015773,0.058601,0.034031,0.012295,-0.154616,-0.081792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L671R_L491M_L333I_L514M_P485V_Q492R_S668G,0.025565,0.036053,0.067455,0.099867,0.022083,0.015656,-0.129711,0.094034,-0.058402,-0.112261,...,-0.089354,0.027152,0.013062,-0.104988,-0.015206,0.054363,0.034038,0.012441,-0.160453,-0.077405
L671R_L491M_L333I_L670K_P485V_Q492R_S668G,0.025652,0.035175,0.068161,0.099983,0.021991,0.015543,-0.130931,0.093088,-0.059850,-0.112147,...,-0.090100,0.026274,0.013736,-0.104629,-0.015424,0.054434,0.034095,0.011590,-0.161676,-0.076515
L671R_L491M_L514M_L670K_P485V_Q492R_S668G,0.025450,0.034612,0.066935,0.100333,0.022172,0.015169,-0.130591,0.093292,-0.058343,-0.112114,...,-0.090086,0.027864,0.014395,-0.105151,-0.015906,0.054801,0.035162,0.012543,-0.161780,-0.077915
L671R_L333I_L514M_L670K_P485V_Q492R_S668G,0.026188,0.034412,0.067319,0.100581,0.022057,0.015400,-0.131911,0.093841,-0.059896,-0.111090,...,-0.090441,0.027264,0.012569,-0.104778,-0.015950,0.055474,0.034505,0.011030,-0.162459,-0.077119


In [28]:
base_path = 'mlv/'
round_file_name_all = 'mlv_Round4_all.xlsx'
mlv_sequence = 'TLNIEDEYRLHETSKEPDVSLGSTWLSDFPQAWAETGGMGLAVRQAPLIIPLKATSTPVSIKQYPMSQEARLGIKPHIQRLLDQGILVPCQSPWNTPLLPVKKPGTNDYRPVQDLREVNKRVEDIHPTVPNPYNLLSGLPPSHQWYTVLDLKDAFFCLRLHPTSQPLFAFEWRDPEMGISGQLTWTRLPQGFKNSPTLFNEALHRDLADFRIQHPDLILLQYVDDLLLAATSELDCQQGTRALLQTLGNLGYRASAKKAQICQKQVKYLGYLLKEGQRWLTEARKETVMGQPTPKTPRQLREFLGKAGFCRLFIPGFAEMAAPLYPLTKPGTLFNWGPDQQKAYQEIKQALLTAPALGLPDLTKPFELFVDEKQGYAKGVLTQKLGPWRRPVAYLSKKLDPVAAGWPPCLRMVAAIAVLTKDAGKLTMGQPLVILAPHAVEALVKQPPDRWLSNARMTHYQALLLDTDRVQFGPVVALNPATLLPLPEEGLQHNCLDILAEAHGTRPDLTDQPLPDADHTWYTDGSSLLQEGQRKAGAAVTTETEVIWAKALPAGTSAQRAELIALTQALKMAEGKKLNVYTDSRYAFATAHIHGEIYRRRGWLTSEGKEIKNKDEILALLKALFLPKRLSIIHCPGHQKGHSAEARGNRMADQAARKAAITETPDTSTLLIENSSP'
experimental_data_all = read_experimental_data(base_path, round_file_name_all, mlv_sequence)
print(experimental_data_all)
df_list = [experimental_data_all]

   Variant   fitness  Round  updated_variant
0       WT  1.000000       1              WT
1      23K  0.866590       1            S23K
2      58T  0.963272       1            P58T
3      79R  0.594365       1            Q79R
4     115K  0.614792       1           L115K
5     182M  0.931950       1           Q182M
6     271N  0.801496       1           Y271N
7     318F  0.925166       1           A318F
8     345H  0.953547       1           Q345H
9     376Y  0.202246       1           Y376Y
10    422P  0.333216       1           D422P
11     52M  0.581691       2            L52M
12    501D  0.587692       2           E501D
13    362V  0.514356       2           L362V
14    346T  0.690671       2           E346T
15     13M  0.942222       2            T13M
16    419M  1.018405       2           L419M
17    528M  0.880965       2           L528M
18    333M  0.780678       2           L333M
19    491H  0.911881       2           L491H
20    352M  0.848586       2           L352M
21    671T

In [29]:
iterations_multi_one, labels_multi_one = create_dataframes(df_list, embeddings_full.index)

iteration_old = iterations_multi_one
embeddings_pd = embeddings_full
labels_pd = labels_multi_one
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [30]:
df_all
df_all.to_csv('mlv/round1_multi_all_new.csv', index=False)

In [31]:
df_test

Unnamed: 0,variant,y_pred,y_actual
12884,L671R_L670K,1.376500,
12905,L670K_S668G,1.368531,
13342,L671R_L333I_L670K_P485V_S668G,1.338412,
13074,L671R_L514M_L670K_P485V,1.335860,
13076,L671R_L514M_L670K_S668G,1.335152,
...,...,...,...
2515,A561C,0.742866,
6805,L167V,0.742718,
10497,L52A,0.741292,
11895,I261V,0.737337,


In [32]:
df_test.to_csv('mlv/round1_multi_predictions_new.csv', index=False)

## MLV Round Multi 2

In [4]:
# import brenan data
dataset_name = 'mlv_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
M320I,0.026940,0.037588,0.066551,0.099030,0.021716,0.014472,-0.131924,0.094308,-0.057852,-0.113178,...,-0.089854,0.024963,0.008938,-0.106676,-0.015563,0.057679,0.034707,0.010038,-0.153705,-0.081625
E282D,0.026505,0.037227,0.065662,0.098670,0.023132,0.015304,-0.130645,0.094127,-0.056908,-0.112610,...,-0.089899,0.027284,0.008688,-0.107788,-0.015923,0.059264,0.034138,0.012846,-0.151624,-0.081127
Q383G,0.026118,0.038315,0.065061,0.098543,0.023923,0.014943,-0.129731,0.092198,-0.055923,-0.111561,...,-0.089921,0.024331,0.006300,-0.106282,-0.016044,0.057277,0.035095,0.012304,-0.155163,-0.080189
E372S,0.022405,0.040278,0.064326,0.098013,0.021698,0.016157,-0.132202,0.093713,-0.056204,-0.113769,...,-0.089035,0.027543,0.006830,-0.105350,-0.015514,0.056943,0.034686,0.010381,-0.154470,-0.077426
I672M,0.025102,0.038173,0.065741,0.098679,0.022948,0.014640,-0.131515,0.092997,-0.055775,-0.113919,...,-0.088714,0.028256,0.006979,-0.106508,-0.015773,0.058601,0.034031,0.012295,-0.154616,-0.081792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L395V,0.026370,0.035806,0.065681,0.098617,0.022640,0.015231,-0.131188,0.093099,-0.056186,-0.113474,...,-0.088710,0.028330,0.009087,-0.106295,-0.013857,0.057777,0.035376,0.013376,-0.154173,-0.082881
V122A,0.027030,0.039869,0.063317,0.098480,0.023198,0.014039,-0.130743,0.095194,-0.055028,-0.112952,...,-0.090134,0.026029,0.008298,-0.105948,-0.015715,0.058151,0.033629,0.011017,-0.154426,-0.080128
A536T,0.026795,0.037209,0.065507,0.097628,0.024206,0.015874,-0.132245,0.094248,-0.057104,-0.113739,...,-0.089233,0.026587,0.007957,-0.107605,-0.015971,0.057966,0.034426,0.012013,-0.152568,-0.081039
D371W,0.023449,0.037792,0.064348,0.098553,0.021913,0.014635,-0.130733,0.093426,-0.055363,-0.112917,...,-0.088215,0.026495,0.006595,-0.106479,-0.013371,0.058535,0.033883,0.013393,-0.156771,-0.084479


In [5]:
dataset_name = 'mlv_2nd_2_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings_2nd = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
embeddings_2nd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT Wild-type sequence,0.026001,0.037897,0.066144,0.098851,0.022965,0.015054,-0.132152,0.093454,-0.056323,-0.113460,...,-0.089845,0.026988,0.007822,-0.106862,-0.014972,0.058078,0.034002,0.011993,-0.153669,-0.081689
L670H_L514E,0.026147,0.036453,0.065666,0.098268,0.022502,0.014186,-0.133652,0.093840,-0.057296,-0.113168,...,-0.089980,0.027333,0.009056,-0.106311,-0.015974,0.058074,0.033203,0.011890,-0.155281,-0.080340
L670H_T669G,0.026574,0.036857,0.066357,0.098588,0.023008,0.014347,-0.132414,0.093425,-0.058449,-0.113251,...,-0.089973,0.028398,0.010604,-0.107426,-0.015411,0.057403,0.034410,0.011929,-0.154520,-0.080406
L670H_A660G,0.026058,0.035763,0.066599,0.097709,0.023039,0.014683,-0.134114,0.093471,-0.056550,-0.113128,...,-0.091054,0.026862,0.010709,-0.107544,-0.015766,0.055952,0.033405,0.012137,-0.153866,-0.079903
L670H_L671W,0.024918,0.037422,0.066137,0.099247,0.022040,0.014408,-0.133644,0.092739,-0.057286,-0.113249,...,-0.089721,0.027719,0.009264,-0.106772,-0.015011,0.057543,0.033274,0.011814,-0.155005,-0.082057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L419M_S675T,0.025847,0.037717,0.064906,0.097908,0.021871,0.014418,-0.132551,0.093725,-0.055711,-0.113142,...,-0.089841,0.026318,0.006344,-0.105601,-0.016114,0.058138,0.033815,0.013106,-0.154374,-0.080386
L419M_L670I,0.026070,0.037405,0.065742,0.098404,0.022122,0.014628,-0.132472,0.093438,-0.055877,-0.112952,...,-0.090832,0.026739,0.006886,-0.106387,-0.014918,0.057629,0.033946,0.012996,-0.154387,-0.080519
L671M_S675T,0.025417,0.037842,0.066221,0.098661,0.022830,0.014609,-0.132972,0.093504,-0.056911,-0.113862,...,-0.090107,0.026466,0.007642,-0.106089,-0.014564,0.057258,0.033294,0.011713,-0.154338,-0.081550
L671M_L670I,0.025890,0.037451,0.066936,0.099253,0.023057,0.014833,-0.132475,0.093078,-0.057236,-0.113802,...,-0.091003,0.026626,0.008330,-0.106873,-0.013831,0.056826,0.033407,0.011861,-0.154285,-0.081364


In [6]:
dataset_name = 'mlv_3rd_2_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings_3rd = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
embeddings_3rd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT Wild-type sequence,0.026001,0.037897,0.066144,0.098851,0.022965,0.015054,-0.132152,0.093454,-0.056323,-0.113460,...,-0.089845,0.026988,0.007822,-0.106862,-0.014972,0.058078,0.034002,0.011993,-0.153669,-0.081689
L670H_L514E_T669G,0.026527,0.036128,0.066071,0.098047,0.022751,0.014131,-0.133010,0.094271,-0.058937,-0.112818,...,-0.089847,0.029059,0.010937,-0.106864,-0.016141,0.057791,0.033849,0.011779,-0.155869,-0.079238
L670H_L514E_A660G,0.026050,0.035040,0.066263,0.097099,0.022739,0.014493,-0.134738,0.094311,-0.057000,-0.112698,...,-0.090934,0.027556,0.011047,-0.106981,-0.016522,0.056350,0.032791,0.011986,-0.155221,-0.078767
L670H_L514E_L671W,0.024850,0.036714,0.065843,0.098700,0.021773,0.014196,-0.134206,0.093601,-0.057784,-0.112838,...,-0.089568,0.028389,0.009566,-0.106205,-0.015743,0.057911,0.032702,0.011655,-0.156332,-0.080914
L670H_L514E_A659M,0.026605,0.035398,0.066844,0.097259,0.022177,0.013868,-0.133274,0.095375,-0.056360,-0.113791,...,-0.091759,0.027781,0.010110,-0.107839,-0.016624,0.056501,0.032723,0.012745,-0.155839,-0.080521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L491M_S675T_L670I,0.025782,0.037696,0.066416,0.098158,0.023425,0.014425,-0.131270,0.093694,-0.056518,-0.114666,...,-0.090123,0.024910,0.008923,-0.106582,-0.014677,0.056804,0.033263,0.012779,-0.153810,-0.080855
L419M_L671M_S675T,0.025288,0.037631,0.065523,0.098199,0.021673,0.014411,-0.133108,0.093348,-0.055870,-0.113290,...,-0.090493,0.026763,0.006453,-0.105320,-0.015044,0.057256,0.033610,0.012980,-0.154736,-0.080529
L419M_L671M_L670I,0.025764,0.037241,0.066232,0.098798,0.021905,0.014625,-0.132613,0.092913,-0.056192,-0.113224,...,-0.091394,0.026920,0.007144,-0.106102,-0.014303,0.056824,0.033730,0.013113,-0.154684,-0.080340
L419M_S675T_L670I,0.026084,0.037454,0.065289,0.097966,0.022189,0.014125,-0.132801,0.093942,-0.056266,-0.113227,...,-0.090517,0.025757,0.006464,-0.105930,-0.015596,0.057733,0.033381,0.012862,-0.154652,-0.080294


In [7]:
dataset_name = 'mlv_4th_2_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings_4th = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
embeddings_4th

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT Wild-type sequence,0.026001,0.037897,0.066144,0.098851,0.022965,0.015054,-0.132152,0.093454,-0.056323,-0.113460,...,-0.089845,0.026988,0.007822,-0.106862,-0.014972,0.058078,0.034002,0.011993,-0.153669,-0.081689
L670H_L514E_T669G_A660G,0.026431,0.034314,0.066498,0.096873,0.022843,0.014424,-0.133830,0.094816,-0.058430,-0.112309,...,-0.090826,0.029466,0.012990,-0.107666,-0.016971,0.056240,0.033431,0.011739,-0.155582,-0.077595
L670H_L514E_T669G_L671W,0.025067,0.036195,0.066453,0.098641,0.021931,0.014120,-0.133172,0.093990,-0.059368,-0.112248,...,-0.089260,0.030308,0.011661,-0.106665,-0.015849,0.057632,0.033085,0.011793,-0.157014,-0.079669
L670H_L514E_T669G_A659M,0.026866,0.034862,0.067196,0.097300,0.022457,0.013741,-0.132581,0.095668,-0.057961,-0.113285,...,-0.091631,0.029622,0.011955,-0.108385,-0.016671,0.056134,0.033500,0.012418,-0.156464,-0.079407
L670H_L514E_T669G_L671R,0.025897,0.034739,0.066681,0.098972,0.022303,0.013670,-0.132939,0.094459,-0.059979,-0.112242,...,-0.089688,0.029339,0.011811,-0.106318,-0.017907,0.056676,0.033572,0.011525,-0.158010,-0.078683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L491M_L419M_L671M_S675T,0.024848,0.037671,0.065983,0.097937,0.021753,0.014498,-0.131694,0.092919,-0.055109,-0.114150,...,-0.090495,0.026208,0.007728,-0.105194,-0.014608,0.056342,0.033833,0.014164,-0.154279,-0.080070
L491M_L419M_L671M_L670I,0.025336,0.037289,0.066690,0.098535,0.021979,0.014712,-0.131197,0.092483,-0.055438,-0.114090,...,-0.091396,0.026367,0.008429,-0.105966,-0.013879,0.055916,0.033942,0.014303,-0.154229,-0.079880
L491M_L419M_S675T_L670I,0.025641,0.037493,0.065748,0.097701,0.022272,0.014216,-0.131399,0.093507,-0.055506,-0.114093,...,-0.090519,0.025201,0.007738,-0.105803,-0.015161,0.056814,0.033598,0.014052,-0.154202,-0.079842
L491M_L671M_S675T_L670I,0.025465,0.037518,0.066907,0.098595,0.023096,0.014503,-0.131452,0.093190,-0.056763,-0.114827,...,-0.090807,0.025117,0.009180,-0.106217,-0.013993,0.056009,0.033078,0.012894,-0.154112,-0.080821


In [8]:
dataset_name = 'mlv_5th_2_esm2_t48_15B_UR50D'
base_path = 'mlv/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings_5th = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
embeddings_5th

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT Wild-type sequence,0.026001,0.037897,0.066144,0.098851,0.022965,0.015054,-0.132152,0.093454,-0.056323,-0.113460,...,-0.089845,0.026988,0.007822,-0.106862,-0.014972,0.058078,0.034002,0.011993,-0.153669,-0.081689
L670H_L514E_T669G_A660G_L671W,0.025015,0.034100,0.066821,0.097470,0.021959,0.014283,-0.133876,0.094343,-0.058613,-0.111808,...,-0.090191,0.031041,0.013815,-0.107300,-0.016727,0.056265,0.032823,0.011521,-0.156968,-0.077627
L670H_L514E_T669G_A660G_A659M,0.028021,0.032470,0.066732,0.094798,0.022510,0.013782,-0.133398,0.096185,-0.056801,-0.111796,...,-0.091904,0.031367,0.014544,-0.109516,-0.018743,0.055199,0.032760,0.009542,-0.155062,-0.076194
L670H_L514E_T669G_A660G_L671R,0.025774,0.032390,0.066715,0.097672,0.022147,0.013877,-0.133729,0.094826,-0.059398,-0.111852,...,-0.090390,0.030157,0.013800,-0.106568,-0.018711,0.055373,0.033324,0.011263,-0.158065,-0.076850
L670H_L514E_T669G_A660G_S668G,0.025932,0.033803,0.066355,0.096623,0.023545,0.013851,-0.132765,0.095640,-0.059238,-0.111717,...,-0.089992,0.030046,0.013695,-0.108055,-0.017813,0.055322,0.033797,0.012154,-0.156547,-0.076254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L670K_D653I_T669W_L671I_L333I,0.024564,0.036822,0.066990,0.098142,0.022720,0.015042,-0.133721,0.093097,-0.058218,-0.112791,...,-0.091122,0.025537,0.010415,-0.107779,-0.014557,0.055984,0.034003,0.011641,-0.157919,-0.077612
L670K_D653I_T669W_L671I_V580S,0.023910,0.038670,0.067703,0.097831,0.022620,0.013823,-0.133143,0.092853,-0.055748,-0.112528,...,-0.090196,0.026351,0.009339,-0.109932,-0.014998,0.053946,0.035588,0.011351,-0.156942,-0.078072
L670K_D653I_T669W_L671I_Q492R,0.024692,0.035697,0.066920,0.098255,0.022244,0.014452,-0.133912,0.093080,-0.058095,-0.110431,...,-0.091119,0.025304,0.011224,-0.107262,-0.016687,0.054321,0.035678,0.012984,-0.160082,-0.078347
L670K_D653I_T669W_L671I_L491M,0.024013,0.036934,0.067006,0.097925,0.022866,0.014881,-0.132282,0.092711,-0.056561,-0.113421,...,-0.090971,0.025691,0.012434,-0.108281,-0.014840,0.055245,0.034938,0.013513,-0.156899,-0.078492


In [9]:
# convert the indices of embeddings_2nd, embeddings_3rd, embeddings_4th to strings
embeddings.columns = embeddings.columns.astype(str)
embeddings_2nd.columns = embeddings_2nd.columns.astype(str)
embeddings_3rd.columns = embeddings_3rd.columns.astype(str)
embeddings_4th.columns = embeddings_4th.columns.astype(str)
embeddings_5th.columns = embeddings_5th.columns.astype(str)

# remove row that is WT Wild-type sequence
embeddings_2nd = embeddings_2nd.drop('WT Wild-type sequence')
embeddings_3rd = embeddings_3rd.drop('WT Wild-type sequence')
embeddings_4th = embeddings_4th.drop('WT Wild-type sequence')
embeddings_5th = embeddings_5th.drop('WT Wild-type sequence')

# Concatenate along rows
embeddings_full = pd.concat([embeddings, embeddings_2nd, embeddings_4th, embeddings_5th], axis=0)
embeddings_full

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
M320I,0.026940,0.037588,0.066551,0.099030,0.021716,0.014472,-0.131924,0.094308,-0.057852,-0.113178,...,-0.089854,0.024963,0.008938,-0.106676,-0.015563,0.057679,0.034707,0.010038,-0.153705,-0.081625
E282D,0.026505,0.037227,0.065662,0.098670,0.023132,0.015304,-0.130645,0.094127,-0.056908,-0.112610,...,-0.089899,0.027284,0.008688,-0.107788,-0.015923,0.059264,0.034138,0.012846,-0.151624,-0.081127
Q383G,0.026118,0.038315,0.065061,0.098543,0.023923,0.014943,-0.129731,0.092198,-0.055923,-0.111561,...,-0.089921,0.024331,0.006300,-0.106282,-0.016044,0.057277,0.035095,0.012304,-0.155163,-0.080189
E372S,0.022405,0.040278,0.064326,0.098013,0.021698,0.016157,-0.132202,0.093713,-0.056204,-0.113769,...,-0.089035,0.027543,0.006830,-0.105350,-0.015514,0.056943,0.034686,0.010381,-0.154470,-0.077426
I672M,0.025102,0.038173,0.065741,0.098679,0.022948,0.014640,-0.131515,0.092997,-0.055775,-0.113919,...,-0.088714,0.028256,0.006979,-0.106508,-0.015773,0.058601,0.034031,0.012295,-0.154616,-0.081792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L670K_D653I_T669W_L671I_L333I,0.024564,0.036822,0.066990,0.098142,0.022720,0.015042,-0.133721,0.093097,-0.058218,-0.112791,...,-0.091122,0.025537,0.010415,-0.107779,-0.014557,0.055984,0.034003,0.011641,-0.157919,-0.077612
L670K_D653I_T669W_L671I_V580S,0.023910,0.038670,0.067703,0.097831,0.022620,0.013823,-0.133143,0.092853,-0.055748,-0.112528,...,-0.090196,0.026351,0.009339,-0.109932,-0.014998,0.053946,0.035588,0.011351,-0.156942,-0.078072
L670K_D653I_T669W_L671I_Q492R,0.024692,0.035697,0.066920,0.098255,0.022244,0.014452,-0.133912,0.093080,-0.058095,-0.110431,...,-0.091119,0.025304,0.011224,-0.107262,-0.016687,0.054321,0.035678,0.012984,-0.160082,-0.078347
L670K_D653I_T669W_L671I_L491M,0.024013,0.036934,0.067006,0.097925,0.022866,0.014881,-0.132282,0.092711,-0.056561,-0.113421,...,-0.090971,0.025691,0.012434,-0.108281,-0.014840,0.055245,0.034938,0.013513,-0.156899,-0.078492


In [10]:
del embeddings
del embeddings_2nd
del embeddings_3rd
del embeddings_4th
del embeddings_5th

In [14]:
base_path = 'mlv/'
round_file_name_all = 'mlv_Round4_all.xlsx'
round_file_name_5 = 'mlv_Round5.xlsx'
round_file_name_6 = 'mlv_Round6.xlsx'
round_file_name_multi_1 = 'mlv_Round1_multi.xlsx'
mlv_sequence = 'TLNIEDEYRLHETSKEPDVSLGSTWLSDFPQAWAETGGMGLAVRQAPLIIPLKATSTPVSIKQYPMSQEARLGIKPHIQRLLDQGILVPCQSPWNTPLLPVKKPGTNDYRPVQDLREVNKRVEDIHPTVPNPYNLLSGLPPSHQWYTVLDLKDAFFCLRLHPTSQPLFAFEWRDPEMGISGQLTWTRLPQGFKNSPTLFNEALHRDLADFRIQHPDLILLQYVDDLLLAATSELDCQQGTRALLQTLGNLGYRASAKKAQICQKQVKYLGYLLKEGQRWLTEARKETVMGQPTPKTPRQLREFLGKAGFCRLFIPGFAEMAAPLYPLTKPGTLFNWGPDQQKAYQEIKQALLTAPALGLPDLTKPFELFVDEKQGYAKGVLTQKLGPWRRPVAYLSKKLDPVAAGWPPCLRMVAAIAVLTKDAGKLTMGQPLVILAPHAVEALVKQPPDRWLSNARMTHYQALLLDTDRVQFGPVVALNPATLLPLPEEGLQHNCLDILAEAHGTRPDLTDQPLPDADHTWYTDGSSLLQEGQRKAGAAVTTETEVIWAKALPAGTSAQRAELIALTQALKMAEGKKLNVYTDSRYAFATAHIHGEIYRRRGWLTSEGKEIKNKDEILALLKALFLPKRLSIIHCPGHQKGHSAEARGNRMADQAARKAAITETPDTSTLLIENSSP'
experimental_data_all = read_experimental_data(base_path, round_file_name_all, mlv_sequence)
experimental_data_5 = read_experimental_data(base_path, round_file_name_5, mlv_sequence)
experimental_data_6 = read_experimental_data(base_path, round_file_name_6, mlv_sequence)
experimental_data_multi_1 = read_experimental_data(base_path, round_file_name_multi_1, mlv_sequence, single_mutant=False)

print(experimental_data_all, experimental_data_5, experimental_data_6, experimental_data_multi_1)
df_list = [experimental_data_all, experimental_data_5, experimental_data_6, experimental_data_multi_1]

   Variant   fitness  Round  updated_variant
0       WT  1.000000       1              WT
1      23K  0.866590       1            S23K
2      58T  0.963272       1            P58T
3      79R  0.594365       1            Q79R
4     115K  0.614792       1           L115K
5     182M  0.931950       1           Q182M
6     271N  0.801496       1           Y271N
7     318F  0.925166       1           A318F
8     345H  0.953547       1           Q345H
9     376Y  0.202246       1           Y376Y
10    422P  0.333216       1           D422P
11     52M  0.581691       2            L52M
12    501D  0.587692       2           E501D
13    362V  0.514356       2           L362V
14    346T  0.690671       2           E346T
15     13M  0.942222       2            T13M
16    419M  1.018405       2           L419M
17    528M  0.880965       2           L528M
18    333M  0.780678       2           L333M
19    491H  0.911881       2           L491H
20    352M  0.848586       2           L352M
21    671T

In [24]:
iterations_multi_two, labels_multi_two = create_dataframes(df_list, embeddings_full.index)

iteration_old = iterations_multi_two
embeddings_pd = embeddings_full
labels_pd = labels_multi_two
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [25]:
df_all
df_all.to_csv('mlv/round2_multi_all_new.csv', index=False)

In [26]:
df_test

Unnamed: 0,variant,y_pred,y_actual
12889,L514E_L670S,1.546589,
15347,L514E_L670S_T669G_S675T,1.534767,
15368,L514E_L670S_A660G_S675T,1.519690,
13398,L670H_L514E_A660G_D666R,1.486858,
12868,L670H_A659M,1.481942,
...,...,...,...
78827,T669G_L670K_L671R_S668G_P485V,0.656916,
3303,D422T,0.647544,
26042,L671W_L670K_S668G_P485V,0.645944,
110877,L670K_S668G_P485V_V580S_Q492R,0.644472,


In [27]:
df_test.to_csv('mlv/round2_multi_predictions_new.csv', index=False)

## Psacas12f round 1

In [15]:
# import brenan data
dataset_name = 'psacas12f_esm2_t48_15B_UR50D'
base_path = 'psacas12f/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [16]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT,0.045623,-0.057608,-0.021596,-0.023409,-0.117766,0.002835,-0.034280,0.108074,-0.147289,-0.047066,...,-0.080725,-0.040105,0.094098,-0.133767,-0.026106,-0.047406,-0.039465,0.047134,-0.176884,0.087281
M1A,0.045933,-0.056515,-0.019767,-0.025154,-0.119241,0.003329,-0.034860,0.107935,-0.149166,-0.043826,...,-0.083531,-0.040632,0.094804,-0.132769,-0.024924,-0.046912,-0.040667,0.045055,-0.176133,0.083391
M1C,0.044838,-0.056597,-0.019064,-0.024549,-0.118145,0.001257,-0.035403,0.109371,-0.148566,-0.043649,...,-0.084472,-0.039289,0.093506,-0.132674,-0.026766,-0.046812,-0.039092,0.045713,-0.175992,0.084699
M1D,0.046248,-0.055031,-0.020539,-0.022602,-0.118585,0.003185,-0.034596,0.105473,-0.148745,-0.044723,...,-0.079447,-0.039959,0.095808,-0.135631,-0.023760,-0.047297,-0.039773,0.045541,-0.178692,0.085272
M1E,0.046363,-0.054679,-0.019546,-0.023710,-0.119949,0.002362,-0.034701,0.107468,-0.148440,-0.044863,...,-0.081398,-0.040111,0.094267,-0.134299,-0.024198,-0.046134,-0.041479,0.045468,-0.176906,0.085000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K586S,0.046908,-0.058761,-0.021707,-0.022943,-0.117715,0.003024,-0.036306,0.108303,-0.147151,-0.049140,...,-0.078514,-0.039117,0.094524,-0.133595,-0.024180,-0.048557,-0.039557,0.046136,-0.174744,0.085911
K586T,0.047970,-0.059332,-0.022386,-0.022880,-0.117853,0.003151,-0.036292,0.107691,-0.146539,-0.051721,...,-0.076575,-0.039546,0.094977,-0.134229,-0.023738,-0.049043,-0.040182,0.046293,-0.174896,0.085357
K586V,0.046595,-0.059537,-0.021523,-0.024160,-0.118069,0.001362,-0.035907,0.108233,-0.147937,-0.048904,...,-0.080355,-0.040149,0.093683,-0.133417,-0.026381,-0.049075,-0.039538,0.046030,-0.173255,0.087002
K586W,0.047063,-0.058231,-0.021960,-0.022216,-0.117791,0.002137,-0.034946,0.107953,-0.146737,-0.049795,...,-0.079466,-0.038489,0.094664,-0.134073,-0.024353,-0.048790,-0.039763,0.047051,-0.175515,0.087269


In [17]:
base_path = 'psacas12f/'
round_file_name_1 = 'psacas12f_Round1.xlsx'
psacas12f_sequence = 'MPSETYITKTLSLKLIPSDEEKQALENYFITFQRAVNFAIDRIVDIRSSFRYLNKNEQFPAVCDCCGKKEKIMYVNISNKTFKFKPSRNQKDRYTKDIYTIKPNAHICKTCYSGVAGNMFIRKQMYPNDKEGWKVSRSYNIKVNAPGLTGTEYAMAIRKAISILRSFEKRRRNAERRIIEYEKSKKEYLELIDDVEKGKTNKIVVLEKEGHQRVKRYKHKNWPEKWQGISLNKAKSKVKDIEKRIKKLKEWKHPTLNRPYVELHKNNVRIVGYETVELKLGNKMYTIHFASISNLRKPFRKQKKKSIEYLKHLLTLALKRNLETYPSIIKRGKNFFLQYPVRVTVKVPKLTKNFKAFGIDRGVNRLAVGCIISKDGKLTNKNIFFFHGKEAWAKENRYKKIRDRLYAMAKKLRGDKTKKIRLYHEIRKKFRHKVKYFRRNYLHNISKQIVEIAKENTPTVIVLEDLRYLRERTYRGKGRSKKAKKTNYKLNTFTYRMLIDMIKYKAEEAGVPVMIIDPRNTSRKCSKCGYVDENNRKQASFKCLKCGYSLNADLNAAVNIAKAFYECPTFRWEEKLHAYVCSEPDK'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, psacas12f_sequence)
print(experimental_data_1)
df_list = [experimental_data_1]

   Variant_dirty  fitness_raw Variant   fitness updated_variant
0          E425Y    23.185630    425Y  2.473795           E425Y
1          I192F    15.803604    192F  1.686168           I192F
2          M155Y     0.165144    155Y  0.017620           M155Y
3          R365F     0.184863    365F  0.019724           R365F
4          R365Y     0.380978    365Y  0.040648           R365Y
5          K159K    19.555174    159K  2.086443           K159K
6           E20K    25.998008     20K  2.773862            E20K
7          D194G    17.751708    194G  1.894022           D194G
8          I452L    33.078882    452L  3.529358           I452L
9          E190R    12.119830    190R  1.293128           E190R
10         D194P    10.972211    194P  1.170682           D194P
11            WT     9.372494      WT  1.000000              WT


In [18]:
iterations_one, labels_one = create_dataframes(df_list, embeddings.index)

iteration_old = iterations_one
embeddings_pd = embeddings
labels_pd = labels_one
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16

df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

Embeddings and labels are aligned


In [19]:
df_all
df_all.to_csv('psacas12f/round1_all_new.csv', index=False)

In [20]:
df_test.to_csv('psacas12f/round1_predictions_new.csv', index=False)

In [21]:
df_all

Unnamed: 0,variant,y_pred,y_actual
8578,I452L,2.938166,3.529358
412,K22Q,2.295731,
3594,E190D,2.288140,
369,E20K,2.285385,2.773862
406,K22H,2.248675,
...,...,...,...
7741,M408I,0.931625,
6597,P348E,0.923928,
6921,R365F,0.572717,0.019724
2945,M155Y,0.571976,0.017620
