In [39]:
import pandas as pd
import numpy as np
from sklearn import linear_model, preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from scipy.spatial.distance import cdist
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import ConvergenceWarning
import xgboost
from sklearn.decomposition import PCA
import warnings
import random
import time
import os
import sys
import argparse
import torch
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from Bio import SeqIO

# Silence every FutureWarning, and the ConvergenceWarnings raised from within
# sklearn.neural_network (the module MLPRegressor is imported from).
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn.neural_network")
# Disable pandas' chained-assignment warning for the whole notebook (default='warn').
pd.options.mode.chained_assignment = None  # default='warn'


### Functions ported from grid_search

In [40]:
# Function to read in the data
def read_data(dataset_name, base_path, file_type, embeddings_type='both', experimental = False):
    """Load variant embeddings and, unless ``experimental``, the matching labels.

    Parameters
    ----------
    dataset_name : str
        Name of the embeddings file without extension. The labels / hie_temp
        file prefix is the first ``'_'``-separated token for ``'csvs'`` and the
        last one for ``'pts'``.
    base_path : str
        Directory containing the ``labels``, ``hie_temp``, ``csvs`` and ``pts``
        sub-directories.
    file_type : {'csvs', 'pts'}
        ``'csvs'`` reads ``csvs/<dataset_name>.csv`` (first column is the index);
        ``'pts'`` reads ``pts/<dataset_name>.pt`` with ``torch.load``.
    embeddings_type : {'average', 'mutated', 'both'}
        Only used for ``'pts'`` files: which entry of each per-variant record to
        keep (``'both'`` concatenates ``'average'`` followed by ``'mutated'``).
    experimental : bool
        When True only the embeddings are returned and no label file is read.

    Returns
    -------
    pandas.DataFrame
        The embeddings alone when ``experimental`` is True.
    tuple of (embeddings, labels, hie_data)
        When ``experimental`` is False. Labels with a NaN ``fitness`` are
        dropped, embeddings are restricted to labelled variants, and both are
        sorted by variant name.

    Raises
    ------
    ValueError
        If ``file_type`` or ``embeddings_type`` is not one of the supported
        values. (Previously a message was printed and ``(None, None)`` was
        returned, which matched neither of the two normal return shapes.)
    """
    # Construct the file paths and load the embeddings
    if file_type == "csvs":
        dataset_prefix = dataset_name.split('_')[0]
        embeddings_file = os.path.join(base_path, 'csvs', dataset_name + '.csv')
        # Read in mean embeddings across all rounds
        embeddings = pd.read_csv(embeddings_file, index_col=0)
    elif file_type == "pts":
        # Validate before loading so a typo does not cost a full torch.load
        if embeddings_type not in ('average', 'mutated', 'both'):
            raise ValueError("Invalid embeddings_type. Please choose 'average', 'mutated', or 'both'")
        dataset_prefix = dataset_name.split('_')[-1]
        embeddings_file = os.path.join(base_path, 'pts', dataset_name + '.pt')
        # NOTE(review): torch.load unpickles the file; only use it on trusted files.
        raw_embeddings = torch.load(embeddings_file)
        # Convert each per-variant record to a flat numpy vector
        if embeddings_type == 'both':
            vectors = {key: torch.cat((value['average'], value['mutated'])).numpy()
                       for key, value in raw_embeddings.items()}
        else:
            vectors = {key: value[embeddings_type].numpy() for key, value in raw_embeddings.items()}
        # One row per variant, indexed by the dictionary key
        embeddings = pd.DataFrame.from_dict(vectors, orient='index')
    else:
        raise ValueError("Invalid file type. Please choose either 'csvs' or 'pts'")

    if experimental:
        return embeddings

    labels_file = os.path.join(base_path, 'labels', dataset_prefix + '_labels.csv')
    hie_file = os.path.join(base_path, 'hie_temp', dataset_prefix + '.csv')

    # Read in labels
    labels = pd.read_csv(labels_file)

    # Read in hierarchy
    hie_data = pd.read_csv(hie_file)

    # Filter out rows where fitness is NaN
    labels = labels[labels['fitness'].notna()]

    # Keep only embeddings whose row name appears in the labels' variant column
    embeddings = embeddings[embeddings.index.isin(labels['variant'])]

    # Sort both tables by variant name so that rows line up positionally
    labels = labels.sort_values(by=['variant']).reset_index(drop=True)
    embeddings = embeddings.sort_index()

    # Check if embedding row names and label variants are identical
    if labels['variant'].tolist() == embeddings.index.tolist():
        print('Embeddings and labels are aligned')
    else:
        # Downstream code pairs rows by position, so a mismatch must not pass silently
        warnings.warn(
            "Embeddings and labels are NOT aligned: "
            f"{len(labels)} labelled variants vs {len(embeddings)} embedding rows"
        )

    # return embeddings and labels
    return embeddings, labels, hie_data


# Active learning function for one iteration
def top_layer(iter_train, iter_test, embeddings_pd, labels_pd, measured_var, regression_type='ridge', top_n=None, final_round=10):
    """Fit a regressor on the training iterations and predict the test iteration.

    Rows of ``embeddings_pd`` and ``labels_pd`` are paired by position: both
    indices are reset before use, so the two tables must already be in the
    same row order.

    Parameters
    ----------
    iter_train : list
        Values of ``labels_pd['iteration']`` whose rows are used for fitting.
    iter_test : scalar
        Value of ``labels_pd['iteration']`` whose rows are predicted.
    embeddings_pd : pandas.DataFrame
        Feature matrix, one row per variant.
    labels_pd : pandas.DataFrame
        Must contain ``'variant'``, ``'iteration'`` and ``measured_var`` columns.
    measured_var : str
        Name of the target column in ``labels_pd``.
    regression_type : str
        One of 'ridge', 'lasso', 'elasticnet', 'linear', 'neuralnet',
        'randomforest', 'gradientboosting'.
    top_n, final_round
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    (df_test, df_all) : tuple of pandas.DataFrame
        Both have columns ``variant``, ``y_pred`` and ``y_actual`` and are
        sorted by ``y_pred`` in descending order. ``df_test`` holds the test
        rows only; ``df_all`` holds the training and test rows together.

    Raises
    ------
    ValueError
        If ``regression_type`` is not supported (previously this surfaced as an
        UnboundLocalError at ``model.fit``).
    """
    supported_types = ('ridge', 'lasso', 'elasticnet', 'linear', 'neuralnet',
                       'randomforest', 'gradientboosting')
    if regression_type not in supported_types:
        raise ValueError(
            f"Unknown regression_type {regression_type!r}; choose one of {', '.join(supported_types)}"
        )

    # reset the indices so that embeddings and labels share positional row labels
    embeddings_pd = embeddings_pd.reset_index(drop=True)
    labels = labels_pd.reset_index(drop=True)

    # boolean masks selecting the training and test rows by iteration value
    iteration = labels['iteration']
    train_mask = iteration.isin(iter_train)
    test_mask = iteration.isin([iter_test])
    idx_train = iteration[train_mask].index.to_numpy()
    idx_test = iteration[test_mask].index.to_numpy()

    # features and targets for the selected rows
    X_train = embeddings_pd.loc[idx_train, :]
    X_test = embeddings_pd.loc[idx_test, :]
    y_train = labels.loc[train_mask, measured_var]
    y_test = labels.loc[test_mask, measured_var]

    # build the requested model
    if regression_type == 'ridge':
        model = linear_model.RidgeCV()
    elif regression_type == 'lasso':
        model = linear_model.LassoCV(max_iter=100000,tol=1e-3)
    elif regression_type == 'elasticnet':
        model = linear_model.ElasticNetCV(max_iter=100000,tol=1e-3)
    elif regression_type == 'linear':
        model = linear_model.LinearRegression()
    elif regression_type == 'neuralnet':
        # (5,) is a real one-element tuple: a single hidden layer of 5 units
        model = MLPRegressor(hidden_layer_sizes=(5,), max_iter=1000, activation='relu', solver='adam', alpha=0.001,
                             batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5,
                             momentum=0.9, nesterovs_momentum=True, shuffle=True, random_state=1, tol=0.0001,
                             verbose=False, warm_start=False, early_stopping=False, validation_fraction=0.1, beta_1=0.9,
                             beta_2=0.999, epsilon=1e-08)
    elif regression_type == 'randomforest':
        # max_features=1.0 (all features) is what 'auto' meant for regressors;
        # 'auto' itself is rejected by recent scikit-learn releases.
        model = RandomForestRegressor(n_estimators=100, criterion='friedman_mse', max_depth=None, min_samples_split=2,
                                      min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=1.0,
                                      max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False,
                                      n_jobs=None, random_state=1, verbose=0, warm_start=False, ccp_alpha=0.0,
                                      max_samples=None)
    else:  # 'gradientboosting' (the only remaining supported value)
        model = xgboost.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                                     max_depth=5, alpha=10, n_estimators=10)

    model.fit(X_train, y_train)

    # make predictions on train and test data
    # NOTE: can work on alternate 2-n round strategies here
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # combine predicted and actual values with variant IDs into new dataframes
    df_train = pd.DataFrame({'variant': labels.variant[idx_train], 'y_pred': y_pred_train, 'y_actual': y_train})
    df_test = pd.DataFrame({'variant': labels.variant[idx_test], 'y_pred': y_pred_test, 'y_actual': y_test})

    # rank by predicted value, best first
    df_test = df_test.sort_values(by=['y_pred'], ascending=False)
    df_all = pd.concat([df_train, df_test]).sort_values(by=['y_pred'], ascending=False)

    return df_test, df_all

### New functions

In [41]:
def read_experimental_data(base_path, round_file_name, t7_sequence):
    """Read one round's Excel sheet and add fully qualified variant names.

    The sheet is read from ``base_path + '/rounds/' + round_file_name`` and must
    have a ``'Variant'`` column holding either ``'WT'`` or a 1-based position
    followed by one trailing character (e.g. ``'12N'``). A new
    ``'updated_variant'`` column is added in which each non-WT entry is
    prefixed with the character of ``t7_sequence`` at that position
    (``'12N'`` -> ``'S12N'``); ``'WT'`` is copied unchanged.

    Returns the dataframe with the extra column.
    """
    def _prefix_reference_residue(short_name):
        # 'WT' carries no position and is passed through as is
        if short_name == 'WT':
            return short_name
        # everything before the last character is the 1-based position
        site = int(short_name[:-1])
        return t7_sequence[site - 1] + short_name

    round_table = pd.read_excel(base_path + '/rounds/' + round_file_name)

    # Visit the rows in order and build the new column in one pass
    round_table['updated_variant'] = [
        _prefix_reference_residue(record['Variant'])
        for record in round_table.to_dict('records')
    ]

    return round_table

def create_dataframes(df_list, expected_index, unmeasured_iteration=1001):
    """Build the iteration table and the embedding-aligned label table.

    Parameters
    ----------
    df_list : list of pandas.DataFrame, or a single pandas.DataFrame
        One table per experimental round, in round order, each with
        ``'updated_variant'`` and ``'fitness'`` columns. A single dataframe is
        treated as a one-round list.
    expected_index : iterable of str
        Variant names in the row order of the embeddings (for example
        ``embeddings.index``). The label table follows this order exactly.
    unmeasured_iteration : int, default 1001
        Iteration value given to variants in ``expected_index`` that have no
        measurement.

    Returns
    -------
    iterations : pandas.DataFrame
        Columns ``variant`` and ``iteration`` for the measured variants only.
        Round ``k`` (1-based) gets iteration ``k``; ``'WT'`` is kept from the
        first round only and gets iteration 0.
    labels : pandas.DataFrame
        Columns ``variant``, ``fitness`` and ``iteration``, one row per entry
        of ``expected_index`` in that order. Unmeasured variants have NaN
        fitness and ``unmeasured_iteration``. Measured variants that are not in
        ``expected_index`` are dropped.

    Raises
    ------
    ValueError
        If ``df_list`` is empty, or (raised by pandas) if a variant was
        measured more than once so the labels cannot be reindexed uniquely.
    """
    # Accept a bare dataframe so a single round does not have to be wrapped
    if isinstance(df_list, pd.DataFrame):
        df_list = [df_list]
    df_list = list(df_list)
    if not df_list:
        raise ValueError("df_list must contain at least one round of experimental data")

    rounds = []  # per-round tables with 'variant' and 'iteration' columns
    for round_number, round_df in enumerate(df_list, start=1):
        # rename() returns a new frame, so the caller's dataframe is not modified
        round_df = round_df.rename(columns={'updated_variant': 'variant'})
        is_wt = (round_df['variant'] == 'WT').to_numpy()
        if round_number == 1:
            # The first round contributes the wild type as iteration 0
            iteration = np.where(is_wt, 0, round_number)
        else:
            # Later rounds repeat the WT control; keep only the new variants
            round_df = round_df[~is_wt]
            iteration = np.full(len(round_df), round_number)
        rounds.append(round_df.assign(iteration=iteration.astype(int)))

    measured = pd.concat(rounds, ignore_index=True)
    iterations = measured[['variant', 'iteration']]

    # Reorder to the embedding row order. Reindexing with a plain list (rather
    # than an unnamed Index) plus rename_axis keeps the index named 'variant',
    # so reset_index() restores a 'variant' column.
    labels = (
        measured[['variant', 'fitness', 'iteration']]
        .set_index('variant')
        .reindex(list(expected_index))
        .rename_axis('variant')
        .reset_index()
    )

    # Rows introduced by the reindex are the unmeasured variants
    labels['iteration'] = labels['iteration'].fillna(unmeasured_iteration).astype(int)

    return iterations, labels

In [42]:
# Load the 'average' embeddings for the T7 polymerase variants.
# With file_type='pts', read_data loads <base_path>/pts/<dataset_name>.pt, and
# with experimental=True it returns only the embeddings dataframe (no labels).
# NOTE(review): base_path is an absolute, machine-specific path.
dataset_name = 'esm2_15B_t7_pol'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/t7/'
file_type = 'pts'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# Rename the 'WT Wild-type sequence' row to 'WT' so it matches the 'WT' label
# used in the experimental round tables
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [43]:
# Display the embeddings table: one row per variant name, one column per embedding dimension
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT,0.147168,-0.183875,0.031651,-0.052828,-0.141786,-0.013741,-0.061830,-0.028053,0.018626,0.025174,...,-0.110809,0.051659,0.059534,-0.157181,0.013038,0.020313,0.015764,0.064493,-0.008039,-0.108970
M1A,0.147994,-0.184054,0.035075,-0.053447,-0.140782,-0.016417,-0.063300,-0.028770,0.018573,0.022787,...,-0.111060,0.049681,0.060578,-0.156948,0.011523,0.018843,0.014856,0.063135,-0.005746,-0.106902
M1C,0.149553,-0.182680,0.033869,-0.054389,-0.140723,-0.015113,-0.064822,-0.028185,0.018523,0.027269,...,-0.111689,0.051024,0.059965,-0.157671,0.012664,0.019815,0.015817,0.063816,-0.007434,-0.108856
M1D,0.146734,-0.182717,0.034278,-0.052991,-0.140832,-0.015275,-0.063740,-0.028154,0.018495,0.025465,...,-0.111113,0.050927,0.060537,-0.156441,0.011660,0.019746,0.013886,0.063376,-0.006656,-0.107104
M1E,0.149375,-0.184232,0.034520,-0.054731,-0.140585,-0.016518,-0.064851,-0.028408,0.018998,0.025971,...,-0.110941,0.051635,0.058811,-0.157911,0.012537,0.020981,0.014563,0.062794,-0.007955,-0.106604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A883S,0.145864,-0.184623,0.031468,-0.051257,-0.142275,-0.012640,-0.061425,-0.027589,0.017693,0.025046,...,-0.110233,0.051852,0.058792,-0.157249,0.013417,0.018916,0.019343,0.065511,-0.009308,-0.109120
A883T,0.145933,-0.184290,0.031539,-0.051148,-0.141504,-0.012715,-0.062040,-0.028451,0.017504,0.024164,...,-0.110211,0.052196,0.058730,-0.158339,0.012717,0.018639,0.020539,0.065699,-0.010361,-0.109183
A883V,0.145253,-0.182491,0.031516,-0.052928,-0.142025,-0.013264,-0.060926,-0.027928,0.017077,0.024314,...,-0.111040,0.051419,0.060660,-0.157816,0.012737,0.018943,0.017333,0.065630,-0.008499,-0.109677
A883W,0.145614,-0.183985,0.031491,-0.051818,-0.142407,-0.012759,-0.059910,-0.027657,0.016999,0.023699,...,-0.110127,0.052603,0.059757,-0.156924,0.013284,0.018394,0.016497,0.064918,-0.009098,-0.108907


In [44]:
# Read the round-1 measurements from <base_path>/rounds/T7_Round1.xlsx.
# t7_sequence is the reference sequence that read_experimental_data indexes
# (1-based positions) to prefix each variant with its original residue,
# e.g. '12N' -> 'S12N'.
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/t7/'
round_file_name = 'T7_Round1.xlsx'
t7_sequence = 'MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFERQLKAGEVADNAAAKPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA'
experimental_data = read_experimental_data(base_path, round_file_name, t7_sequence)
print(experimental_data)
# create_dataframes takes a list with one dataframe per round
df_list = [experimental_data]

  Variant   fitness updated_variant
0     12N  1.073846            S12N
1     25N  0.677227            A25N
2      WT  1.000000              WT
3     89R  0.740499            F89R
4    134T  1.074891           V134T
5    177L  1.042706           V177L
6    225E  1.075861           G225E
7    241W  0.938351           S241W
8    273H  0.785147           V273H


In [45]:
# Build the iteration table and the label table ordered like the embeddings
iterations_one, labels_one = create_dataframes(df_list, embeddings.index)

# Inputs for top_layer
iteration_old = iterations_one
embeddings_pd = embeddings
labels_pd = labels_one
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16  # not used in this cell
final_round = 16

# Train on every iteration present in the measured data and predict the rows
# with iteration 1001, the value create_dataframes gives to variants that have
# no measurement. top_n and final_round are passed through but top_layer's
# body does not read them.
df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

In [46]:
# Display the test-set predictions; top_layer returns them sorted by y_pred, highest first
df_test

Unnamed: 0,variant,y_pred,y_actual
4715,E249C,1.011400,
5569,L294C,1.010837,
8905,G469Q,1.010637,
5298,T279S,1.009233,
5329,I281L,1.008760,
...,...,...,...
16,M1I,0.907080,
15452,F814G,0.906483,
10,M1C,0.906116,
12,M1E,0.905141,


In [47]:
# Write the ranked round-1 predictions to CSV; index=False omits the row labels.
# The path is relative, so it resolves against the current working directory.
df_test.to_csv('t7/round1_predictions.csv', index=False)

In [48]:
# Reload the 'average' embeddings for the T7 polymerase variants (same settings
# as the earlier T7 loading cell) before the two-round analysis.
# With file_type='pts', read_data loads <base_path>/pts/<dataset_name>.pt, and
# with experimental=True it returns only the embeddings dataframe (no labels).
# NOTE(review): base_path is an absolute, machine-specific path.
dataset_name = 'esm2_15B_t7_pol'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/t7/'
file_type = 'pts'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# Rename the 'WT Wild-type sequence' row to 'WT' so it matches the 'WT' label
# used in the experimental round tables
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [49]:
# Display the reloaded embeddings table: one row per variant name, one column per embedding dimension
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
WT,0.147168,-0.183875,0.031651,-0.052828,-0.141786,-0.013741,-0.061830,-0.028053,0.018626,0.025174,...,-0.110809,0.051659,0.059534,-0.157181,0.013038,0.020313,0.015764,0.064493,-0.008039,-0.108970
M1A,0.147994,-0.184054,0.035075,-0.053447,-0.140782,-0.016417,-0.063300,-0.028770,0.018573,0.022787,...,-0.111060,0.049681,0.060578,-0.156948,0.011523,0.018843,0.014856,0.063135,-0.005746,-0.106902
M1C,0.149553,-0.182680,0.033869,-0.054389,-0.140723,-0.015113,-0.064822,-0.028185,0.018523,0.027269,...,-0.111689,0.051024,0.059965,-0.157671,0.012664,0.019815,0.015817,0.063816,-0.007434,-0.108856
M1D,0.146734,-0.182717,0.034278,-0.052991,-0.140832,-0.015275,-0.063740,-0.028154,0.018495,0.025465,...,-0.111113,0.050927,0.060537,-0.156441,0.011660,0.019746,0.013886,0.063376,-0.006656,-0.107104
M1E,0.149375,-0.184232,0.034520,-0.054731,-0.140585,-0.016518,-0.064851,-0.028408,0.018998,0.025971,...,-0.110941,0.051635,0.058811,-0.157911,0.012537,0.020981,0.014563,0.062794,-0.007955,-0.106604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A883S,0.145864,-0.184623,0.031468,-0.051257,-0.142275,-0.012640,-0.061425,-0.027589,0.017693,0.025046,...,-0.110233,0.051852,0.058792,-0.157249,0.013417,0.018916,0.019343,0.065511,-0.009308,-0.109120
A883T,0.145933,-0.184290,0.031539,-0.051148,-0.141504,-0.012715,-0.062040,-0.028451,0.017504,0.024164,...,-0.110211,0.052196,0.058730,-0.158339,0.012717,0.018639,0.020539,0.065699,-0.010361,-0.109183
A883V,0.145253,-0.182491,0.031516,-0.052928,-0.142025,-0.013264,-0.060926,-0.027928,0.017077,0.024314,...,-0.111040,0.051419,0.060660,-0.157816,0.012737,0.018943,0.017333,0.065630,-0.008499,-0.109677
A883W,0.145614,-0.183985,0.031491,-0.051818,-0.142407,-0.012759,-0.059910,-0.027657,0.016999,0.023699,...,-0.110127,0.052603,0.059757,-0.156924,0.013284,0.018394,0.016497,0.064918,-0.009098,-0.108907


In [50]:
# Read the round-1 and round-2 measurements from <base_path>/rounds/.
# t7_sequence is the reference sequence that read_experimental_data indexes
# (1-based positions) to prefix each variant with its original residue.
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/t7/'
round_file_name_1 = 'T7_Round1.xlsx'
round_file_name_2 = 'T7_Round2.xlsx'
t7_sequence = 'MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFERQLKAGEVADNAAAKPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA'
experimental_data_1 = read_experimental_data(base_path, round_file_name_1, t7_sequence)
experimental_data_2 = read_experimental_data(base_path, round_file_name_2, t7_sequence)
print(experimental_data_1)
print(experimental_data_2)
# List order defines the round number: position 1 -> iteration 1, position 2 -> iteration 2
df_list = [experimental_data_1, experimental_data_2]

  Variant   fitness updated_variant
0     12N  1.073846            S12N
1     25N  0.677227            A25N
2      WT  1.000000              WT
3     89R  0.740499            F89R
4    134T  1.074891           V134T
5    177L  1.042706           V177L
6    225E  1.075861           G225E
7    241W  0.938351           S241W
8    273H  0.785147           V273H
   Variant   fitness updated_variant
0     249C  1.055202           E249C
1     279S  0.999604           T279S
2     281L  0.589696           I281L
3     229I  0.478365           L229I
4     735S  1.840046           V735S
5     152N  1.908253           G152N
6     822S  1.678626           A822S
7     531T  1.096505           S531T
8     256P  0.600498           T256P
9     469Q  1.319594           G469Q
10    668E  1.279096           T668E
11      WT  1.000000              WT


In [51]:
# Build the iteration table and the label table from both rounds, ordered like the embeddings
iterations_two, labels_two = create_dataframes(df_list, embeddings.index)

# Inputs for top_layer
iteration_old = iterations_two
embeddings_pd = embeddings
labels_pd = labels_two
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16  # not used in this cell
final_round = 16

# Train on every iteration present in the measured data (rounds 1 and 2) and
# predict the rows with iteration 1001, the value create_dataframes gives to
# variants that have no measurement. top_n and final_round are passed through
# but top_layer's body does not read them.
df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

In [52]:
# Display the test-set predictions; top_layer returns them sorted by y_pred, highest first
df_test

Unnamed: 0,variant,y_pred,y_actual
198,D10K,1.318788,
12212,E643N,1.312872,
7033,N370V,1.311535,
103,N5I,1.305016,
15037,R792H,1.299831,
...,...,...,...
11140,I587D,0.996957,
1267,D66S,0.992142,
11242,N592L,0.987486,
6504,K343A,0.968447,


In [53]:
# Write the ranked round-2 predictions to CSV; index=False omits the row labels.
# The path is relative, so it resolves against the current working directory.
df_test.to_csv('t7/round2_predictions.csv', index=False)

In [52]:
# Load the fanzor variant embeddings.
# With file_type='csvs', read_data reads <base_path>/csvs/<dataset_name>.csv
# using the first column as the index; embeddings_type is not consulted for
# CSV files. With experimental=True only the embeddings dataframe is returned.
# NOTE(review): base_path is an absolute, machine-specific path.
dataset_name = 'fanzor_esm2_t48_15B_UR50D'
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/fanzor/'
file_type = 'csvs'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, experimental=experimental)
# Rename a 'WT Wild-type sequence' row (if present) to 'WT' so it matches the
# 'WT' label used in the experimental round tables
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})

In [53]:
# Display the fanzor embeddings table: one row per variant name, one column per embedding dimension
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
D289I,0.076350,-0.096517,-0.039485,0.022141,-0.140778,0.060272,-0.157692,0.053028,-0.101141,-0.058941,...,-0.163110,-0.056517,0.018743,-0.116311,0.010050,-0.082736,-0.068338,0.026717,-0.135575,0.090331
K54A,0.081564,-0.108703,-0.035292,0.022103,-0.143718,0.055708,-0.158636,0.051587,-0.099935,-0.056645,...,-0.159871,-0.058828,0.010209,-0.116926,0.010969,-0.086742,-0.072210,0.026919,-0.131113,0.088953
Y403W,0.083564,-0.103152,-0.035773,0.019590,-0.140105,0.058891,-0.161704,0.049788,-0.096187,-0.057417,...,-0.163753,-0.054475,0.013977,-0.118503,0.010152,-0.083362,-0.071264,0.024228,-0.136338,0.091752
N441W,0.085060,-0.102434,-0.034094,0.019113,-0.140549,0.057376,-0.160521,0.053643,-0.102079,-0.058173,...,-0.163639,-0.054493,0.015502,-0.117870,0.010070,-0.085296,-0.072524,0.028138,-0.135484,0.088637
E336I,0.081061,-0.103307,-0.037303,0.019810,-0.141395,0.060582,-0.161910,0.051436,-0.100989,-0.056924,...,-0.159600,-0.057073,0.012353,-0.119047,0.011846,-0.084845,-0.072005,0.029802,-0.134100,0.089094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D132W,0.079420,-0.105425,-0.030861,0.021674,-0.141255,0.057854,-0.160459,0.051246,-0.100951,-0.055793,...,-0.163258,-0.056315,0.014693,-0.118476,0.013192,-0.081750,-0.067947,0.028840,-0.137237,0.088986
R263G,0.083450,-0.106207,-0.033075,0.021937,-0.141942,0.058554,-0.160312,0.051613,-0.098827,-0.060687,...,-0.162723,-0.054210,0.014769,-0.119330,0.010308,-0.084599,-0.070771,0.026594,-0.133062,0.091404
V123P,0.082387,-0.104821,-0.032308,0.020482,-0.139570,0.055889,-0.161555,0.051725,-0.097712,-0.054436,...,-0.163546,-0.057811,0.011002,-0.118673,0.012264,-0.085351,-0.068471,0.026130,-0.137345,0.090820
I355T,0.082915,-0.105555,-0.036674,0.019969,-0.141193,0.058157,-0.162853,0.050937,-0.098292,-0.057070,...,-0.162178,-0.056842,0.014052,-0.117169,0.009596,-0.084980,-0.070661,0.026135,-0.134839,0.087683


In [57]:
# Read the round-1 fanzor measurements from <base_path>/rounds/fanzor_Round1.xlsx.
# fanzor_sequence is passed as the reference sequence (the helper's parameter
# is named t7_sequence) and is indexed with 1-based positions to prefix each
# variant with its original residue, e.g. '27T' -> 'E27T'.
base_path = '/Users/matteodibernardo/Documents/GitHub/directed_evolution/notebooks/fanzor/'
round_file_name = 'fanzor_Round1.xlsx'
fanzor_sequence = 'MKRKREDLTLWDAANVHKHKSMWYWWEYIRRKDMVNHEKTDCDVIQLLQSASVKKQKTQSDKFLTSFSVGIRPTKHQKRVLNEMLRVSNYTYNWCLWLVNEKGLKPHQFELQKIVCKTNANDVDPQYRMENDDWFFNNKMTSVKLTSCKNFCTSYKSAKSLKSKLKRPMSVSNIIQGSFCVPKLFIRHLSSKDVSTDNTNMQNRYICMMPDNFEKRSNPKERFLKLAKPITKIPPIDHDVKIVKRADGMFIMNIPCDPKYTRRNASNDTIEKRVCGIDPGGRTFATVYDPIDCCVFQVGIKEDKQYVISKLHNKIDHAHMHLTKAQNKKQQQAARERIVSLKKTHLKLKTFVDDIHLKLSSHLVKEYQYVALGKINVAQLVKTDRPKPLSKRAKRDLLYWQHYRFRQRLTHRTTNTECILDVQNEAYTSKTCGVCGTINKNLEKSETFYCDQCKYNTHRDVNGARNILLKSLRMFPFEKQQQ'
experimental_data = read_experimental_data(base_path, round_file_name, fanzor_sequence)
print(experimental_data)

  Variant   fitness updated_variant
0     27T  0.579502            E27T
1     20M  0.804204            K20M
2     22V  0.867465            M22V
3     46E  0.790334            Q46E
4     58Q  0.812868            T58Q
5     69M  0.434715            V69M
6     93A  0.976318            N93A
7    109N  0.428318           F109N
8     78T  0.568579            K78T
9      WT  1.000000              WT


In [60]:
# create_dataframes enumerates a list with one dataframe per round, so the
# single round-1 table is wrapped in a list here (as the T7 cells do via
# df_list). Passing the bare dataframe makes the function iterate over its
# column names and index it by position instead.
iterations_one, labels_one = create_dataframes([experimental_data], embeddings.index)

# Inputs for top_layer
iteration_old = iterations_one
embeddings_pd = embeddings
labels_pd = labels_one
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16  # not used in this cell
final_round = 16

# Train on every iteration present in the measured data and predict the rows
# with iteration 1001, the value create_dataframes gives to variants that have
# no measurement. top_n and final_round are passed through but top_layer's
# body does not read them.
df_test, df_all = top_layer(
    iter_train=iteration_old['iteration'].unique().tolist(),
    iter_test=1001,
    embeddings_pd=embeddings_pd,
    labels_pd=labels_pd,
    measured_var=measured_var,
    regression_type=regression_type,
    top_n=None,
    final_round=final_round
)

In [61]:
# Display the test-set predictions; top_layer returns them sorted by y_pred, highest first
df_test

Unnamed: 0,variant,y_pred,y_actual
8723,E214D,0.848486,
766,G70C,0.846307,
2207,N131H,0.845992,
6029,H238F,0.844248,
2813,F223H,0.839626,
...,...,...,...
4577,L47R,0.628901,
4705,W400T,0.628636,
7806,I242S,0.627154,
3099,E130A,0.607574,


In [64]:
# Write the ranked fanzor round-1 predictions to CSV; index=False omits the row labels.
# The path is relative, so it resolves against the current working directory.
df_test.to_csv('fanzor/fanzor_round1_predictions.csv', index=False)