In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Make the project's local `code` directory importable from this notebook.
import sys
import os

# `os.path.abspath('')` resolves to the current working directory.
project_code_dir = os.path.join(os.path.abspath(''), 'code')
sys.path.append(project_code_dir)

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [4]:
from config import load_config

In [62]:
# Read the run settings first; the dataset location is taken from the config itself.
config = load_config("config_hammer.yaml")
dataset_csv_path = config["dataset_path"]
sequences_df = pd.read_csv(dataset_csv_path)

Loaded configuration:
  root_path: /home/labs/fleishman/omripo/Projects/adaptive_learning
  data_dir_name: data
  pretraining_dir_name: pretraining
  bootstrap_dir_name: bootstrap
  results_dir_name: results
  enzyme: NNMT
  substrate: malathion
  dataset_filename: Hammer.csv
  tag: dataset_perparation
  weights_filename: final_model.pt
  model_type: plm
  finetune: False
  opmode: mean
  train_type: msa_backbone
  nmuts_column: num_muts
  sequence_column_name: full_seq
  activity_column_name: inactive
  first_column_name: 
  last_column_name: 
  plm_name: esm2_t12_35M_UR50D
  ref_seq: MESGFTSKDTYLSHFNPRDYLEKYYKFGSRHSAESQILKHLLKNLFKIFCLDGVKGDLLIDIGSGPTIYQLLSACESFKEIVVTDYSDQNLQELEKWLKKEPEAFDWSPVVTYVCDLEGNRVKGPEKEEKLRQAVKQVLKCDVTQSQPLGAVPLPPADCVLSTLCLDAACPDLPTYCRALRNLGSLLKPGGFLVIMDALKSSYYMIGEQKFSSLPLGREAVEAAVKEAGYTIEWFEVISQSYSSTMANNEGLFSLVARKLSRPLLEHHHHHH
  bootstrap_indices_prefix: indices
  train_indices: all
  train_fraction: 0.8
  test_indices: [6]
  pos_to_use: [72, 98, 220, 223, 23

In [63]:
# Residue columns in the dataset are named by position, "1" through "272".
# Five of those position columns are skipped.
# NOTE(review): the reason these five are excluded is not recorded here — confirm.
excluded_positions = {'18', '32', '165', '195', '240'}
positions = [str(i) for i in range(1, 273) if str(i) not in excluded_positions]


def _rows_with_design_prefix(df, prefix):
    """Return the rows of `df` whose "design" label starts with `prefix`.

    `na=False` makes a missing design label count as "no match"; without it the
    mask would contain NaN and boolean indexing would raise.
    """
    return df[df["design"].str.startswith(prefix, na=False)]


def _varying_positions(vocab):
    """Return the labels of `vocab` whose count of distinct values exceeds one."""
    return vocab[vocab > 1].index.tolist()


# Number of distinct residues per position across the whole dataset.
vocab_df = sequences_df[positions].nunique()

# One frame per experiment, selected by the design-name prefix.
first_experiment_sequences_df = _rows_with_design_prefix(sequences_df, "FL1")
second_experiment_sequences_df = _rows_with_design_prefix(sequences_df, "FL2")
third_experiment_sequences_df = _rows_with_design_prefix(sequences_df, "FL3")

# Number of distinct residues per position within each experiment.
first_experiment_vocab = first_experiment_sequences_df[positions].nunique()
second_experiment_vocab = second_experiment_sequences_df[positions].nunique()
third_experiment_vocab = third_experiment_sequences_df[positions].nunique()

# A position counts as "designed" when more than one residue is observed there.
first_experiment_designed_positions = _varying_positions(first_experiment_vocab)
second_experiment_designed_positions = _varying_positions(second_experiment_vocab)
third_experiment_designed_positions = _varying_positions(third_experiment_vocab)

designed_positions = _varying_positions(vocab_df)


In [55]:
# Reference row: the wild-type design, taken from the unfiltered dataset.
wt_row_raw = sequences_df[sequences_df["design"] == "WT"].iloc[0]


def _constantly_mutated_positions(experiment_df, wt_row, candidate_positions):
    """Return the positions where every row of `experiment_df` differs from `wt_row`.

    Parameters
    ----------
    experiment_df : DataFrame with one column per entry of `candidate_positions`.
    wt_row : Series indexed by (at least) `candidate_positions`.
    candidate_positions : list of column labels to check.

    Returns
    -------
    list of column labels; empty when `experiment_df` has no rows.
    """
    if experiment_df.empty:
        # `.all()` over zero rows is vacuously True for every column, which
        # would report every candidate position as constantly mutated.
        return []
    differs_everywhere = experiment_df[candidate_positions].ne(wt_row[candidate_positions]).all()
    return differs_everywhere[differs_everywhere].index.tolist()


# find positions which are constantly different from WT in each experiment
first_experiment_constantly_mutated_positions = _constantly_mutated_positions(
    first_experiment_sequences_df, wt_row_raw, designed_positions
)
second_experiment_constantly_mutated_positions = _constantly_mutated_positions(
    second_experiment_sequences_df, wt_row_raw, designed_positions
)
third_experiment_constantly_mutated_positions = _constantly_mutated_positions(
    third_experiment_sequences_df, wt_row_raw, designed_positions
)

In [66]:
# Positions that are designed or constantly mutated in the first experiment,
# ordered numerically (the labels are strings, so a plain sort would be lexical).
sorted(
    first_experiment_designed_positions + first_experiment_constantly_mutated_positions,
    key=int,
)

['20', '24', '167', '197', '198', '201', '204', '213', '242', '247', '249']

In [69]:
# Show which residues were observed at each first-experiment position,
# walking the positions in numeric order.
first_experiment_positions_sorted = sorted(
    first_experiment_designed_positions + first_experiment_constantly_mutated_positions,
    key=int,
)
for position in first_experiment_positions_sorted:
    observed_residues = first_experiment_sequences_df[position].unique().tolist()
    print(f"Position {position}: {observed_residues}")

Position 20: ['H', 'Y', 'F']
Position 24: ['Y', 'F']
Position 167: ['C', 'H', 'D', 'E']
Position 197: ['D', 'G']
Position 198: ['M', 'S', 'T', 'A', 'L']
Position 201: ['A', 'C', 'R', 'Q', 'N', 'T', 'E', 'S', 'M']
Position 204: ['Y', 'F']
Position 213: ['A', 'S', 'C', 'H', 'M', 'T']
Position 242: ['Y', 'W', 'F']
Position 247: ['A', 'M', 'S', 'C']
Position 249: ['N', 'C', 'S', 'A']


In [79]:
# Header shared by every resfile built in this notebook.
# NOTE(review): the layout looks like a Rosetta resfile (chain "A", PIKAA =
# allowed residues) — confirm against the tool that consumes it.
resfile_template = "nataa\nstart\n"

# A position can be in both lists (it varies within the experiment AND never
# matches WT). Take the union so each position yields exactly one resfile line.
third_experiment_resfile_positions = sorted(
    set(third_experiment_designed_positions) | set(third_experiment_constantly_mutated_positions),
    key=int,
)

# One line per position: "<pos> A<TAB>PIKAA<TAB><residues observed at that position>".
resfile = resfile_template + "".join(
    f"{pos} A\tPIKAA\t{''.join(third_experiment_sequences_df[pos].unique().tolist())}\n"
    for pos in third_experiment_resfile_positions
)

In [80]:
# save resfile to disk
# The path is assembled with os.path.join rather than an f-string that nests
# double quotes, which is a SyntaxError on Python versions before 3.12.
third_experiment_resfile_path = os.path.join(
    config["root_path"],
    config["data_dir_name"],
    config["enzyme"],
    "resfile_third_experiment.txt",
)
with open(third_experiment_resfile_path, "w") as f:
    f.write(resfile)

In [81]:
# Build a second resfile that covers every designed position, using the
# residues observed at that position anywhere in the full dataset.
all_position_lines = [
    f"{pos} A\tPIKAA\t{''.join(sequences_df[pos].unique().tolist())}\n"
    for pos in designed_positions
]
resfile = resfile_template + "".join(all_position_lines)

In [83]:
# Write the all-positions resfile next to the enzyme's other data files.
# The path is assembled with os.path.join rather than an f-string that nests
# double quotes, which is a SyntaxError on Python versions before 3.12.
all_positions_resfile_path = os.path.join(
    config["root_path"],
    config["data_dir_name"],
    config["enzyme"],
    "resfile_all_designed_positions.txt",
)
with open(all_positions_resfile_path, "w") as f:
    f.write(resfile)

In [None]:
# select only the designed positions + design,fold_improvement,p1,p2,full_seq columns
# `.copy()` gives an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning for writing into a slice of sequences_df.
designed_sequences_df = sequences_df[
    ["design", "fold_improvement", "p1", "p2"] + designed_positions + ["full_seq"]
].copy()
wt_row = designed_sequences_df[designed_sequences_df["design"] == "WT"].iloc[0]
# Take the WT sequence from the WT row itself instead of assuming it sits at index 0.
wt_seq = wt_row["full_seq"]
# add n_muts column: number of designed positions at which a row differs from WT
designed_sequences_df["n_muts"] = designed_sequences_df[designed_positions].ne(wt_row[designed_positions]).sum(axis=1)
# NOTE(review): "0_271" presumably describes a 0-based span of the sequence —
# confirm with whatever consumes the pad_regions column.
designed_sequences_df["pad_regions"] = "0_271"
# rename designed positions according to their AA in the WT full_seq
# (position labels are 1-based, hence the -1 when indexing the string)
renamed_columns = {pos: f"{wt_seq[int(pos)-1]}{pos}" for pos in designed_positions}
designed_sequences_df = designed_sequences_df.rename(columns=renamed_columns)

1      1
2      1
3      1
4      1
5      1
      ..
268    1
269    1
270    1
271    1
272    1
Length: 267, dtype: int64

In [7]:
# Display the assembled table as this cell's output.
designed_sequences_df

Unnamed: 0,design,fold_improvement,p1,p2,Y20,Y24,E34,I37,D167,Y176,...,S213,Y242,A247,N248,N249,L252,F253,full_seq,n_muts,pad_regions
0,WT,1.0,55.1,44.9,Y,Y,E,I,D,Y,...,S,Y,A,N,N,L,F,MESGFTSKDTYLSHFNPRDYLEKYYKFGSRHSAESQILKHLLKNLF...,0,0_271
1,FL1_01,0.1,,,H,Y,E,I,C,Y,...,A,Y,A,N,N,L,F,MESGFTSKDTYLSHFNPRDHLEKYYKFGSRHSAESQILKHLLKNLF...,5,0_271
2,FL1_02,21.2,60.6,39.4,Y,Y,E,I,H,Y,...,A,Y,A,N,C,L,F,MESGFTSKDTYLSHFNPRDYLEKYYKFGSRHSAESQILKHLLKNLF...,5,0_271
3,FL1_03,5.0,31.9,68.1,Y,Y,E,I,H,Y,...,S,W,A,N,S,L,F,MESGFTSKDTYLSHFNPRDYLEKYYKFGSRHSAESQILKHLLKNLF...,5,0_271
4,FL1_04,3.7,35.7,64.3,F,Y,E,I,H,Y,...,C,Y,A,N,N,L,F,MESGFTSKDTYLSHFNPRDFLEKYYKFGSRHSAESQILKHLLKNLF...,5,0_271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,FL3_H157,335.9,79.8,20.2,Y,Y,E,W,H,F,...,C,H,C,N,A,M,Y,MESGFTSKDTYLSHFNPRDYLEKYYKFGSRHSAESQWLKHLLKNLF...,11,0_271
234,FL3_H158,384.7,8.1,91.9,Y,Y,E,W,H,F,...,T,M,A,N,A,H,Y,MESGFTSKDTYLSHFNPRDYLEKYYKFGSRHSAESQWLKHLLKNLF...,10,0_271
235,FL3_H159,147.6,62.1,37.9,Y,Y,E,W,H,F,...,S,H,A,N,T,L,F,MESGFTSKDTYLSHFNPRDYLEKYYKFGSRHSAESQWLKHLLKNLF...,6,0_271
236,FL3_H161,1196.9,43.6,56.4,Y,Y,E,W,H,F,...,V,F,A,N,A,Y,Y,MESGFTSKDTYLSHFNPRDYLEKYYKFGSRHSAESQWLKHLLKNLF...,10,0_271


In [8]:
# The WT row is kept as a one-row DataFrame so it can be concatenated directly.
wt_row = designed_sequences_df[designed_sequences_df["design"] == "WT"]


def _experiment_with_wt(all_designs_df, wt_df, design_prefix):
    """Return `wt_df` followed by the rows of `all_designs_df` matching `design_prefix`.

    Rows are matched on their "design" label starting with `design_prefix`.
    `na=False` treats a missing label as a non-match instead of producing a NaN
    mask that cannot be used for indexing. The result gets a fresh 0..n-1 index.
    """
    experiment_rows = all_designs_df[
        all_designs_df["design"].str.startswith(design_prefix, na=False)
    ]
    return pd.concat([wt_df, experiment_rows], ignore_index=True)


# add WT row to each df
first_experiment_df = _experiment_with_wt(designed_sequences_df, wt_row, "FL1")
second_experiment_df = _experiment_with_wt(designed_sequences_df, wt_row, "FL2")
third_experiment_df = _experiment_with_wt(designed_sequences_df, wt_row, "FL3")

In [9]:
# save the full table and the three per-experiment tables to csv files
# The output directory is built with os.path.join; nesting double quotes inside
# a double-quoted f-string only parses on Python 3.12+.
csv_output_dir = os.path.join(config["root_path"], config["data_dir_name"], config["enzyme"])
csv_outputs = {
    "hammer_full.csv": designed_sequences_df,
    "hammer_first_experiment.csv": first_experiment_df,
    "hammer_second_experiment.csv": second_experiment_df,
    "hammer_third_experiment.csv": third_experiment_df,
}
for csv_filename, frame in csv_outputs.items():
    # index=False: the row index is positional only and is not written out.
    frame.to_csv(os.path.join(csv_output_dir, csv_filename), index=False)