In [1]:
import pandas as pd

import epam.simulation as simulation
from epam.sequences import translate_sequences
from epam.simulation import mimic_aa_mutations
from epam.toy_dnsm import train_model

In [2]:
# Toy inputs: a handful of short amino-acid strings, each paired (by position)
# with the number of substitutions requested for it.
aa_parents = pd.Series(["YYAYYYY", "YFY", "CYTFD", "KAY"])
aa_sub_counts = pd.Series([3, 2, 4, 1])

# Leave the call as the last expression so the simulated children are displayed.
mimic_aa_mutations(simulation.hydrophobic_neighbor_mutator, aa_parents, aa_sub_counts)

0    YPAYLHY
1        STY
2      WYSLQ
3        KAD
dtype: object

In [3]:
# Load the parent-child pair (PCP) table; the first CSV column becomes the index.
pcp_df = pd.read_csv("~/data/wyatt-10x-1p5m_pcp_2023-10-07.csv", index_col=0)

# Translate the `parent` and `child` sequence columns into amino-acid columns.
pcp_df["aa_parent"] = translate_sequences(pcp_df["parent"])
pcp_df["aa_child"] = translate_sequences(pcp_df["child"])

# Drop pairs whose translated sequences are identical, then renumber the rows
# from zero so later frames built from these columns share a clean index.
pcp_df = pcp_df.loc[pcp_df["aa_parent"].ne(pcp_df["aa_child"])].reset_index(drop=True)

# Per-row count of positions at which parent and child residues differ.
pcp_df["aa_sub_count"] = pcp_df.apply(
    lambda pair: sum(
        parent_aa != child_aa
        for parent_aa, child_aa in zip(pair["aa_parent"], pair["aa_child"])
    ),
    axis=1,
)

In [4]:
# Build a simulated data set: every observed parent is paired with a child
# produced by `mimic_aa_mutations`, which is given the substitution count of
# the corresponding real parent-child pair.
fake_df = pd.DataFrame(
    {
        "aa_parent": pcp_df["aa_parent"],
        "aa_child": mimic_aa_mutations(
            simulation.hydrophobic_neighbor_mutator,
            pcp_df["aa_parent"],
            pcp_df["aa_sub_count"],
        ),
    }
)

# Count substitutions on the *simulated* pairs. This must iterate over
# `fake_df`; counting over `pcp_df` would merely reproduce the real counts and
# make the check below compare a column with itself.
fake_df["aa_sub_count"] = fake_df.apply(
    lambda row: sum(p != c for p, c in zip(row["aa_parent"], row["aa_child"])),
    axis=1,
)

# Sanity check: each simulated child differs from its parent at exactly as
# many positions as the real child did.
assert (fake_df["aa_sub_count"] == pcp_df["aa_sub_count"]).all(), (
    "simulated substitution counts do not match the observed counts"
)

In [5]:
# Display the simulated parent/child frame (pandas abbreviates long frames,
# showing only the leading and trailing rows).
fake_df

Unnamed: 0,aa_parent,aa_child,aa_sub_count
0,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSSGMHWVRQAPGKGLE...,QVQLVESGGGVVQPGRSLRLSCAARGFTFSSSGMHWVRQAPGKGLE...,2
1,QVQLVESGGGVVQPGRSLRLSCAASGFTFNSSGMHWVRQAPGKGLE...,QVQLVESGGGVVQPGRSLRLSCAASGFTFNSSGMHVVRQAPGKGLE...,3
2,QVQLVESGGGVVQPGRSLRLSCAASGFTFDSSGMHWVRQAPGKGLE...,QVQRVESGGGVVQPGRSLRLSCAASGFTFDSSGMHRVRQAPGKGLE...,7
3,QVQLVESGGGVVQPGRSLRLSCATSGFNFDTSGMHWVRQAPGKGLE...,QVQLVESGGGVVQPGRMLRLSCATSGFNFDTSMMHWVRTAPGKGLE...,9
4,QVQLVESGGGVVQPGRSLRLSCATSGFNFDTSGMHWVRQAPGKGLE...,QVQLVESGGGVVQPGRSLRLSCATSGFNFDTSGMHWVLQAPGKGLE...,4
...,...,...,...
55731,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,QLQLQESGPGLVKPSETLYLTCTVSGGSISSSSYMWGWIRQPPGKG...,8
55732,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,1
55733,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,3
55734,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSFAMSWVRQAPGKGLE...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSFASSWVRQAPGKGLE...,2


In [6]:
def y_count(seq):
    """Return how many elements of ``seq`` are equal to the character ``"Y"``."""
    return sum(1 for residue in seq if residue == "Y")

# Number of "Y" residues in each parent and in each simulated child.
fake_df["y_parent_count"] = fake_df["aa_parent"].apply(y_count)
fake_df["y_child_count"] = fake_df["aa_child"].apply(y_count)

# Net change in "Y" count per pair (child minus parent); negative means Ys were lost.
fake_df["y_diff"] = fake_df["y_child_count"].sub(fake_df["y_parent_count"])

# Tabulate how often each net change occurs.
fake_df["y_diff"].value_counts()

y_diff
 0    33326
-1    11434
 1     6150
-2     3002
 2      790
-3      700
-4      164
 3      121
-5       29
 4       13
-6        7
Name: count, dtype: int64

In [7]:
!rm _logs/*

nhead = 1
dim_feedforward = 16
layer_count = 1
model = train_model(fake_df, nhead=nhead, dim_feedforward=dim_feedforward, layer_count=layer_count, batch_size=32, num_epochs=2, learning_rate=0.001, checkpoint_dir="./_checkpoints", log_dir="./_logs")

preparing data...
Using Metal Performance Shaders




Epoch [0/2], Training Loss: 1.877058620859937, Validation Loss: 1.8647038502816142
training model...
Epoch [1/2], Training Loss: 0.16272467374801636, Validation Loss: 0.18125704404788578
Epoch [2/2], Training Loss: 0.21040047705173492, Validation Loss: 0.18125464423508902
