In [1]:
import pandas as pd

from epam.sequences import translate_sequences
from epam.simulation import mimic_aa_mutations, tyrosine_mutator
from epam.toy_dnsm import train_model

In [2]:
# Toy demonstration: each (parent sequence, substitution count) pair is fed to
# mimic_aa_mutations together with the tyrosine_mutator from epam.simulation.
toy_examples = [("YYAYYYY", 3), ("YFY", 2), ("CYTFD", 4), ("KAY", 1)]
aa_parents = pd.Series([parent for parent, _ in toy_examples])
aa_sub_counts = pd.Series([sub_count for _, sub_count in toy_examples])

# Last expression of the cell, so the simulated children are displayed.
mimic_aa_mutations(tyrosine_mutator, aa_parents, aa_sub_counts)

0    WYALMYY
1        DFR
2      HQTIT
3        KAT
dtype: object

In [3]:
# Load the parent/child pair table; the first CSV column becomes the index.
pcp_df = pd.read_csv("~/data/wyatt-10x-1p5m_pcp_2023-10-07.csv", index_col=0)

# Translate both sequence columns (presumably nucleotide -> amino acid; see
# epam.sequences.translate_sequences to confirm).
pcp_df["aa_parent"] = translate_sequences(pcp_df["parent"])
pcp_df["aa_child"] = translate_sequences(pcp_df["child"])

# Keep only the pairs whose translated sequences differ, then renumber rows.
has_aa_change = pcp_df["aa_parent"] != pcp_df["aa_child"]
pcp_df = pcp_df.loc[has_aa_change].reset_index(drop=True)

# Per-row count of positions where the parent and child residues differ.
pcp_df["aa_sub_count"] = pcp_df.apply(
    lambda pair: sum(
        parent_aa != child_aa
        for parent_aa, child_aa in zip(pair["aa_parent"], pair["aa_child"])
    ),
    axis=1,
)

In [4]:
# Build a simulated data set: keep the observed parents, and replace each child
# with the output of mimic_aa_mutations driven by tyrosine_mutator, requesting
# the same number of substitutions as the observed pair.
fake_df = pd.DataFrame(
    {
        "aa_parent": pcp_df["aa_parent"],
        "aa_child": mimic_aa_mutations(
            tyrosine_mutator, pcp_df["aa_parent"], pcp_df["aa_sub_count"]
        ),
    }
)

# Count substitutions in the *simulated* pairs. This must be computed from
# fake_df: computing it from pcp_df would just copy the observed counts and
# make the sanity check below trivially true.
fake_df["aa_sub_count"] = fake_df.apply(
    lambda row: sum(p != c for p, c in zip(row["aa_parent"], row["aa_child"])),
    axis=1,
)

# Sanity check: the simulation introduced exactly the requested number of
# substitutions in every row.
assert (
    fake_df["aa_sub_count"] == pcp_df["aa_sub_count"]
).all(), "simulated substitution counts differ from the requested counts"

In [5]:
# Display the simulated frame; pandas abbreviates it to the leading and trailing rows.
fake_df

Unnamed: 0,aa_parent,aa_child,aa_sub_count
0,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSSGMHWVRQAPGKGLE...,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSSGMHWVRQAPGKGLE...,2
1,QVQLVESGGGVVQPGRSLRLSCAASGFTFNSSGMHWVRQAPGKGLE...,QVQLVESGGGVVQPGRSLRLSCAASGFTFNSSGMHWVRQAPGKGLE...,3
2,QVQLVESGGGVVQPGRSLRLSCAASGFTFDSSGMHWVRQAPGKGLE...,QVQLVESGGGVVQPGRSLRLSCAASGFTFDSSGMHWVRQAPGKGLE...,7
3,QVQLVESGGGVVQPGRSLRLSCATSGFNFDTSGMHWVRQAPGKGLE...,QVQLVESGGGVVQPGRSLRLSCATSGFNFDTSGMHWVRQAPGKGLE...,9
4,QVQLVESGGGVVQPGRSLRLSCATSGFNFDTSGMHWVRQAPGKGLE...,QVQLVESGGGVVQPGRSLRLSCATSGFNFDTSGMHWVRQAPGKGLE...,4
...,...,...,...
55731,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,QLQLQESGPGLVKPSETLSLTCLVSGGSISSSSELWGWIRQPPGKG...,8
55732,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,1
55733,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSCYWGWIRQPPGKG...,3
55734,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSFAMSWVRQAPGKGLE...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSFAMSWVRQAPGKGLE...,2


In [6]:
def y_count(seq):
    """Return how many elements of ``seq`` are equal to ``"Y"``.

    ``seq`` is iterated element by element, so for a string this is the
    number of ``Y`` characters it contains.
    """
    return sum(1 for residue in seq if residue == "Y")

# Count "Y" residues on each side of every simulated pair.
fake_df["y_parent_count"] = fake_df["aa_parent"].map(y_count)
fake_df["y_child_count"] = fake_df["aa_child"].map(y_count)

# Net change in "Y" count from parent to child (negative means "Y" was lost).
fake_df["y_diff"] = fake_df["y_child_count"] - fake_df["y_parent_count"]

# Last expression of the cell: frequency table of the net changes.
fake_df["y_diff"].value_counts()

y_diff
-1     11805
-2      8426
-3      6345
-5      5794
-6      5773
-4      5663
-7      4969
-8      3409
-9      1978
-10      933
-11      424
-12      147
-13       49
-14       17
-15        4
Name: count, dtype: int64

In [7]:
# Hyperparameters forwarded to train_model under the same keyword names.
nhead = 4
dim_feedforward = 2048
layer_count = 3

# Train on the simulated pairs; checkpoints and logs are written to the given
# local directories.
model = train_model(
    fake_df,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    layer_count=layer_count,
    batch_size=32,
    num_epochs=10,
    learning_rate=1e-5,
    checkpoint_dir="./_checkpoints",
    log_dir="./_logs",
)

preparing data...
Using Metal Performance Shaders
training model...
2064	predictions out of range.
1932	predictions out of range.
1846	predictions out of range.
1706	predictions out of range.
1686	predictions out of range.
1518	predictions out of range.
1378	predictions out of range.
1364	predictions out of range.
1277	predictions out of range.
1224	predictions out of range.
1055	predictions out of range.
995	predictions out of range.
969	predictions out of range.
948	predictions out of range.
821	predictions out of range.
773	predictions out of range.
676	predictions out of range.
596	predictions out of range.
566	predictions out of range.
509	predictions out of range.
449	predictions out of range.
457	predictions out of range.
435	predictions out of range.
307	predictions out of range.
308	predictions out of range.
266	predictions out of range.
227	predictions out of range.
221	predictions out of range.
202	predictions out of range.
218	predictions out of range.
151	predictions out o