In [2]:
import numpy as np
import pandas as pd

# Load the training table using the first CSV column as the index, and
# discard the "data_source" column.
train_df = pd.read_csv("data/train.csv", index_col=0).drop(columns=["data_source"])
train_df

Unnamed: 0_level_0,protein_sequence,pH,tm
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,49.5
...,...,...,...
31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8
31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2
31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6
31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7


Clean up the data by replacing every row listed in the new CSV with its updated version and then dropping all rows that contain NaNs. This should fix both the swapped values and the NaNs.

In [3]:
# Read the update table (indexed by seq_id) and discard its "data_source"
# column, mirroring what was done for the training table.
updates_df = (
    pd.read_csv("data/train_updates_20220929.csv", index_col="seq_id")
    .drop(columns=["data_source"])
)

# Overwrite every training row whose seq_id appears in the update table with
# the values from that table.
train_df.loc[updates_df.index, :] = updates_df

# Remove every row that still has a missing value in any column.
train_df = train_df.dropna(axis="index", how="any")
train_df

Unnamed: 0_level_0,protein_sequence,pH,tm
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,49.5
...,...,...,...
31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8
31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2
31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6
31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7


Write the sequences to a FASTA file so that the embeddings can be calculated from it.

In [13]:
# Write one FASTA record per row of train_df: a ">" header line carrying the
# row's index label, followed by the protein sequence on the next line.
#
# The file is opened with "w" (truncate) rather than "a" (append) so that
# re-running this cell regenerates the file instead of appending a second copy
# of every record to the output of the previous run.
with open("data/enzymes.fa", "w") as fasta_file:
    # Series.items() yields (index label, value) pairs directly, which avoids
    # building a one-element row Series per record via iterrows().
    for seq_id, sequence in train_df["protein_sequence"].items():
        fasta_file.write(f">{seq_id}\n{sequence}\n")

Calculate the embeddings separately and load them.

In [10]:
import torch
import os

# Keep only the "pH" and "tm" columns; every other column is dropped here.
train_df = train_df[["pH", "tm"]]

emb_path = os.path.join("data", "t5_embeddings", "train_enzymes.t5.pt")

# The loaded object is iterated with .items(); each key is converted to int
# and each value to a NumPy array, so the file is expected to hold a mapping
# from sequence id to tensor.
#
# map_location="cpu" makes loading independent of the device the tensors were
# saved from: Tensor.numpy() only works on CPU tensors, and a file saved from
# a GPU cannot otherwise be loaded on a machine without one.
#
# NOTE(review): torch.load unpickles the file, so only load embedding files
# that come from a trusted source.
d = {
    int(seq_id): emb.numpy()
    for seq_id, emb in torch.load(emb_path, map_location="cpu").items()
}
len(d)

31390

In [11]:
# Turn the {sequence id: embedding array} dict into a table with one row per
# dict key and one column per embedding component.
emb_df = pd.DataFrame.from_dict(data=d, orient="index")
emb_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.058197,0.013725,0.060089,-0.008591,-0.025604,0.072388,-0.040985,-0.120056,0.013443,0.010544,...,-0.057983,-0.022919,0.000931,-0.069519,-0.016968,-0.010460,-0.031204,0.039062,0.036652,0.019562
1,-0.029129,-0.022873,0.021851,-0.035400,0.017914,0.056519,0.009972,-0.134644,0.056183,0.056885,...,-0.015068,-0.048859,0.015366,-0.012978,-0.075684,0.023300,-0.073975,0.011299,0.045593,0.031433
2,0.035522,0.011467,-0.015930,0.033844,-0.039795,0.060669,-0.033386,-0.023819,-0.009575,0.021454,...,-0.023636,-0.034241,-0.013916,0.000024,0.011055,-0.028564,-0.066956,-0.006252,0.002590,0.016113
3,0.042328,-0.000993,0.025223,-0.005268,-0.006237,0.062805,-0.022552,-0.085205,0.097473,0.041992,...,-0.046600,-0.027893,0.036316,0.015450,-0.012398,-0.074097,-0.086182,-0.027618,0.015182,0.087891
4,0.016861,0.067078,0.015793,0.023529,-0.002163,0.002775,-0.026962,-0.044525,0.082581,-0.003222,...,-0.026016,0.026428,-0.000928,-0.020126,0.052490,0.013245,-0.003328,0.034851,0.015327,-0.013245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31385,0.025406,0.078674,0.085388,-0.010124,-0.017944,0.065491,-0.046265,-0.036346,0.003952,0.004055,...,-0.043274,-0.031143,-0.029404,0.046265,0.004345,0.017761,0.023666,0.020523,-0.051331,-0.018768
31386,0.052338,0.065918,0.070801,-0.000024,0.016602,0.046417,-0.099060,-0.061920,0.031097,-0.001596,...,-0.060364,-0.023605,0.034241,-0.013603,0.033508,-0.020630,0.024857,-0.038483,0.002399,-0.003370
31387,-0.082031,-0.016617,-0.030365,-0.025482,-0.054138,0.101318,-0.093750,-0.053711,0.019424,-0.024612,...,-0.026367,-0.028366,-0.002083,-0.140991,0.030975,-0.057709,-0.013451,-0.111938,0.017136,0.070618
31388,0.043152,0.063110,0.031464,0.051910,-0.034698,0.037262,-0.032471,-0.079590,0.039185,-0.051056,...,-0.041565,-0.033203,-0.020233,-0.020645,0.050812,-0.020386,0.002647,-0.018845,0.029068,0.001176


In [None]:
# Attach the embedding columns to the training rows by index label. This is a
# left join: every row of train_df is kept, and rows of emb_df whose label is
# not present in train_df are ignored.
train_df = train_df.join(emb_df, how="left")
train_df

Unnamed: 0_level_0,pH,tm,0,1,2,3,4,5,6,7,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,7.0,75.7,0.058197,0.013725,0.060089,-0.008591,-0.025604,0.072388,-0.040985,-0.120056,...,-0.057983,-0.022919,0.000931,-0.069519,-0.016968,-0.010460,-0.031204,0.039062,0.036652,0.019562
1,7.0,50.5,-0.029129,-0.022873,0.021851,-0.035400,0.017914,0.056519,0.009972,-0.134644,...,-0.015068,-0.048859,0.015366,-0.012978,-0.075684,0.023300,-0.073975,0.011299,0.045593,0.031433
2,7.0,40.5,0.035522,0.011467,-0.015930,0.033844,-0.039795,0.060669,-0.033386,-0.023819,...,-0.023636,-0.034241,-0.013916,0.000024,0.011055,-0.028564,-0.066956,-0.006252,0.002590,0.016113
3,7.0,47.2,0.042328,-0.000993,0.025223,-0.005268,-0.006237,0.062805,-0.022552,-0.085205,...,-0.046600,-0.027893,0.036316,0.015450,-0.012398,-0.074097,-0.086182,-0.027618,0.015182,0.087891
4,7.0,49.5,0.016861,0.067078,0.015793,0.023529,-0.002163,0.002775,-0.026962,-0.044525,...,-0.026016,0.026428,-0.000928,-0.020126,0.052490,0.013245,-0.003328,0.034851,0.015327,-0.013245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31385,7.0,51.8,0.025406,0.078674,0.085388,-0.010124,-0.017944,0.065491,-0.046265,-0.036346,...,-0.043274,-0.031143,-0.029404,0.046265,0.004345,0.017761,0.023666,0.020523,-0.051331,-0.018768
31386,7.0,37.2,0.052338,0.065918,0.070801,-0.000024,0.016602,0.046417,-0.099060,-0.061920,...,-0.060364,-0.023605,0.034241,-0.013603,0.033508,-0.020630,0.024857,-0.038483,0.002399,-0.003370
31387,7.0,64.6,-0.082031,-0.016617,-0.030365,-0.025482,-0.054138,0.101318,-0.093750,-0.053711,...,-0.026367,-0.028366,-0.002083,-0.140991,0.030975,-0.057709,-0.013451,-0.111938,0.017136,0.070618
31388,7.0,50.7,0.043152,0.063110,0.031464,0.051910,-0.034698,0.037262,-0.032471,-0.079590,...,-0.041565,-0.033203,-0.020233,-0.020645,0.050812,-0.020386,0.002647,-0.018845,0.029068,0.001176


In [20]:
# Rotate the column order so that the first two columns end up last and all
# remaining columns move to the front.
column_names = train_df.columns.tolist()
first_two, remainder = column_names[:2], column_names[2:]
train_df = train_df[remainder + first_two]
train_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1016,1017,1018,1019,1020,1021,1022,1023,pH,tm
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.058197,0.013725,0.060089,-0.008591,-0.025604,0.072388,-0.040985,-0.120056,0.013443,0.010544,...,0.000931,-0.069519,-0.016968,-0.010460,-0.031204,0.039062,0.036652,0.019562,7.0,75.7
1,-0.029129,-0.022873,0.021851,-0.035400,0.017914,0.056519,0.009972,-0.134644,0.056183,0.056885,...,0.015366,-0.012978,-0.075684,0.023300,-0.073975,0.011299,0.045593,0.031433,7.0,50.5
2,0.035522,0.011467,-0.015930,0.033844,-0.039795,0.060669,-0.033386,-0.023819,-0.009575,0.021454,...,-0.013916,0.000024,0.011055,-0.028564,-0.066956,-0.006252,0.002590,0.016113,7.0,40.5
3,0.042328,-0.000993,0.025223,-0.005268,-0.006237,0.062805,-0.022552,-0.085205,0.097473,0.041992,...,0.036316,0.015450,-0.012398,-0.074097,-0.086182,-0.027618,0.015182,0.087891,7.0,47.2
4,0.016861,0.067078,0.015793,0.023529,-0.002163,0.002775,-0.026962,-0.044525,0.082581,-0.003222,...,-0.000928,-0.020126,0.052490,0.013245,-0.003328,0.034851,0.015327,-0.013245,7.0,49.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31385,0.025406,0.078674,0.085388,-0.010124,-0.017944,0.065491,-0.046265,-0.036346,0.003952,0.004055,...,-0.029404,0.046265,0.004345,0.017761,0.023666,0.020523,-0.051331,-0.018768,7.0,51.8
31386,0.052338,0.065918,0.070801,-0.000024,0.016602,0.046417,-0.099060,-0.061920,0.031097,-0.001596,...,0.034241,-0.013603,0.033508,-0.020630,0.024857,-0.038483,0.002399,-0.003370,7.0,37.2
31387,-0.082031,-0.016617,-0.030365,-0.025482,-0.054138,0.101318,-0.093750,-0.053711,0.019424,-0.024612,...,-0.002083,-0.140991,0.030975,-0.057709,-0.013451,-0.111938,0.017136,0.070618,7.0,64.6
31388,0.043152,0.063110,0.031464,0.051910,-0.034698,0.037262,-0.032471,-0.079590,0.039185,-0.051056,...,-0.020233,-0.020645,0.050812,-0.020386,0.002647,-0.018845,0.029068,0.001176,7.0,50.7


Write the data with embeddings to a file.

In [21]:
# Persist the combined table as CSV; the index is written as the first column.
train_df.to_csv("data/train_embs.csv", index=True)