# CARP

In [1]:
%cd ~/repo/protein-transfer

/home/t-fli/repo/protein-transfer


In [2]:
%load_ext blackcellmagic

In [3]:
from scr.encoding.encoding_classes import CARPEncoder

In [4]:
from scr.utils import pickle_load
import pandas as pd

In [5]:
# Load the GB1 "two_vs_rest" table. The path is relative, so this relies on the
# working directory having been switched to the repo by the %cd cell at the top.
df = pd.read_csv("data/proeng/gb1/two_vs_rest.csv")

In [6]:
# Element-wise masks over the split columns, computed once and reused below.
in_train_set = df["set"] == "train"
is_validation = df["validation"] == True

# "train" rows are divided by the validation flag; "test" rows form their own subset.
# Rows whose `set` is neither "train" nor "test" end up in none of the three frames.
df_train = df.loc[in_train_set & ~is_validation]
df_val = df.loc[in_train_set & is_validation]
df_test = df.loc[df["set"] == "test"]

# Subset sizes next to the size of the full table.
len(df_train), len(df_val), len(df_test), len(df)

(381, 43, 7817, 8242)

In [7]:

# One single-element list per sequence: the first two validation sequences,
# each cut to its first 56 characters.
seqs = [
    [val_seq]
    for val_seq in df_val["sequence"].astype(str).str[:56].values[:2]
]
seqs

[['MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVNGEWTYDDATKTFTVTE'],
 ['MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGEYGEWTYDDATKTFTVTE']]

In [8]:
# The same two truncated validation sequences, this time as a flat list of strings.
list(
    df_val["sequence"].astype(str).str[:56].values[:2]
)

['MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVNGEWTYDDATKTFTVTE',
 'MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGEYGEWTYDDATKTFTVTE']

In [9]:
# Encode the first two validation sequences (first 56 characters each) with the
# "carp_600k" encoder; no `flatten_emb` argument is passed in this call.
no_flat_encoder = CARPEncoder(encoder_name="carp_600k").encode(
    mut_seqs=list(df_val["sequence"].astype(str).str[:56].values[:2])
)
# The result of `encode` is consumed with next(): take the first item it yields,
# then show that item's first element together with its shape.
one_emb = next(no_flat_encoder)
one_emb[0], one_emb[0].shape

Generating carp_600k upto 16 layer embedding ...
No embedding flattening
No embedding flattening
No embedding flattening
No embedding flattening
No embedding flattening
No embedding flattening
No embedding flattening
No embedding flattening
No embedding flattening
No embedding flattening
No embedding flattening
No embedding flattening
No embedding flattening
No embedding flattening
No embedding flattening
No embedding flattening


(array([[[ 0.1123378 , -0.03096455,  0.09588866, ..., -0.21515828,
          -0.41078544,  0.8417189 ],
         [ 0.20500934, -0.14373404, -0.15035409, ...,  0.13152508,
          -0.129341  ,  0.50128317],
         [ 0.42361322, -0.09426615,  0.04951379, ...,  0.22084078,
          -3.9057856 ,  0.7927066 ],
         ...,
         [ 0.31486925, -0.00696402,  0.12247566, ..., -0.05261485,
          -4.633561  ,  0.5593644 ],
         [ 0.27489305, -0.01615846,  0.14888935, ..., -0.03519905,
           0.3777458 ,  0.5697537 ],
         [ 0.34990996,  0.2566081 ,  0.21758933, ...,  0.10277106,
          -3.0541043 ,  0.3111604 ]],
 
        [[ 0.1123378 , -0.03096455,  0.09588866, ..., -0.21515828,
          -0.41078544,  0.8417189 ],
         [ 0.20500934, -0.14373404, -0.15035409, ...,  0.13152508,
          -0.129341  ,  0.50128317],
         [ 0.42361322, -0.09426615,  0.04951379, ...,  0.22084078,
          -3.9057856 ,  0.7927066 ],
         ...,
         [ 0.31486925, -0.0069640

In [10]:
# Same two sequences and encoder as the unflattened run, but with flatten_emb="mean".
mean_flat_encoder = CARPEncoder(encoder_name="carp_600k").encode(
    mut_seqs=list(df_val["sequence"].astype(str).str[:56].values[:2]),
    flatten_emb="mean",
)
# Take the first yielded item and show only the shape of its first element.
one_mean_emb = next(mean_flat_encoder)
one_mean_emb[0].shape

Generating carp_600k upto 16 layer embedding ...


(2, 128)

In [11]:
from sequence_models.pretrained import load_model_and_alphabet

# Load 'carp_600k' through sequence_models' pretrained loader, which hands back
# the model together with a collater.
model, collater = load_model_and_alphabet('carp_600k')

# `seqs` comes from an earlier cell (two validation sequences, one per inner list).
# NOTE(review): `seqs` is reassigned further down the notebook, so re-running this
# cell after that one would batch different sequences.
# Only the first item returned by the collater is used.
x = collater(seqs)[0]  # (n, max_len)
# rep = model(x)  # (n, max_len, d_model)

# Captured module outputs, keyed by the label passed to get_activation().
activation = {}


def get_activation(name):
    """Return a forward hook that stores a module's detached output in ``activation[name]``.

    Args:
        name: Key under which the hooked module's output is saved in ``activation``.

    Returns:
        A callable taking three positional arguments (module, inputs, output);
        it writes ``output.detach()`` to ``activation[name]`` and returns None.
    """

    def hook(module, inputs, output):
        # Only the output is used; the first two arguments are accepted and ignored.
        activation[name] = output.detach()

    return hook

# Attach the capture hook to the first embedder layer and keep the returned handle
# so the hook can be detached again once this forward pass is done.
hook_handle = model.model.embedder.layers[0].register_forward_hook(
    get_activation("layer0")
)
try:
    rep = model(x)
finally:
    # Without this, the hook stays attached to `model`: it would fire on every later
    # forward pass (silently overwriting activation["layer0"]) and would be
    # registered again each time this cell is re-run.
    hook_handle.remove()

# Captured output of that layer for this batch, plus its shape.
activation["layer0"], activation["layer0"].shape

(tensor([[[ 0.1123, -0.0310,  0.0959,  ..., -0.2152, -0.4108,  0.8417],
          [ 0.2050, -0.1437, -0.1504,  ...,  0.1315, -0.1293,  0.5013],
          [ 0.4236, -0.0943,  0.0495,  ...,  0.2208, -3.9058,  0.7927],
          ...,
          [ 0.3149, -0.0070,  0.1225,  ..., -0.0526, -4.6336,  0.5594],
          [ 0.2749, -0.0162,  0.1489,  ..., -0.0352,  0.3777,  0.5698],
          [ 0.3499,  0.2566,  0.2176,  ...,  0.1028, -3.0541,  0.3112]],
 
         [[ 0.1123, -0.0310,  0.0959,  ..., -0.2152, -0.4108,  0.8417],
          [ 0.2050, -0.1437, -0.1504,  ...,  0.1315, -0.1293,  0.5013],
          [ 0.4236, -0.0943,  0.0495,  ...,  0.2208, -3.9058,  0.7927],
          ...,
          [ 0.3149, -0.0070,  0.1225,  ..., -0.0526, -4.6336,  0.5594],
          [ 0.2749, -0.0162,  0.1489,  ..., -0.0352,  0.3777,  0.5698],
          [ 0.3499,  0.2566,  0.2176,  ...,  0.1028, -3.0541,  0.3112]]]),
 torch.Size([2, 56, 128]))

In [12]:
# Ask the model for layers 0, 1 and 2 in a single forward pass.
rep = model(x, repr_layers=[0, 1, 2])
# rep = model(x, repr_layers=[0, 2, 32], logits=True)

# In the recorded output, layer 0 comes back under the top-level key 0, while
# layers 1 and 2 are under rep["representations"]; hence the two access paths below.
(
    rep.keys(),
    rep["representations"].keys(),
    rep[0],
    rep["representations"][1],
)

(dict_keys(['representations', 0]),
 dict_keys([1, 2]),
 tensor([[[ 0.1829,  0.1422,  0.0960,  ...,  0.1757, -0.4901,  0.1721],
          [ 0.1508,  0.1438, -0.0511,  ...,  0.2018,  0.7165,  0.2890],
          [ 0.0963,  0.2517,  0.1105,  ...,  0.2192, -2.5209,  0.4881],
          ...,
          [ 0.0797,  0.0780,  0.0807,  ...,  0.1697, -3.6828,  0.1462],
          [ 0.1263,  0.1968,  0.2430,  ...,  0.0919,  1.3361,  0.5400],
          [ 0.1361,  0.2477,  0.1058,  ...,  0.0659, -1.3406,  0.3698]],
 
         [[ 0.1829,  0.1422,  0.0960,  ...,  0.1757, -0.4901,  0.1721],
          [ 0.1508,  0.1438, -0.0511,  ...,  0.2018,  0.7165,  0.2890],
          [ 0.0963,  0.2517,  0.1105,  ...,  0.2192, -2.5209,  0.4881],
          ...,
          [ 0.0797,  0.0780,  0.0807,  ...,  0.1697, -3.6828,  0.1462],
          [ 0.1263,  0.1968,  0.2430,  ...,  0.0919,  1.3361,  0.5400],
          [ 0.1361,  0.2477,  0.1058,  ...,  0.0659, -1.3406,  0.3698]]],
        grad_fn=<TransposeBackward0>),
 tenso

In [13]:
# Two toy sequences of different lengths (5 and 8 characters).
# NOTE: this overwrites `seqs`, `x` and `rep` from the earlier cells.
seqs = [["MDREQ"], ["MGTRXXLP"]]
x = collater(seqs)[0]  # (n, max_len)
# Default call, no `repr_layers`; the result is a dict (its keys are shown in the next cell).
rep = model(x)

In [14]:
# Inspect what the default call (no `repr_layers`) returned: in the recorded output
# there is only a 'representations' entry, holding a single layer, 16.
rep.keys(), rep["representations"].keys()

(dict_keys(['representations']), dict_keys([16]))