In [1]:
!pwd # you should be in the cleavenet directory (outside of notebooks)
!pip install tensorflow # will compile on CPU 
!pip install cleavenet

Looking in indexes: https://test.pypi.org/simple/


In [2]:
import os 
import csv
import pandas as pd

import cleavenet 
from cleavenet.models import predict_scores_simple #, generate_substrates_simple
from cleavenet.utils import mmps
from cleavenet.analysis import calc_selectivity_score

2025-02-04 07:56:59.029052: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738684619.049601 3732378 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738684619.056278 3732378 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-04 07:56:59.078273: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# CleaveNet predictions 

Predict Z-scores for a custom set of substrates

In [3]:
## ---- Inputs to change ----- ###

# Substrates to score: swap these example sequences for your own list
substrates = [
    "--AAALS---",
    "FMPLNFTASG",
    "LGPYAMTSRG",
    "AARFKKFATE",
    "AKPNRLVNSY",
    "YGPRAVTASG",
    "SGPRAFWGTA",
]


# Prediction settings: the defaults below should work for most cases

checkpoint_dir = "weights/"  # relative path to weights folder
model_architecture = "transformer"  # 'transformer' or 'lstm'
save_dir = "outputs/"  # where to save outputs

In [4]:
# Run this cell unchanged; it prints a lot of progress output, which can be ignored
pred_zscores, unc_scores = predict_scores_simple(
    substrates,
    checkpoint_dir=checkpoint_dir,
    save_dir=save_dir,
    model_architecture=model_architecture,
)

Splits previously written to file
Vocab: 
 {'-': 0, 'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20, '!': 21}
7 --AAALS---
Running 0 transformer_0/
EVALUATING SEQUENCES FROM outputs/


W0000 00:00:1738684621.794453 3732378 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Running 1 transformer_1/
EVALUATING SEQUENCES FROM outputs/


Running 2 transformer_2/
EVALUATING SEQUENCES FROM outputs/


Running 3 transformer_3/
EVALUATING SEQUENCES FROM outputs/


Running 4 transformer_4/
EVALUATING SEQUENCES FROM outputs/


(5, 7, 18)
Calculated confidence
Iterating over MMP1
writing all scores
writing all scores
Iterating over MMP10
Iterating over MMP11
Iterating over MMP12
Iterating over MMP13
Iterating over MMP14
Iterating over MMP15
Iterating over MMP16
Iterating over MMP17
Iterating over MMP19
Iterating over MMP2
Iterating over MMP20
Iterating over MMP24
Iterating over MMP25
Iterating over MMP3
Iterating over MMP7
Iterating over MMP8
Iterating over MMP9


In [5]:
# Predictions and plots were written to the outputs dir; let's look at what is inside these files

# Cleavage predictions are stored in all_scores.csv
pred_file = os.path.join(save_dir, 'all_scores.csv')
predictions = pd.read_csv(pred_file)
# Clean model outputs: drop the extra leading token from every sequence
sequences = [raw_seq[1:] for raw_seq in predictions['sequences']]
predictions['sequences'] = sequences

# Show the first rows of the z-score predictions
predictions.head()

Unnamed: 0,sequences,MMP1,MMP10,MMP11,MMP12,MMP13,MMP14,MMP15,MMP16,MMP17,MMP19,MMP2,MMP20,MMP24,MMP25,MMP3,MMP7,MMP8,MMP9
0,--AAALS---,0.089758,0.416106,0.395101,0.417816,0.151502,0.17703,0.23421,0.239401,1.015475,-0.051086,-0.08409,0.634886,0.163208,1.292108,0.648528,0.674973,0.186854,0.679014
1,FMPLNFTASG,3.285574,3.17174,2.848731,1.982434,2.459345,1.657082,1.633623,1.669428,1.150125,0.990345,2.918435,2.426162,1.772338,2.173389,2.123676,1.374689,3.808247,2.432182
2,LGPYAMTSRG,3.140482,2.308038,2.695278,1.706032,2.48026,2.179933,1.696465,2.002263,1.492033,0.693252,3.440998,1.984635,2.076391,1.580685,2.486686,1.349398,2.749578,2.60829
3,AARFKKFATE,-0.653065,-0.384744,-0.549065,-0.549792,-0.762877,-0.659807,-0.961453,-0.986819,-0.156165,0.445911,-0.466166,-0.208736,-0.936325,-0.405631,-0.042812,-0.653908,-0.798303,-0.099508
4,AKPNRLVNSY,-0.39782,-0.292255,-0.24378,0.316298,0.154016,-0.758082,-0.627322,-0.753332,0.444348,0.44907,0.348303,0.192085,-0.638082,0.001487,0.165671,0.584138,0.031637,0.092842


In [6]:
# Convert the predictions to selectivity scores, suffix every score column
# with '_ss', and append the cleaned sequences as the last column
selectivity = (
    calc_selectivity_score(mmps, predictions)
    .rename(columns=lambda col: col + '_ss')
    .assign(sequences=sequences)
)

# Show the first rows of the selectivity scores
selectivity.head()

Unnamed: 0,MMP1_ss,MMP10_ss,MMP11_ss,MMP12_ss,MMP13_ss,MMP14_ss,MMP15_ss,MMP16_ss,MMP17_ss,MMP19_ss,MMP2_ss,MMP20_ss,MMP24_ss,MMP25_ss,MMP3_ss,MMP7_ss,MMP8_ss,MMP9_ss,sequences
0,-0.333244,0.012301,-0.00994,0.014112,-0.267868,-0.240838,-0.180295,-0.174798,0.646926,-0.482373,-0.517318,0.24395,-0.255474,0.939832,0.258395,0.286395,-0.230436,0.290674,--AAALS---
1,1.133106,1.012575,0.670565,-0.246691,0.258274,-0.59118,-0.616019,-0.578108,-1.127959,-1.297137,0.74437,0.223139,-0.469144,-0.044503,-0.09714,-0.890185,1.686524,0.229513,FMPLNFTASG
2,1.05047,0.169058,0.579077,-0.468361,0.351411,0.033418,-0.478489,-0.154704,-0.694947,-1.540715,1.368663,-0.173369,-0.076215,-0.60108,0.358215,-0.845972,0.636571,0.486971,LGPYAMTSRG
3,-0.172112,0.111993,-0.061994,-0.062764,-0.288384,-0.179251,-0.498641,-0.525499,0.354017,0.99151,0.025781,0.298354,-0.472034,0.089877,0.474039,-0.173005,-0.325894,0.414007,AARFKKFATE
4,-0.36647,-0.254695,-0.203368,0.389656,0.217828,-0.747923,-0.609472,-0.742894,0.525238,0.530238,0.423543,0.258136,-0.620865,0.056327,0.230168,0.67325,0.088249,0.153055,AKPNRLVNSY


In [7]:
# Predicted uncertainty is stored in all_uncertainty.csv
pred_file = os.path.join(save_dir, 'all_uncertainty.csv')
uncertainty = pd.read_csv(pred_file)
# Clean model outputs: drop the extra leading token from every sequence
sequences = [raw_seq[1:] for raw_seq in uncertainty['sequences']]
uncertainty['sequences'] = sequences
# Suffix every column with '_unc', leaving the 'sequences' column name as is
uncertainty = uncertainty.rename(
    columns=lambda col: col if col == 'sequences' else col + '_unc'
)

# Show the first rows of the uncertainty predictions
uncertainty.head()

Unnamed: 0,sequences,MMP1_unc,MMP10_unc,MMP11_unc,MMP12_unc,MMP13_unc,MMP14_unc,MMP15_unc,MMP16_unc,MMP17_unc,MMP19_unc,MMP2_unc,MMP20_unc,MMP24_unc,MMP25_unc,MMP3_unc,MMP7_unc,MMP8_unc,MMP9_unc
0,--AAALS---,0.262109,0.340885,0.2739,0.304532,0.274281,0.302288,0.298653,0.289559,0.424891,0.169061,0.186358,0.465333,0.359177,0.598374,0.289095,0.242053,0.259371,0.346406
1,FMPLNFTASG,0.155442,0.193782,0.158475,0.120033,0.119394,0.249987,0.108309,0.160417,0.230688,0.210301,0.164949,0.134725,0.199982,0.313814,0.092466,0.098594,0.220759,0.306856
2,LGPYAMTSRG,0.250653,0.284855,0.268286,0.158185,0.231395,0.357613,0.341678,0.375396,0.318326,0.277784,0.196136,0.304907,0.39313,0.5044,0.162541,0.228936,0.212801,0.228025
3,AARFKKFATE,0.081461,0.111044,0.149062,0.10675,0.335088,0.092935,0.229756,0.135304,0.136927,0.169514,0.167405,0.22052,0.108881,0.136434,0.202799,0.120205,0.087421,0.225216
4,AKPNRLVNSY,0.054186,0.195143,0.126771,0.184922,0.129664,0.058403,0.143238,0.148128,0.238515,0.226073,0.176992,0.208891,0.096399,0.156046,0.122133,0.225223,0.110016,0.083212


In [8]:
# Take a closer look at MMP13: gather its predicted cleavage, selectivity, and uncertainty in one table
mmp_choice = 'MMP13'

# Outer-merge the three per-MMP columns; the merges join on the shared 'sequences' column
mmp_df = (
    predictions[['sequences', mmp_choice]]
    .merge(selectivity[['sequences', f'{mmp_choice}_ss']], how='outer')
    .merge(uncertainty[['sequences', f'{mmp_choice}_unc']], how='outer')
)

# A binary cleavage call can be assigned using the thresholds determined for each MMP in Table S5 of the paper
z_cutoff = 1.0
mmp_df['Prediction'] = mmp_df[mmp_choice].map(
    lambda zscore: 'Cleaved' if zscore > z_cutoff else 'Not Cleaved'
)
mmp_df

Unnamed: 0,sequences,MMP13,MMP13_ss,MMP13_unc,Prediction
0,--AAALS---,0.151502,-0.267868,0.274281,Not Cleaved
1,AARFKKFATE,-0.762877,-0.288384,0.335088,Not Cleaved
2,AKPNRLVNSY,0.154016,0.217828,0.129664,Not Cleaved
3,FMPLNFTASG,2.459345,0.258274,0.119394,Cleaved
4,LGPYAMTSRG,2.48026,0.351411,0.231395,Cleaved
5,SGPRAFWGTA,1.671879,1.66056,0.150237,Cleaved
6,YGPRAVTASG,1.777247,1.664469,0.22228,Cleaved


# CleaveNet generations

Design a new set of substrates unconditionally. To provide a set of z-scores for conditional generation, we recommend using the command-line input arguments as described in `README.md`.

In [9]:
## ---- Inputs to change ----- ###

# Set the generation parameters below to the desired values

# Sampling temperature: higher == more diverse, lower == less diverse.
# We recommend a range between 0.7 and 1.2
temperature = 1
# Penalty on repeated amino acids: higher numbers penalize repeats more, a value of 1 is no penalty
repeat_penalty = 1.2
# How many sequences to generate
num_seqs = 10

In [10]:
from cleavenet.models import simple_inference

In [12]:
# Generate substrate designs unconditionally. The arguments are passed positionally
# in the order (num_seqs, repeat_penalty, temperature), all set in the inputs cell above.
substrate_designs = simple_inference(num_seqs, repeat_penalty, temperature)

Splits previously written to file
Vocab: 
 {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19, '$': 20, '*': 21}


In [13]:
# Display the generated substrate sequences
substrate_designs

['SLPLNLSYEK',
 'RPLSHWQATI',
 'SLPRYLYDVQ',
 'PLAWHYISGI',
 'MSLATLWAGW',
 'IMARYLYDPW',
 'LPLASLWSGY',
 'CAVASLRDWY',
 'PVPKWITASG',
 'SPVKWLQSGY']