# **Novozymes-Enzyme-Stability-Prediction**

##### Develop models that can predict the ranking of protein stability (as measured by melting point, ```tm```) after single-point amino acid mutation and deletion. 

### **Import Libraries**

In [1]:
import tensorflow as tf 
import tensorflow_hub as tfhub 
import tensorflow_addons as tfa 
import tensorflow_io as tfio
import Bio as pyb
import biopandas
from biopandas.pdb import PandasPdb
pdb = PandasPdb()

import pandas as pd
pd.options.mode.chained_assignment = None

import numpy as np
import sklearn
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
from pandarallel import pandarallel
pandarallel.initialize()

from sklearn.model_selection import GroupKFold, StratifiedKFold
from scipy.spatial import cKDTree

2022-09-24 10:32:23.495514: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-24 10:32:23.860486: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-24 10:32:24.878503: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-24 10:32:24.878618: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
from collections import Counter
from datetime import datetime
from zipfile import ZipFile
from glob import glob
import warnings, requests, hashlib, imageio, IPython, urllib
import zipfile, pickle, random, shutil, string, json
import math, time, gzip, ast, sys, io, os, gc, re

In [5]:
from matplotlib import animation, rc
import matplotlib
from PIL import Image, ImageEnhance
import seaborn as sns
import tifffile as tif
import plotly.express as px
import plotly.io as pio
import cv2, PIL, plotly
from matplotlib.colors import ListedColormap
from matplotlib.patches import Rectangle
import matplotlib.patches as patches
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
tqdm.pandas()

Image.MAX_IMAGE_PIXELS = 5_000_000_000

rc('animation', html='jshtml')

def seed_it_all(seed=7):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

In [8]:
DATA_DIR = "../mutating-doodle/"

In [9]:
tf.config.optimizer .set_jit(True)

In [2]:
def sep():
    print("-"*100)

class paths:
    TRAIN = "../mutating-doodle/train.csv"
    TEST = "../mutating-doodle/test.csv"
    SUBMISSION = "../mutating-doodle/submission.csv"
    PDB_FILE = "../mutating-doodle/wildtype_structure_prediction_af2.pdb"

### **Load Data**

In [3]:
train_df = pd.read_csv(paths.TRAIN)
test_df = pd.read_csv(paths.TEST)
print(f"Train dataframe has shape: {train_df.shape}")
print(f"Test dataframe has shape: {test_df.shape}")
display(train_df.head())
display(test_df.head())

Train dataframe has shape: (31390, 5)
Test dataframe has shape: (2413, 4)


Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5


Unnamed: 0,seq_id,protein_sequence,pH,data_source
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes


In [4]:
train_df.profile_report()

Summarize dataset: 100%|██████████| 29/29 [00:10<00:00,  2.85it/s, Completed]                        
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.01s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.98it/s]




In [5]:
test_df.profile_report()

Summarize dataset: 100%|██████████| 18/18 [00:00<00:00, 27.03it/s, Completed]                       
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  3.77it/s]




In [7]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go 

fig = go.Figure()
colors = ["#FF0000", "#00FFFB", "#FCFCFC", "#1FFF00"]

i = 1

fig.add_trace(
    go.Histogram(x=train_df["tm"],
                name="tm",
                hovertemplate="target_column"+'%{y:.2f}',
                marker=dict(color=colors[i])
        )
)

fig.update_xaxes(
    title_text="Target column",
    title_font_color=colors[i],
    tickfont_color=colors[i]
)

fig.update_yaxes(
    title_text="Frequency",
    title_font_color=colors[i],
    tickfont_color=colors[i]
)

fig.update_layout(height=800,
                  width=1000,
                  title_text="Target column: Higher tm means the protein variant is more stable.",
                  template="plotly_dark",
                  xaxis=dict(color="#FF9300"),
                  yaxis=dict(color="#FF9300")
)

fig.show()

In [8]:
train_df["protein_sequence_len"] = train_df["protein_sequence"].apply(lambda x: len(x))
test_df["protein_sequence_len"] = test_df["protein_sequence"].apply(lambda x: len(x))
display(train_df.describe())
display(test_df.describe())

Unnamed: 0,seq_id,pH,tm,protein_sequence_len
count,31390.0,31104.0,31390.0,31390.0
mean,15694.5,6.892339,49.147337,447.669513
std,9061.656811,1.612225,14.010089,640.728935
min,0.0,1.99,-1.0,5.0
25%,7847.25,7.0,42.1,197.0
50%,15694.5,7.0,48.0,336.0
75%,23541.75,7.0,53.8,523.0
max,31389.0,64.9,130.0,32767.0


Unnamed: 0,seq_id,pH,protein_sequence_len
count,2413.0,2413.0,2413.0
mean,32596.0,8.0,220.96809
std,696.717422,0.0,0.175798
min,31390.0,8.0,220.0
25%,31993.0,8.0,221.0
50%,32596.0,8.0,221.0
75%,33199.0,8.0,221.0
max,33802.0,8.0,221.0


In [9]:
from biopandas.pdb import PandasPdb

pdb_df = PandasPdb().read_pdb(paths.PDB_FILE)
pdb_df.df.keys()

dict_keys(['ATOM', 'HETATM', 'ANISOU', 'OTHERS'])

In [10]:
atom_df = pdb_df.df['ATOM']
hetatm_df = pdb_df.df['HETATM']
anisou_df = pdb_df.df['ANISOU']
others_df = pdb_df.df['OTHERS']
print(f"ATOM dataset is of shape: {atom_df.shape}"), sep()
print(f"HETATM dataset is of shape: {hetatm_df.shape}"), sep()
print(f"ANISOU dataset is of shape: {anisou_df.shape}"), sep()
print(f"OTHERS dataset is of shape: {others_df.shape}"), sep()
display(atom_df.head())
display(hetatm_df.head())
display(anisou_df.head())
display(others_df.head())

ATOM dataset is of shape: (3317, 21)
----------------------------------------------------------------------------------------------------
HETATM dataset is of shape: (0, 21)
----------------------------------------------------------------------------------------------------
ANISOU dataset is of shape: (0, 21)
----------------------------------------------------------------------------------------------------
OTHERS dataset is of shape: (2, 3)
----------------------------------------------------------------------------------------------------


Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,...,x_coord,y_coord,z_coord,occupancy,b_factor,blank_4,segment_id,element_symbol,charge,line_idx
0,ATOM,1,,N,,VAL,,A,1,,...,34.064,-6.456,50.464,1.0,45.11,,,N,,0
1,ATOM,2,,H,,VAL,,A,1,,...,33.576,-6.009,51.228,1.0,45.11,,,H,,1
2,ATOM,3,,H2,,VAL,,A,1,,...,33.882,-7.449,50.477,1.0,45.11,,,H,,2
3,ATOM,4,,H3,,VAL,,A,1,,...,35.06,-6.323,50.566,1.0,45.11,,,H,,3
4,ATOM,5,,CA,,VAL,,A,1,,...,33.643,-5.877,49.162,1.0,45.11,,,C,,4


Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,...,x_coord,y_coord,z_coord,occupancy,b_factor,blank_4,segment_id,element_symbol,charge,line_idx


Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,...,"U(1,1)","U(2,2)","U(3,3)","U(1,2)","U(1,3)","U(2,3)",blank_4,element_symbol,charge,line_idx


Unnamed: 0,record_name,entry,line_idx
0,TER,3318 LYS A 221,3317
1,END,,3318


In [11]:
import plotly.express as px

fig = px.scatter_3d(atom_df, x="x_coord",
                    y="y_coord",
                    z="z_coord",
                    color="element_symbol",
                    color_discrete_sequence=["#84FFA9", "#00FFF7", "#003AFF", "#F000FF", "#FBFF00"])
fig.update_traces(marker=dict(size=3))
fig.update_coloraxes(showscale=False)
fig.update_layout(template="plotly_dark")
fig.show()

In [12]:
from scipy.sparse import csr_matrix

train_df = train_df[train_df["protein_sequence_len"] <= 221]
train_df.reset_index(inplace=True)
sequences = [list(string) for string in train_df["protein_sequence"].values.tolist()]
sequences_train = pd.DataFrame(sequences)
sequences_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,211,212,213,214,215,216,217,218,219,220
0,A,A,F,Q,V,T,S,N,E,I,...,,,,,,,,,,
1,A,A,G,G,Q,P,Q,G,A,T,...,A,Q,Q,Q,C,N,,,,
2,A,A,I,G,I,G,I,L,G,G,...,,,,,,,,,,
3,A,A,K,S,G,D,A,E,E,A,...,,,,,,,,,,
4,A,A,L,A,L,G,L,P,A,F,...,,,,,,,,,,


In [13]:
from sklearn.preprocessing import LabelEncoder

sequences_train = sequences_train.apply(LabelEncoder().fit_transform)
sequences_train["tm"] = train_df["tm"]
sequences_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,212,213,214,215,216,217,218,219,220,tm
0,0,0,4,13,17,16,15,11,3,7,...,20,19,20,20,20,20,20,20,18,49.7
1,0,0,5,5,13,12,13,5,0,16,...,13,13,13,1,11,20,20,20,18,45.1
2,0,0,7,5,7,5,7,9,5,5,...,20,19,20,20,20,20,20,20,18,62.8
3,0,0,8,15,5,2,0,3,3,0,...,20,19,20,20,20,20,20,20,18,36.3
4,0,0,9,0,9,5,9,12,0,4,...,20,19,20,20,20,20,20,20,18,83.0


In [14]:
from sklearn.model_selection import train_test_split
import xgboost

X = sequences_train.loc[:, sequences_train.columns != "tm"]
y = sequences_train.loc[:, sequences_train.columns == "tm"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = xgboost.XGBRegressor(n_estimators=500, max_depth=15)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [15]:
from scipy import stats

stats.spearmanr(y_test, y_pred)

SpearmanrResult(correlation=0.3547764232104556, pvalue=9.909469529327458e-55)

In [16]:
from scipy.sparse import csr_matrix

test_df = test_df[test_df["protein_sequence_len"] <= 221]
sequences = [list(string) for string in test_df["protein_sequence"].values.tolist()]
sequences_test = pd.DataFrame(sequences)
sequences_test = sequences_test.apply(LabelEncoder().fit_transform)
sequences_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,211,212,213,214,215,216,217,218,219,220
0,0,0,0,0,0,0,0,0,0,0,...,7,11,7,5,1,8,13,15,2,6
1,0,0,0,0,0,0,0,0,0,0,...,7,11,7,5,1,8,13,15,2,6
2,0,0,0,0,0,0,0,0,0,0,...,10,11,6,2,5,16,11,4,4,13
3,0,0,0,0,0,0,0,0,0,0,...,7,11,7,5,1,8,13,15,2,6
4,0,0,0,0,0,0,0,0,0,0,...,7,11,7,5,1,8,13,15,2,6


In [None]:
submission = pd.DataFrame()
submission["tm"] = model.predict(sequences_test)
submission["seq_id"] = test_df["seq_id"]
submission.to_csv("submission.csv", index=False)