# Detect alternative insect-based ingredients for solubility properties.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import xmltodict
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Input
from IPython.display import clear_output

  from .autonotebook import tqdm as notebook_tqdm
2022-11-16 11:31:00.777771: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-16 11:31:00.897648: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-16 11:31:00.897667: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-16 11:31:00.916439: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-16 11:31:0

In [None]:
# Load the raw eSOL table; the source file is tab-separated.
raw_data = pd.read_csv(
    'preprocessing/esol_data.tab',
    sep='\t',
)

In [None]:
# Keep only the rows that have a solubility measurement (drop rows where it is NaN).
data = raw_data[raw_data['Solubility(%)'].notna()]

## Retrieve Additional Protein Data from UniProt

In [None]:
def retrieve_data_from_uniprot(gene, description, organism):
    """Query the EBI Proteins API and return the first matching entry.

    Parameters
    ----------
    gene : str
        Gene name, sent as the ``gene`` query parameter.
    description : str
        Protein description, sent as the ``protein`` query parameter.
    organism : str
        Organism name, sent as the ``organism`` query parameter.

    Returns
    -------
    dict
        The first ``entry`` element of the XML response, as parsed by
        ``xmltodict``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    LookupError
        If the response does not contain any entry.
    """
    # Initialize query parameters
    query = {
        "offset": 0,
        "gene": gene,
        "protein": description,
        "organism": organism,
    }

    # Call the API. XML is requested explicitly because the body is parsed
    # with xmltodict, and the timeout keeps a stalled connection from
    # blocking the notebook indefinitely.
    response = requests.get(
        "https://www.ebi.ac.uk/proteins/api/proteins",
        params=query,
        headers={"Accept": "application/xml"},
        timeout=30,
    )
    response.raise_for_status()

    # Read XML response body
    parsed = xmltodict.parse(response.content)

    root = parsed.get('uniprot') or {}
    entries = root.get('entry')
    if not entries:
        raise LookupError(
            "No UniProt entry found for gene={!r}, organism={!r}".format(gene, organism)
        )

    # xmltodict yields a dict for a single child element and a list only when
    # the element is repeated; normalise so a lone match is returned as well.
    if not isinstance(entries, list):
        entries = [entries]

    return entries[0]

In [None]:
# Show the column names of the filtered eSOL table.
data.columns

Index(['JW_ID', 'ECK number', 'B number', 'Gene name K-12', 'Locus name K-12',
       'Synonyms of locus names K-12', 'Solubility(%)', 'Yield(uM)',
       'Yield(ug/ml)', 'Minus Sol', 'TF Sol', 'GroE Sol', 'KJE Sol',
       'Minus(uM)', 'TF(uM)', 'GroE(uM)', 'KJE(uM)', 'Minus(ug/ml)',
       'TF(ug/ml)', 'GroE(ug/ml)', 'KJE(ug/ml)', 'Calculated MW(kDa)',
       'Calculated pI', 'Type of gene product', 'Gene product description',
       'Cell location', 'Structure (PDB) id', 'SCOP assignment'],
      dtype='object')

In [None]:
def update_progress(actual, total, clear=True, title="Progress"):
    """Print a 100-character text progress bar for ``actual`` out of ``total``.

    Parameters
    ----------
    actual : int or float
        Amount completed so far.
    total : int or float
        Amount that represents 100%. A total of zero is rendered as 0%.
    clear : bool, default True
        When true, clear the current cell output before printing the bar.
    title : str, default "Progress"
        Label printed in front of the bar.
    """
    bar_length = 100

    # A zero total has no meaningful ratio; show an empty bar instead of
    # raising ZeroDivisionError. float() also normalises NumPy scalar ratios.
    progress = float(actual / total) if total else 0.0

    # Clamp the ratio into [0, 1] so the bar never under- or overflows.
    progress = min(max(progress, 0.0), 1.0)

    block = int(round(bar_length * progress))

    if clear:
        clear_output(wait=True)

    text = "{0}: [{1}] {2:.1f}%".format(title, "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

In [None]:
def retrieve_and_merge(esol_dataset, organism="Escherichia coli"):
    """Enrich every eSOL row with sequence data looked up in UniProt.

    One lookup is made per row through ``retrieve_data_from_uniprot``. Rows
    whose lookup (or field extraction) fails are collected separately instead
    of aborting the run. Three progress bars are printed after each row.

    Parameters
    ----------
    esol_dataset : pandas.DataFrame
        Table providing the eSOL columns read below (``'Gene name K-12'``,
        ``'Gene product description'`` and the measurement columns).
    organism : str, default "Escherichia coli"
        Organism name passed to every lookup.

    Returns
    -------
    merged : pandas.DataFrame
        One row per successful lookup: the eSOL values plus ``'Sequence'``,
        ``'Sequence length'``, ``'Sequence mass'``, ``'Organism'``,
        ``'UP_ID'`` and ``'UP_NAME'``.
    unknown : pandas.DataFrame
        The original rows for which the lookup failed, with the columns of
        ``esol_dataset``.
    """
    # Columns copied unchanged from each eSOL row into the merged output.
    passthrough_columns = [
        'Solubility(%)', 'Yield(uM)',
        'Yield(ug/ml)', 'Minus Sol', 'TF Sol', 'GroE Sol', 'KJE Sol',
        'Minus(uM)', 'TF(uM)', 'GroE(uM)', 'KJE(uM)', 'Minus(ug/ml)',
        'TF(ug/ml)', 'GroE(ug/ml)', 'KJE(ug/ml)', 'Calculated MW(kDa)',
        'Calculated pI', 'Type of gene product', 'Gene product description',
        'Cell location', 'Structure (PDB) id', 'SCOP assignment',
    ]
    uniprot_columns = ['Sequence', 'Sequence length', 'Sequence mass', 'Organism', 'UP_ID', 'UP_NAME']
    merged_columns = ['Gene Name'] + passthrough_columns + uniprot_columns

    # Collect plain records and build each DataFrame once at the end;
    # concatenating inside the loop copies the whole frame on every row.
    merged_records = []
    unknown_rows = []

    total = len(esol_dataset)

    for progress, (_, row) in enumerate(esol_dataset.iterrows(), start=1):
        try:
            uniprot_dict = retrieve_data_from_uniprot(
                gene=row['Gene name K-12'],
                description=row['Gene product description'],
                organism=organism,
            )
            record = {'Gene Name': row['Gene name K-12']}
            record.update({column: row[column] for column in passthrough_columns})
            # NOTE(review): 'accession' may be a list when an entry carries
            # several accessions — confirm how UP_ID should be stored then.
            record.update({
                'Sequence': uniprot_dict['sequence']['#text'],
                'Sequence length': uniprot_dict['sequence']['@length'],
                'Sequence mass': uniprot_dict['sequence']['@mass'],
                'Organism': uniprot_dict['organism']['name']['#text'],
                'UP_ID': uniprot_dict['accession'],
                'UP_NAME': uniprot_dict['name'],
            })
            merged_records.append(record)
        except Exception:
            # Deliberate best effort: any failure for this row (request,
            # parsing, missing field) sends the row to the "unknown" table.
            unknown_rows.append(row)

        update_progress(progress, total, clear=True)
        update_progress(len(merged_records), total, title="Sucess Ratio", clear=False)
        update_progress(len(unknown_rows), total, title="Failure Ratio", clear=False)

    merged = pd.DataFrame(merged_records, columns=merged_columns)
    # Use the columns of the argument, not of a notebook-level variable, so
    # the function works for whatever frame is passed in.
    unknown = pd.DataFrame(unknown_rows, columns=esol_dataset.columns)

    return merged, unknown

In [None]:
# Enrich the filtered eSOL rows with UniProt data. One HTTP request is made
# per row, so expect this cell to be slow on the full table.
complete_data, unknown_data = retrieve_and_merge(data)

Progress: [####################################################################################################] 100.0%
Sucess Ratio: [##########################################----------------------------------------------------------] 41.6%
Failure Ratio: [##########################################################------------------------------------------] 58.4%


In [None]:
# Persist both outcomes: rows enriched with UniProt data, and rows whose lookup failed.
complete_data.to_csv(path_or_buf='complete_esol_uniprot_data.csv', index=False)
unknown_data.to_csv(path_or_buf='missing_esol_uniprot_data.csv', index=False)

In [None]:
# Spot-check a single record by its JW identifier.
data.loc[data['JW_ID'].eq('JW0004')]

Unnamed: 0,JW_ID,ECK number,B number,Gene name K-12,Locus name K-12,Synonyms of locus names K-12,Solubility(%),Yield(uM),Yield(ug/ml),Minus Sol,...,TF(ug/ml),GroE(ug/ml),KJE(ug/ml),Calculated MW(kDa),Calculated pI,Type of gene product,Gene product description,Cell location,Structure (PDB) id,SCOP assignment
2,JW0004,ECK0005,b0005,yaaX,yaaX,,78.0,1.2,14.0,,...,,,,11.4,10.9,o,predicted protein,Periplasmic,,


In [None]:
# Example lookup for one gene, to inspect the structure of a returned entry.
retrieve_data_from_uniprot(
    gene="AbgB",
    description="predicted peptidase, aminobenzoyl-glutamate utilization protein",
    organism="Escherichia coli",
)

{'@xmlns': 'http://uniprot.org/uniprot',
 '@dataset': 'TrEMBL',
 '@created': '2010-11-02',
 '@modified': '2022-08-03',
 '@version': '63',
 'accession': 'E0IUS0',
 'name': 'E0IUS0_ECOLW',
 'protein': {'submittedName': {'fullName': {'@evidence': '3',
    '#text': 'Predicted peptidase, aminobenzoyl-glutamate utilization protein'}}},
 'gene': {'name': [{'@evidence': '3', '@type': 'primary', '#text': 'abgB'},
   {'@evidence': '3', '@type': 'ordered locus', '#text': 'ECW_m1434'}]},
 'organism': {'@evidence': '3 4',
  'name': {'@type': 'scientific',
   '#text': 'Escherichia coli (strain ATCC 9637 / CCM 2024 / DSM 1116 / LMG 11080 / NBRC 13500 / NCIMB 8666 / NRRL B-766 / W)'},
  'dbReference': {'@type': 'NCBI Taxonomy', '@id': '566546'},
  'lineage': {'taxon': ['Bacteria',
    'Proteobacteria',
    'Gammaproteobacteria',
    'Enterobacterales',
    'Enterobacteriaceae',
    'Escherichia']}},
 'reference': {'@evidence': '3 4',
  '@key': '1',
  'citation': {'@type': 'journal article',
   '@date'

## Train Model

In [None]:
# Reload the merged eSOL/UniProt table from the CSV file.
data = pd.read_csv('complete_esol_uniprot_data.csv')

# Numeric columns used as model inputs.
data_only_num = data[[
    'Yield(uM)',
    'Yield(ug/ml)',
    'Calculated MW(kDa)',
    'Calculated pI',
    'Sequence length',
    'Sequence mass',
]]

In [None]:
# Target (solubility percentage) and feature matrix as float arrays.
Y = np.asarray(data['Solubility(%)'], dtype=float)
X = np.asarray(data_only_num, dtype=float)

# Scale the target by its maximum rather than by its L2 norm: the norm grows
# with the number of rows, which squashes every target towards 0 and makes the
# scale depend on the dataset size. Dividing by the maximum keeps the target in
# [0, 1] (assuming solubility values are non-negative — TODO confirm).
normalized_Y = Y / Y.max()

# Scale every feature column by its own maximum.
normalized_X = X / X.max(axis=0)

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    normalized_X,
    normalized_Y,
    test_size=0.2,
    random_state=10,
)

In [None]:
# Fully connected network: 6 input features -> 4 -> 10 -> 1 sigmoid output.
model = keras.Sequential()
model.add(Input(shape=(6,)))
model.add(layers.Dense(4, activation="relu"))
model.add(layers.Dense(10, activation="relu"))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(
    optimizer=keras.optimizers.RMSprop(),  # Optimizer
    # Loss function to minimize
    loss='binary_crossentropy',
    # The target is a continuous value, not an integer class index, so a
    # categorical accuracy is meaningless here; track the mean absolute error
    # between prediction and target instead.
    metrics=[keras.metrics.MeanAbsoluteError()],
)

In [None]:
# Fully connected network: 6 input features -> 12 -> 8 -> 4 -> 1 sigmoid output.
model = keras.Sequential()
# Declare the input with an explicit Input layer instead of `input_dim`.
model.add(Input(shape=(6,)))
model.add(layers.Dense(12, activation='relu'))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(4, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
# Compile the keras model. The target is continuous, so classification
# accuracy is not informative; report the mean absolute error instead.
model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.01),
    metrics=['mae'],
)

In [None]:
# Fit on the training split only. Training on the full normalized arrays would
# also show the model the held-out samples and make any later test evaluation
# meaningless.
history = model.fit(
    X_train,
    Y_train,
    batch_size=12,
    epochs=20,
)



In [None]:
# Evaluate the model on the held-out test split using `evaluate`.
# `return_dict=True` labels each value with the name of its loss/metric, so
# the printout stays correct whichever metrics the model was compiled with.
print("Evaluate on test data")
results = model.evaluate(X_test, Y_test, return_dict=True)
print("test metrics:", results)

# Generate predictions (the output of the last layer) for every sample of the
# test split using `predict`.
print("Generate predictions for the test data")
predictions = model.predict(X_test)
print("predictions shape:", predictions.shape)

Evaluate on test data
test loss, test acc: [0.07540898025035858, 0.00039401103276759386]
Generate predictions for 3 samples
predictions shape: (635, 1)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8ea8948f-b978-492f-a6dd-8ac5feb4b472' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>