In [1]:
import pandas as pd
import numpy as np

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

from IPython.display import clear_output

In [2]:
data = pd.read_csv('../Data/Processed Data/complete_data.csv')

In [3]:
def normalize(col_data):
    return (col_data - col_data.min())/(col_data.max()-col_data.min())

In [4]:
def update_progress(actual, total, clear=True, title="Progress"):
    bar_length = 100

    progress = (actual / total)

    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1

    block = int(round(bar_length * progress))

    if clear:
        clear_output(wait = True)

    text = "{0}: [{1}] {2:.1f}%".format(title, "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

In [5]:
if data is None or len(data) <= 0:
    raise Error("Data has not been provided or an empty array has been given.")
    
MAX_ACIDS = 2000

acid_columns = np.arange(0, MAX_ACIDS, 1)
X = pd.DataFrame([], columns=acid_columns)
y = pd.DataFrame()

sequences = pd.DataFrame(data['Sequence'], columns=['Sequence'])
y = pd.DataFrame(data['Solubility(%)'])

total_seq = len(sequences)
i_seq = 0

for idx, row in sequences.iterrows():
    sequence = row.iloc[0]
    
    total_acid = len(sequence)
    i_acid = 0
    
    amino_acid_arr = np.zeros(MAX_ACIDS)
    
    for acid in sequence:
        acid_num_val = ord(acid)
        amino_acid_arr[i_acid] = acid_num_val
        i_acid += 1
        update_progress(i_seq, total_seq, clear=True, title='Sequences Progress')
        update_progress(i_acid, total_acid, title="Current Sequence Amino Acid Progress", clear=False)
    
    X_dict = dict(zip(acid_columns, amino_acid_arr))
    
    X = pd.concat([X, pd.DataFrame([X_dict], columns=acid_columns)], ignore_index=True)
    i_seq += 1
    update_progress(i_seq, total_seq, clear=True, title='Sequences Progress')

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [6]:
# Add Solubility Data to the data
export_data = X.copy()
export_data['UP_ID'] = pd.DataFrame(data['UP_ID'])

# # Move Solubility as the first column
export_data = export_data[ ['UP_ID'] + [ col for col in export_data.columns if col != 'UP_ID' ] ]

In [7]:
merged_export_data = data.merge(export_data, how='inner', on='UP_ID')

In [8]:
merged_export_data.to_csv('complete_segmented_acid_data.csv', index=False)

In [84]:
if X is None or y is None:
    raise Error("The data has not been processed. Please process the data first.")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [3, 3160]

In [49]:
class ANN:
    def __init__(self, data):
        self.data = pd.DataFrame([data.loc[0]])
        self.X = None
        self.X_train = None
        self.X_test = None
        self.y = None
        self.y_train = None
        self.y_test = None
        self.model = None

    @staticmethod
    def MaxAminoAcidCount():
        return 2000

    def process(self):
        if data is None or len(self.data) <= 0:
            raise Error("Data has not been provided or an empty array has been given.")

        self.X = pd.DataFrame([], columns=np.arange(0, 2000, 1, dtype=int))
        self.y = pd.DataFrame()

        sequences = data['Sequence']
        self.y = data['Solubility(%)']
        
        for idx, sequence in sequences.items():
            current_value = sequences.iloc[0]
            serie = pd.Series([], dtype='float64')
            
            for acid in sequence:
                serie = pd.concat([serie, pd.Series([ord(acid)])])

            if len(serie) < ANN.MaxAminoAcidCount():
                zeros_to_add = ANN.MaxAminoAcidCount() - len(serie)

                serie = pd.concat([serie, pd.Series(np.zeros(zeros_to_add))])

            self.X = pd.concat([self.X, pd.DataFrame(serie)])

        self.y = normalize(self.y)
    
    def split(self):
        if self.X is None or self.y is None:
            raise Error("The data has not been processed. Please process the data first.")

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)
    
    def fit(self):
        if self.X_train is None or self.y_train is None:
            raise Error("The Training set does not exist. Please split the data using 'split()' before training the network.")

        self.model = tf.keras.Sequential([
            layers.Dense(64),
            layers.Dense(32),
            layers.Dense(16),
            layers.Dense(8),
            layers.Dense(1)
        ])

        self.model.compile(loss = tf.keras.losses.MeanSquaredError(),
        optimizer = tf.keras.optimizers.Adam())

        self.model.fit(self.X_train, self.y_train, epochs=25)

        return self.model

    def evaluate(self):
        if self.model is None:
            raise Error("Network Model is not present. Train the network first before evaluating.")
        
        # Should not occur as its a pre-condition for `fit()` and the precondition of evaluate is for `fit()` to have been run.
        if self.X_test is None or self.y_test is None:
            raise Error("The Training set does not exist. Please split the data using 'split()' before training the network.")

        eval_results = self.model.evaluate(self.X_test, self.y_test)
        print("loss, accuracy", eval_results)

        return eval_results

In [50]:
network = ANN(data)

In [52]:
network.process()

KeyboardInterrupt: 

In [None]:
network.split()

In [None]:
network.X

In [None]:
network.fit()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8ea8948f-b978-492f-a6dd-8ac5feb4b472' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>