<a href="https://colab.research.google.com/github/pavankumarallu/Bolt-PRO/blob/main/Boltpro_StackGAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np


from matplotlib import pyplot as plt
%matplotlib inline

import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import scipy as sp
import pandas as pd
import sys, os, math, argparse

In [6]:
from keras.models import Model, Sequential
from keras.layers import GRU, LSTM
from keras.layers import Input, Dropout, Dense, Reshape, Flatten, Activation
# from keras.layers.merge import _Merge
from keras.layers import concatenate
from keras.layers import Conv1D, Conv2D
from keras.layers.convolutional import Convolution2D, Conv2DTranspose, UpSampling2D
from tensorflow.keras.layers import BatchNormalization
from keras.layers import Activation, ZeroPadding2D
from keras.layers import TimeDistributed, RepeatVector
# from keras.layers.advanced_activations import LeakyReLU
from keras.layers import LeakyReLU
from keras import optimizers
from keras import losses
from keras.datasets import mnist
from keras.utils import np_utils
from keras.preprocessing import sequence as ksq
from keras import backend as K
from functools import partial
from keras.optimizers import Adam

In [12]:
data = pd.read_csv('/content/drive/MyDrive/Sequence_HSepians.csv')

In [13]:
data.head()

Unnamed: 0,simple_fasta
0,MSLIQKEAQGQSGTDQTVVVLSNPTYYMSNDIPYTFHQDNNFLYLC...
1,MQRDHTMDYKESCPSVSIPSSDEHREKKKRFTVYKVLVSVGRSEWF...
2,MARLSGTVGVAAVTAGPGLTNTVTAVKNAQMAQSPILLLGGAASTL...
3,MATSLDFKTYVDQACRAAEEFVNIYYETMDKRRRALTRLYLDKATL...
4,MVFRRFVEVGRVAYVSFGPHAGKLVAIVDVIDQNRALVDGPCTQVR...


In [68]:
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [69]:
# maximum length of sequence, everything afterwards is discarded!
max_length = 400

#create and fit tokenizer
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(data['simple_fasta'])
#represent input data as word rank number sequences
train_X = tokenizer.texts_to_sequences(data['simple_fasta'])
train_X = pd.DataFrame(pad_sequences(train_X, maxlen=max_length))

In [70]:
train_X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,3,14,10,13,8,16,11,15,17,12,...,10,13,2,12,13,19,3,12,4,3
1,0,0,0,0,0,0,0,0,0,0,...,18,10,3,8,5,17,12,3,10,8
2,0,0,0,0,0,0,0,0,0,0,...,1,10,13,8,12,4,3,8,12,12
3,0,0,0,0,0,0,0,0,0,0,...,19,15,9,15,12,10,20,3,3,3
4,0,0,0,0,0,0,0,0,0,0,...,4,5,5,18,2,12,14,14,12,2


In [71]:
max(train_X.max())

20

In [72]:
class proteinGAN():
    def __init__(self, n_aa=max_length, n_render=1, latent_dim=10):
        self.n_aa = n_aa
        self.n_render = n_render
        self.pro_shape = (self.n_aa, self.n_render)
        self.latent_dim = latent_dim

        optimizer = Adam(0.0002, 0.5)

        # Build and compile the discriminator
        self.discriminator = self.build_discriminator()
        # Build the generator
        self.generator = self.build_generator()
        # Generated peptide
        z = Input(shape=(self.latent_dim,))
        peptide = self.generator(z)
        # The discriminator takes generated peptides as input and determines validity
        validity = self.discriminator(peptide)
        # For the combined model we will only train the generator
        for layer in self.generator.layers:
            layer.trainable=True
        for layer in self.discriminator.layers:
            layer.trainable=False
        self.generator.trainable=True
        self.discriminator.trainable = False
        self.combined = Model(z, validity)
        self.combined.compile(loss='binary_crossentropy', optimizer=optimizer)
        self.combined.summary()
        for layer in self.generator.layers:
            layer.trainable=False
        for layer in self.discriminator.layers:
            layer.trainable=True
        self.generator.trainable=False
        self.discriminator.trainable = True
        self.discriminator.compile(loss='binary_crossentropy',
            optimizer=optimizer,
            metrics=['accuracy'])
    
    def build_generator(self):

        model = Sequential()
        model.add(Dense(256, input_shape=(self.latent_dim,)))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(512))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(1024))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(np.prod(self.pro_shape), activation='tanh'))
        model.add(Reshape(self.pro_shape))

        model.summary()

        noise = Input(shape=(self.latent_dim,))
        peptide = model(noise)

        return Model(noise, peptide)

    def build_discriminator(self):

        model = Sequential()

        model.add(Flatten(input_shape=self.pro_shape))
        model.add(Dense(512))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(256))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.summary()

        peptide = Input(shape=self.pro_shape)
        validity = model(peptide)
        return Model(peptide, validity)
    
    def train(self, X, epochs, batch_size=128, sample_interval=50):
        # Load the dataset
        X_train = X

        # Rescale
        X_train = X_train / 20.0

        # Adversarial ground truths
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))

        for epoch in range(epochs):

            # ---------------------
            #  Train Discriminator
            # ---------------------

            # Select a random batch
            idx = np.random.randint(0, X_train.shape[0], batch_size)
            peptides = X_train[idx]

            noise = np.random.normal(0, 1, (batch_size, self.latent_dim ))

            # Generate a new batch
            gen_peptides = self.generator.predict(noise)

            # Train the discriminator
            d_loss_real = self.discriminator.train_on_batch(peptides, valid)
            d_loss_fake = self.discriminator.train_on_batch(gen_peptides, fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # ---------------------
            #  Train Generator
            # ---------------------
            noise = np.random.normal(0, 1, (batch_size, self.latent_dim ))

            # Train the generator (to have the discriminator label samples as valid)
            g_loss = self.combined.train_on_batch(noise, valid)

            # Plot the progress
            print ("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss))

            # If at save interval
            if epoch % sample_interval == 0:
                self.sample_peptides(epoch)
            
        return self.combined
    
    def sample_peptides(self, epoch):
        noise = np.random.normal(0, 1, (1, self.latent_dim))
        gen_peptides = self.generator.predict(noise)

        # Rescale images 0 - 1
        gen_peptides = 20.0 * gen_peptides
        return gen_peptides

In [73]:
np.random.normal(0,1,5).shape

(5,)

In [74]:
pro_gan = proteinGAN()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_3 (Flatten)         (None, 400)               0         
                                                                 
 dense_21 (Dense)            (None, 512)               205312    
                                                                 
 leaky_re_lu_15 (LeakyReLU)  (None, 512)               0         
                                                                 
 dense_22 (Dense)            (None, 256)               131328    
                                                                 
 leaky_re_lu_16 (LeakyReLU)  (None, 256)               0         
                                                                 
 dense_23 (Dense)            (None, 1)                 257       
                                                                 
Total params: 336,897
Trainable params: 336,897
Non-tr

In [75]:
train_X.values.shape

(6456, 400)

In [76]:
train_X_3 = np.reshape(train_X.values, (train_X.shape[0], train_X.shape[1], 1))

In [77]:
pro_gan.train(X=train_X_3,  epochs=300, batch_size=32, sample_interval=200)

0 [D loss: 0.559327, acc.: 57.81%] [G loss: 0.602466]
1 [D loss: 0.471426, acc.: 53.12%] [G loss: 0.531420]
2 [D loss: 0.444248, acc.: 50.00%] [G loss: 0.446733]
3 [D loss: 0.415827, acc.: 54.69%] [G loss: 0.373658]
4 [D loss: 0.354211, acc.: 96.88%] [G loss: 0.397156]
5 [D loss: 0.260465, acc.: 100.00%] [G loss: 0.377389]
6 [D loss: 0.199350, acc.: 100.00%] [G loss: 0.381437]
7 [D loss: 0.217494, acc.: 100.00%] [G loss: 0.362095]
8 [D loss: 0.154608, acc.: 100.00%] [G loss: 0.387876]
9 [D loss: 0.136955, acc.: 100.00%] [G loss: 0.514702]
10 [D loss: 0.169159, acc.: 100.00%] [G loss: 0.488575]
11 [D loss: 0.175530, acc.: 100.00%] [G loss: 0.521162]
12 [D loss: 0.192212, acc.: 98.44%] [G loss: 0.721539]
13 [D loss: 0.181732, acc.: 98.44%] [G loss: 0.967330]
14 [D loss: 0.184510, acc.: 98.44%] [G loss: 1.116003]
15 [D loss: 0.192848, acc.: 98.44%] [G loss: 1.471613]
16 [D loss: 0.183676, acc.: 96.88%] [G loss: 1.698413]
17 [D loss: 0.139417, acc.: 100.00%] [G loss: 1.969699]
18 [D loss: 

<keras.engine.functional.Functional at 0x7fd6c65592e0>

In [62]:
tmp_arr = list(np.abs(np.reshape(pro_gan.sample_peptides(1), (max_length,)).astype(np.int)))



Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  tmp_arr = list(np.abs(np.reshape(pro_gan.sample_peptides(1), (max_length,)).astype(np.int)))


In [84]:
alpha=list('ARNDCQEGHILKMFPSTWYVOXUZB')

In [85]:
seq = ''.join([alpha[id] for id in tmp_arr])

In [86]:
max(tmp_arr)

24

In [87]:
len(seq)

400

In [88]:
ran_seq = "CAHBHBKCICAKMNKHEEDOCNRDEFEHOBOMENEIGHGGDBCCNFCNLKFNMBECAKEDQHGPELAKLECADNLCDBENCNCOOGOFGEOAFFLCPBLDJMFPIJLHCMFAAADEAKDGCMFPJBBFEEEFFDCNQAFAGCJBIHDGQFAAIGFJDMGEHCCDJFFJIDHCBHCLGAKKFNIIBLFAFGHMIHLEIJCNKANHHBKCIAOFLODIHMDKDCJKRCELRCAPBLKFAQKDBBBQCFENDTJRHDHOGAMCGIEECDKDGAACKGFHDCEIEGBEPCEHCEECADJFMBFKMEFPJFEINAELMLKEBBCHIQLDAJHKDFCJCDKFLEALDFHMSFMGFBCPKDFFGEGKMIFCAJFBOEMFFABDGFMJCOCBEFLECFKAKLDDJNHDLBMFDMMCBHKNHGELCFCAGIDABNBABBDPAGCDIDEDINKEINBEKDEFCNBFAHHMLGNAJAAFUCGNSIEMHKOQQANLDLQDAECIFFAGFDUIAPEEADIJMJLCHGEBJPGIOGAOKBDBOJPCLJFFPBEAMIGAGGGRMGMGHHKMIGMKGBOHGMFIDCAGFLHIFKTDCCBAAHPADHDIEIJCBCFFEFAHGGJABGDNJADFLNIDADKDBBQELJDIKDJJNCBVDJCMDDAEOEHBBFHEFEFEAAHCEEHHHBNGADCEACACEDACMCEIMKIDHJAFJJJMEEOHJAAJAEKGEASFCDAPBHJDDNLDEDDCLACHAUBKNBIQDGEDFBFBMDCFAEEEBEDJAILGHGFPGBBMKAOCFHCFOMLFQNBPGUDDQBOFALINEBJKIBCCPFCEEKPJBEIRFLCEBBODBEEBEIAEDUCKFGFAFCFSSPUEBCMBUJFEORCQLEGPCIQUCEBBKCCBAIQFJAJQBTMLJSKQBQAHRBCABUCAACJMCAEKGBFQMGHICBEEABMBCDKHJBEEMJMBEBINODFDBJABCAALJCHBHEDEGENBIBQSHDIIGEFJDOGKBOJBANFLQFGSERHEKJSIAFJIKLIOSLDJ"

In [89]:
len(ran_seq)

1024

In [90]:
seq


'YFRVNLWGENCCIREHKCDQLFNGDNDHCEVYSSEQHFMPDENAHLGKLRGAHPAALFEKEXMEHHIHAMQDPSKPDCNGNLIKMDCORQPNNTQEENEFNEDQNHAFEWASRAMLWPWYPGNIDNCDCRNMAFENRRALAEHMGRYRKYNENYCKLRGKSNTWFILHNETILAKDMFLIMMYLIFIRGHEWNTDFQNYQLAMMSADDCTTQLCNCYVYDQWNLFKSMMDDFYRWLPOSHQYGYFTONRGCDLGPHQCLTETRVKOGWNQGWQKHWXSKIKQDCIWAWCNEMDKRKQVOYPVFHGNTFNMKFCFSQGVXWIAPVIAQHCTLWPRYDKAXMMMGBOTUFACTPFSGITWIWCFIZEHUDOSHSSEPYPARDFWWSYAWTXXVXFSMXMUAF'