# Load related packages

In [12]:
import cPickle
import numpy as np
import keras
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop

# Function related to loading data

In [13]:
def Pfam_from_pickle_file_encoding(name_list_pickle_filename,model_names_list_filename,feature_size=16306):
    """Build one multi-hot vector per sequence from two pickle files.

    Parameters
    ----------
    name_list_pickle_filename : str
        Path to a pickle holding a sequence with one entry per input
        sequence; each entry is an iterable of model names (possibly empty).
    model_names_list_filename : str
        Path to a pickle holding the list of all model names. The position
        of a name in this list is the column that gets set to 1.
    feature_size : int, optional
        Length of every encoding vector. Defaults to 16306, the width that
        was previously hard-coded here.

    Returns
    -------
    list of numpy.ndarray
        One 1-D float array of length ``feature_size`` per entry of the
        name list, with a 1 at the column of every name found for it.

    Raises
    ------
    ValueError
        If a name is not present in the model-name list (the same exception
        type ``list.index`` raised before).
    IndexError
        If a name's column is >= ``feature_size``.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only load
    # pickle files that come from a trusted source.
    # Pickle data is binary, so the files must be opened in 'rb' mode;
    # text mode corrupts the stream on Windows and fails on Python 3.
    with open(name_list_pickle_filename,'rb') as f:
        name_list=cPickle.load(f)

    with open(model_names_list_filename,'rb') as f:
        model_list=cPickle.load(f)

    # Name -> column lookup built once, instead of a linear list.index()
    # scan per name. setdefault keeps the FIRST occurrence of a duplicated
    # name, which is what list.index() returned.
    # Assumes model names are hashable (e.g. strings) -- TODO confirm.
    column_of={}
    for column,model_name in enumerate(model_list):
        column_of.setdefault(model_name,column)

    encoding=[]
    for i,names in enumerate(name_list):
        if i%10000==0:
            print('Processing %dth sequence.'%i)
        single_encoding=np.zeros(feature_size)
        # An empty entry simply leaves the vector all zeros.
        for single_name in names:
            try:
                column=column_of[single_name]
            except KeyError:
                raise ValueError('%r is not in list'%(single_name,))
            single_encoding[column]=1
        encoding.append(single_encoding)
    return encoding

# Load the data

In [14]:
# Encode both data sets against the same list of model names so that the
# columns of the two feature matrices line up.
enzyme_feature=Pfam_from_pickle_file_encoding(
    'Pfam_name_list_new_data.pickle',
    'Pfam_model_names_list.pickle')
non_enzyme_feature=Pfam_from_pickle_file_encoding(
    'Pfam_name_list_non_enzyme.pickle',
    'Pfam_model_names_list.pickle')

# Stack enzymes first, then non-enzymes; the labels below rely on this order.
feature = np.concatenate([enzyme_feature, non_enzyme_feature], axis=0)

# Label 1 for every enzyme row and 0 for every non-enzyme row. The counts are
# taken from the loaded data instead of a hard-coded 22168 per class, so the
# labels always match the number of rows in `feature`.
label = np.concatenate([np.ones(len(enzyme_feature)),
                        np.zeros(len(non_enzyme_feature))])

# One-hot encode the labels into two columns for the softmax output layer.
label = tf.keras.utils.to_categorical(label,num_classes=2)

Processing 0th sequence.
Processing 10000th sequence.
Processing 20000th sequence.
Processing 0th sequence.
Processing 10000th sequence.
Processing 20000th sequence.


# Define hyper-parameters

In [15]:
# --- Data split ---
# Fraction of the samples held out for testing; the rest is used for training.
test_ratio = 0.1

# --- Network dimensions ---
# Length of one input feature vector (size of the network's input layer).
number_features = 16306
# Number of target classes (size of the network's output layer).
number_class = 2

# --- Optimisation ---
# Number of samples per mini-batch during training.
batch_size = 1024
# Number of training epochs.
epochs = 5

# Split the data into training and testing sets

In [16]:
# Hold out `test_ratio` of the samples for testing; the fixed random_state
# makes the split reproducible from run to run.
(x_train, x_test,
 y_train, y_test) = train_test_split(feature,
                                     label,
                                     test_size=test_ratio,
                                     random_state=0)

# Build the network

In [17]:
# Linear stack of layers: two 1024-unit ReLU hidden layers, each followed by
# dropout (rate 0.3) to reduce overfitting, then a softmax classifier with one
# output unit per class.
model = Sequential([
    Dense(1024, activation='relu', input_shape=(number_features,)),
    Dropout(0.3),
    Dense(1024, activation='relu'),
    Dropout(0.3),
    Dense(number_class, activation='softmax'),
])

# Print the layer-by-layer structure and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 1024)              16698368  
_________________________________________________________________
dropout_5 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dropout_6 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 2)                 2050      
Total params: 17,750,018
Trainable params: 17,750,018
Non-trainable params: 0
_________________________________________________________________


# Define loss, optimizer (update rule), and metrics of monitoring the training process

In [18]:
# Categorical cross-entropy matches the one-hot labels and softmax output.
# Use the documented optimizer class `Adam` (default settings): the lowercase
# `keras.optimizers.adam` alias is not part of the documented API and is
# missing from newer Keras releases.
model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

# Run the training loop

In [19]:
# Train for `epochs` passes over the training split in mini-batches of
# `batch_size`. The held-out (x_test, y_test) split is passed as validation
# data; the object returned by fit() is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(x_test, y_test),
)

Train on 39902 samples, validate on 4434 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Evaluate the trained model

In [21]:
# Score the trained model on the held-out split without a progress bar.
# The result is indexed below as score[0] -> loss, score[1] -> accuracy.
score = model.evaluate(x=x_test, y=y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

('Test loss:', 0.1959872514036361)
('Test accuracy:', 0.9454217409840241)
