In [1]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

import keras.metrics as metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the codon-usage table and keep every row whose Kingdom is not 'plm'.

full_dataset = pd.read_csv("/Users/nickpark/Desktop/codon-data/codon_usage.csv", low_memory=False)
dataset = full_dataset.loc[full_dataset["Kingdom"] != 'plm']

# Remove the columns treated as irrelevant for the model.

dataset = dataset.drop(columns=["DNAtype", "SpeciesID", "Ncodons", "SpeciesName"])

# Collapse six of the Kingdom codes into the single label 'euk'; the remaining
# codes map to themselves. Any code missing from this table becomes NaN.
kingdom_labels = {code: 'euk' for code in ('pln', 'inv', 'vrt', 'mam', 'rod', 'pri')}
kingdom_labels.update({code: code for code in ('bct', 'vrl', 'arc', 'plm', 'phg')})
dataset["Kingdom"] = dataset["Kingdom"].map(kingdom_labels)

# Show which labels remain after the relabelling.
print(set(dataset["Kingdom"]))

# Remove malformed rows
#
# Every column except "Kingdom" is coerced to a number; a cell that cannot be
# parsed becomes NaN (errors='coerce') instead of raising.
cols = [name for name in dataset.columns if name != "Kingdom"]
for col in cols:
    dataset[col] = pd.to_numeric(dataset[col], errors='coerce')

# Drop every row that contains at least one NaN in any column. dropna() is
# the direct equivalent of the former `applymap(pd.isnull).any(1)` mask, which
# relied on a positional `axis` argument that pandas 2.x no longer accepts.
dataset = dataset.dropna()

# Downstream cells slice `dataset` positionally, so hand it over as an array.
dataset = dataset.to_numpy()

dataset

{'arc', 'phg', 'bct', 'vrl', 'euk'}


array([['vrl', 0.01654, 0.012029999999999999, ..., 0.00251, 0.0005, 0.0],
       ['vrl', 0.027139999999999997, 0.013569999999999999, ..., 0.00271,
        0.0006799999999999999, 0.0],
       ['vrl', 0.01974, 0.0218, ..., 0.00391, 0.0, 0.00144],
       ...,
       ['euk', 0.014230000000000001, 0.03321, ..., 0.0035600000000000002,
        0.00119, 0.02017],
       ['euk', 0.01757, 0.02028, ..., 0.00099, 0.00079, 0.00156],
       ['euk', 0.01778, 0.037239999999999995, ..., 0.00156, 0.00114,
        0.02161]], dtype=object)

In [14]:
# Column 0 of the array is the class label; every other column is a feature.
y = dataset[:, 0].astype(str)
X = dataset[:, 1:].astype(float)

# Map the string labels to integer codes ...
encoder = LabelEncoder().fit(y)
encoded_Y = encoder.transform(y)

# ... and expand the integer codes into one-hot rows for the softmax output.
dummy_y = np_utils.to_categorical(encoded_Y)

print(X, dummy_y)

[[0.01654 0.01203 0.0005  ... 0.00251 0.0005  0.     ]
 [0.02714 0.01357 0.00068 ... 0.00271 0.00068 0.     ]
 [0.01974 0.0218  0.01357 ... 0.00391 0.      0.00144]
 ...
 [0.01423 0.03321 0.01661 ... 0.00356 0.00119 0.02017]
 [0.01757 0.02028 0.00767 ... 0.00099 0.00079 0.00156]
 [0.01778 0.03724 0.01732 ... 0.00156 0.00114 0.02161]] [[0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]]
{'arc', 'phg', 'bct', 'vrl', 'euk'}


In [17]:
def baseline_model(n_features=64, n_classes=5, hidden_units=20):
    """Build and compile a one-hidden-layer softmax classifier.

    The defaults reproduce the original fixed 64 -> 20 -> 5 network, so a
    zero-argument call (as made through ``build_fn``) behaves as before.

    Parameters
    ----------
    n_features : int, default 64
        Width of each input row; passed to the first layer as ``input_dim``.
    n_classes : int, default 5
        Number of units in the softmax output layer.
    hidden_units : int, default 20
        Number of units in the ReLU hidden layer.

    Returns
    -------
    Sequential
        Model compiled with categorical cross-entropy loss, the Adam
        optimizer, and accuracy as the reported metric.
    """
    # Create model
    model = Sequential()
    model.add(Dense(hidden_units, input_dim=n_features, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Wrap the Keras model builder in the scikit-learn estimator interface.
estimator = KerasClassifier(build_fn=baseline_model, epochs=100, batch_size=5, verbose=2)

# 10-fold split with shuffling. A fixed random_state pins the fold assignment
# so repeated runs score the same splits; without it every run reshuffles.
# NOTE: this only fixes the splits -- network weight initialisation is not seeded here.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [18]:
# Fit and score the estimator once per fold; `results` holds one score per fold.
results = cross_val_score(estimator=estimator, X=X, y=dummy_y, cv=kfold)

# Report the mean and standard deviation of the fold scores as percentages.
print("Baseline: %.2f%% (%.2f%%)" % (100 * results.mean(), 100 * results.std()))

Epoch 1/100
2342/2342 - 2s - loss: 0.9787 - accuracy: 0.5857
Epoch 2/100
2342/2342 - 2s - loss: 0.6468 - accuracy: 0.7803
Epoch 3/100
2342/2342 - 2s - loss: 0.5162 - accuracy: 0.8370
Epoch 4/100
2342/2342 - 2s - loss: 0.4489 - accuracy: 0.8567
Epoch 5/100
2342/2342 - 2s - loss: 0.4058 - accuracy: 0.8678
Epoch 6/100
2342/2342 - 3s - loss: 0.3758 - accuracy: 0.8784
Epoch 7/100
2342/2342 - 2s - loss: 0.3537 - accuracy: 0.8855
Epoch 8/100
2342/2342 - 2s - loss: 0.3359 - accuracy: 0.8908
Epoch 9/100
2342/2342 - 2s - loss: 0.3228 - accuracy: 0.8950
Epoch 10/100
2342/2342 - 2s - loss: 0.3099 - accuracy: 0.8977
Epoch 11/100
2342/2342 - 2s - loss: 0.2995 - accuracy: 0.9009
Epoch 12/100
2342/2342 - 2s - loss: 0.2897 - accuracy: 0.9030
Epoch 13/100
2342/2342 - 2s - loss: 0.2788 - accuracy: 0.9053
Epoch 14/100
2342/2342 - 2s - loss: 0.2704 - accuracy: 0.9087
Epoch 15/100
2342/2342 - 3s - loss: 0.2611 - accuracy: 0.9127
Epoch 16/100
2342/2342 - 2s - loss: 0.2540 - accuracy: 0.9136
Epoch 17/100
2342

Epoch 33/100
2342/2342 - 2s - loss: 0.2030 - accuracy: 0.9280
Epoch 34/100
2342/2342 - 2s - loss: 0.2001 - accuracy: 0.9304
Epoch 35/100
2342/2342 - 2s - loss: 0.1983 - accuracy: 0.9295
Epoch 36/100
2342/2342 - 2s - loss: 0.1953 - accuracy: 0.9306
Epoch 37/100
2342/2342 - 2s - loss: 0.1935 - accuracy: 0.9306
Epoch 38/100
2342/2342 - 2s - loss: 0.1923 - accuracy: 0.9324
Epoch 39/100
2342/2342 - 2s - loss: 0.1897 - accuracy: 0.9329
Epoch 40/100
2342/2342 - 2s - loss: 0.1870 - accuracy: 0.9347
Epoch 41/100
2342/2342 - 2s - loss: 0.1851 - accuracy: 0.9353
Epoch 42/100
2342/2342 - 2s - loss: 0.1840 - accuracy: 0.9341
Epoch 43/100
2342/2342 - 2s - loss: 0.1819 - accuracy: 0.9396
Epoch 44/100
2342/2342 - 2s - loss: 0.1794 - accuracy: 0.9373
Epoch 45/100
2342/2342 - 2s - loss: 0.1791 - accuracy: 0.9363
Epoch 46/100
2342/2342 - 2s - loss: 0.1769 - accuracy: 0.9382
Epoch 47/100
2342/2342 - 2s - loss: 0.1758 - accuracy: 0.9385
Epoch 48/100
2342/2342 - 2s - loss: 0.1742 - accuracy: 0.9388
Epoch 49

KeyboardInterrupt: 