In [9]:
import numpy as np
import pandas as pd

import keras.metrics as metrics
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder

In [10]:
# Load dataset

# NOTE(review): absolute, machine-specific path -- this cell only runs on the
# original author's machine; consider a configurable data directory.
full_dataset = pd.read_csv("/Users/nickpark/Desktop/codon-data/codon_usage.csv", low_memory=False)

# Keep only the rows whose DNAtype is 0, 1 or 2 (the three classes modelled below).
dna_subset = full_dataset[full_dataset["DNAtype"].isin([0, 1, 2])]

# Remove irrelevant columns

codon_frame = dna_subset.drop(columns=["Kingdom", "SpeciesID", "Ncodons", "SpeciesName"])

# Remove rows containing values that cannot be parsed as numbers:
# unparseable cells are coerced to NaN, then every row holding a NaN is dropped.
# (`dropna()` replaces the former `applymap(np.isnan).any(1)` mask, whose
# positional `axis` argument is rejected by pandas 2.x.)

codon_numeric = codon_frame.apply(pd.to_numeric, errors='coerce').dropna()

# Downstream cells expect `dataset` to be a 2-D float64 NumPy array.
dataset = codon_numeric.to_numpy(dtype='float64')

dataset

array([[0.000e+00, 1.654e-02, 1.203e-02, ..., 2.510e-03, 5.000e-04,
        0.000e+00],
       [0.000e+00, 2.714e-02, 1.357e-02, ..., 2.710e-03, 6.800e-04,
        0.000e+00],
       [0.000e+00, 1.974e-02, 2.180e-02, ..., 3.910e-03, 0.000e+00,
        1.440e-03],
       ...,
       [1.000e+00, 1.423e-02, 3.321e-02, ..., 3.560e-03, 1.190e-03,
        2.017e-02],
       [0.000e+00, 1.757e-02, 2.028e-02, ..., 9.900e-04, 7.900e-04,
        1.560e-03],
       [1.000e+00, 1.778e-02, 3.724e-02, ..., 1.560e-03, 1.140e-03,
        2.161e-02]])

In [8]:
# Column 0 is used as the class label (presumably DNAtype after the column
# drop -- confirm); every remaining column is a feature.
y = dataset[:, 0].astype('int')
X = dataset[:, 1:]

# Report the feature-matrix shape first, then the label-vector shape.
for split_array in (X, y):
    print(split_array.shape)

(12980, 64)
(12980,)


In [None]:
# Multi-class classification with Keras

# Map the class values onto consecutive integers; `encoder` stays fitted so it
# can be reused afterwards.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(y)

# One-hot ("dummy variable") representation of the integer class labels.
dummy_y = np_utils.to_categorical(encoded_Y)

# Define baseline model
def baseline_model(input_dim=64, n_classes=3):
    """Build and compile the baseline feed-forward classifier.

    The network has one hidden ReLU layer of 15 units followed by a softmax
    output layer, and is compiled with categorical cross-entropy, the Adam
    optimizer and the accuracy metric.

    Parameters
    ----------
    input_dim : int, default 64
        Number of input features per sample.
    n_classes : int, default 3
        Number of output classes (width of the softmax layer).

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    # create model
    model = Sequential()
    model.add(Dense(15, input_dim=input_dim, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Wrap the Keras model so scikit-learn can clone and refit it for each fold.
estimator = KerasClassifier(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)

# A fixed seed makes the fold assignment repeatable across runs
# (the network's weight initialisation is still random).
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate every metric in ONE cross-validation pass so that
#   * each fold's model is trained once instead of three times, and
#   * all three metrics are measured on exactly the same splits.
# The integer labels (`encoded_Y`) are used rather than the one-hot matrix:
# `predict()` yields 1-D class labels, and scikit-learn's f1 / multiclass
# ROC-AUC scorers cannot compare those against a 2-D one-hot target.
# NOTE(review): this relies on the KerasClassifier wrapper one-hot encoding
# integer labels itself for a categorical_crossentropy model -- confirm for
# the installed Keras version.
cv_scores = cross_validate(
    estimator,
    X,
    encoded_Y,
    cv=kfold,
    scoring={
        "accuracy": "accuracy",
        "roc_auc_ovr": "roc_auc_ovr",
        "f1_macro": "f1_macro",
    },
)

accuracies = cv_scores["test_accuracy"]
rocs = cv_scores["test_roc_auc_ovr"]
macros = cv_scores["test_f1_macro"]

print("Accuracy:", accuracies.mean())
print("AUROC:", rocs.mean())
print("Macro F1:", macros.mean())
