In [1]:
import numpy as np
import pandas as pd

from keras.utils import np_utils
import keras.metrics as metrics

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import sklearn.metrics
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from itertools import combinations 

from sklearn.feature_selection import RFECV

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import (KNeighborsClassifier,
                               NeighborhoodComponentsAnalysis)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score

In [2]:
# Load dataset 

full_dataset = pd.read_csv("/Users/nickpark/Desktop/codon-data/codon_usage.csv", low_memory=False)
dataset = full_dataset[full_dataset["DNAtype"].eq(0) | full_dataset["DNAtype"].eq(1) | full_dataset["DNAtype"].eq(2)]

# Remove irrelevant columns

dataset = dataset.drop(["Kingdom", "SpeciesID", "Ncodons", "SpeciesName"], axis=1)

# Remove weird rows

dataset = dataset.apply(pd.to_numeric, errors='coerce')
dataset = dataset[~dataset.applymap(np.isnan).any(1)]
dataset = dataset.to_numpy(dtype='float64')

dataset

array([[0.000e+00, 1.654e-02, 1.203e-02, ..., 2.510e-03, 5.000e-04,
        0.000e+00],
       [0.000e+00, 2.714e-02, 1.357e-02, ..., 2.710e-03, 6.800e-04,
        0.000e+00],
       [0.000e+00, 1.974e-02, 2.180e-02, ..., 3.910e-03, 0.000e+00,
        1.440e-03],
       ...,
       [1.000e+00, 1.423e-02, 3.321e-02, ..., 3.560e-03, 1.190e-03,
        2.017e-02],
       [0.000e+00, 1.757e-02, 2.028e-02, ..., 9.900e-04, 7.900e-04,
        1.560e-03],
       [1.000e+00, 1.778e-02, 3.724e-02, ..., 1.560e-03, 1.140e-03,
        2.161e-02]])

In [3]:
X = dataset[:,1:]
y = dataset[:,0].astype('int')
print(X.shape)
print(y.shape)

(12980, 64)
(12980,)


In [4]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)

# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

X_train, X_test, y_train, y_test = train_test_split(X, dummy_y, test_size=0.20, stratify = dummy_y)

In [5]:
classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

sklearn.metrics.roc_auc_score(y_test, y_pred, multi_class='ovr')

0.990088341921604

In [6]:
# Wrapper function with 5-fold cross-validation

def train_knn(X, y, k=3):
    
    classifier = KNeighborsClassifier(n_neighbors=k)
    roc_scores = cross_val_score(
        classifier,
        X, 
        y, 
        cv=5, 
        scoring='roc_auc_ovr', 
    )
    
    roc_average = sum(roc_scores)/len(roc_scores)
    
    f1_micro_scores = cross_val_score(
        classifier,
        X, 
        y, 
        cv=5, 
        scoring='f1_micro', 
    )
    
    f1_micro_average = sum(f1_micro_scores)/len(f1_micro_scores)
    
    f1_macro_scores = cross_val_score(
        classifier,
        X, 
        y, 
        cv=5, 
        scoring='f1_macro', 
    )
    
    f1_macro_average = sum(f1_macro_scores)/len(f1_macro_scores)
    
    accuracy_scores = cross_val_score(
        classifier,
        X, 
        y, 
        cv=5, 
        scoring='accuracy', 
    )
    
    accuracy_average = sum(accuracy_scores)/len(accuracy_scores)
    
    return roc_average, f1_micro_average, f1_macro_average, accuracy_average

In [7]:
X = dataset[:,1:].astype('float')

y = dataset[:,0].astype('str')


# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)

# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

for k in [1,2,3,4,5]:
    roc_average, f1_micro_average, f1_macro_average, accuracy_average = train_knn(X, y, k=k)
    print('scores for', k, ':', roc_average, f1_micro_average, f1_macro_average, accuracy_average)

scores for 1 : 0.9689345611170717 0.9711093990755006 0.9360099278073882 0.9711093990755006
scores for 2 : 0.977276270094408 0.9699537750385208 0.9351451600181878 0.9699537750385208
scores for 3 : 0.9795064003270367 0.9713405238828967 0.9343384020978013 0.9713405238828967
scores for 4 : 0.9805340170768375 0.9712634822804315 0.9334032084178606 0.9712634822804315
scores for 5 : 0.982289424319239 0.9713405238828967 0.9333616670481479 0.9713405238828967


In [8]:
# Feature selection using PCA:

def train_knn_with_pca(X, y, p=15, random_state=13):

    pca = make_pipeline(
        StandardScaler(),
        PCA(n_components=p, random_state=random_state)
    )
    
    pca.fit(X,y)
    
    return train_knn(pca.transform(X),y)

In [9]:
X = dataset[:,1:].astype('float')

y = dataset[:,0].astype('str')


# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)

# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_y)

for p in range(1,64):
    scores = train_knn_with_pca(X, y, p=p)
    print('With', p, 'pca features, got scores of', scores)

With 1 pca features, got scores of (0.643742827885795, 0.6356702619414484, 0.43847667854630357, 0.6356702619414484)
With 2 pca features, got scores of (0.8994087431408271, 0.8926810477657936, 0.757632677102782, 0.8926810477657936)
With 3 pca features, got scores of (0.9308779781823764, 0.9287365177195686, 0.8395822634024309, 0.9287365177195686)
With 4 pca features, got scores of (0.9463586813121957, 0.9410631741140214, 0.8664889330919665, 0.9410631741140214)
With 5 pca features, got scores of (0.9634375358311932, 0.9553929121725732, 0.897329166810442, 0.9553929121725732)
With 6 pca features, got scores of (0.9695469554223017, 0.9592449922958398, 0.9089050646776045, 0.9592449922958398)
With 7 pca features, got scores of (0.966387135645396, 0.9587057010785826, 0.9102083697213432, 0.9587057010785826)
With 8 pca features, got scores of (0.9683478044929279, 0.9619414483821263, 0.9168320956880429, 0.9619414483821263)
With 9 pca features, got scores of (0.9696867026916018, 0.963482280431433, 