In [17]:
import numpy as np
import pandas as pd

from keras.utils import np_utils
import keras.metrics as metrics

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import sklearn.metrics
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from itertools import combinations 

from sklearn.feature_selection import RFECV

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import (KNeighborsClassifier,
                               NeighborhoodComponentsAnalysis)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score

In [7]:
# Load dataset 

full_dataset = pd.read_csv("/Users/nickpark/Desktop/codon-data/codon_usage.csv", low_memory=False)
dataset = full_dataset[~full_dataset["Kingdom"].eq('plm')]

# Remove irrelevant columns

dataset = dataset.drop(["DNAtype", "SpeciesID", "Ncodons", "SpeciesName"], axis=1)
dataset["Kingdom"] = dataset["Kingdom"].map({
    'pln': 'euk',
    'inv': 'euk',
    'vrt': 'euk',
    'mam': 'euk',
    'rod': 'euk',
    'pri': 'euk',
    'bct': 'bct',
    'vrl': 'vrl',
    'arc': 'arc',
    'plm': 'plm',
    'phg': 'phg'
})

print(set(dataset["Kingdom"]))

# Remove weird rows

cols=[i for i in dataset.columns if i not in ["Kingdom"]]
for col in cols:
    dataset[col] = pd.to_numeric(dataset[col], errors='coerce')
# dataset = dataset.apply(pd.to_numeric, errors='coerce')
dataset = dataset[~dataset.applymap(pd.isnull).any(1)]
dataset = dataset.to_numpy()

dataset

{'phg', 'vrl', 'euk', 'arc', 'bct'}


array([['vrl', 0.01654, 0.012029999999999999, ..., 0.00251, 0.0005, 0.0],
       ['vrl', 0.027139999999999997, 0.013569999999999999, ..., 0.00271,
        0.0006799999999999999, 0.0],
       ['vrl', 0.01974, 0.0218, ..., 0.00391, 0.0, 0.00144],
       ...,
       ['euk', 0.014230000000000001, 0.03321, ..., 0.0035600000000000002,
        0.00119, 0.02017],
       ['euk', 0.01757, 0.02028, ..., 0.00099, 0.00079, 0.00156],
       ['euk', 0.01778, 0.037239999999999995, ..., 0.00156, 0.00114,
        0.02161]], dtype=object)

In [8]:
X = dataset[:,1:].astype('float')
y = dataset[:,0].astype('str')
print(X,y)

[[0.01654 0.01203 0.0005  ... 0.00251 0.0005  0.     ]
 [0.02714 0.01357 0.00068 ... 0.00271 0.00068 0.     ]
 [0.01974 0.0218  0.01357 ... 0.00391 0.      0.00144]
 ...
 [0.01423 0.03321 0.01661 ... 0.00356 0.00119 0.02017]
 [0.01757 0.02028 0.00767 ... 0.00099 0.00079 0.00156]
 [0.01778 0.03724 0.01732 ... 0.00156 0.00114 0.02161]] ['vrl' 'vrl' 'vrl' ... 'euk' 'euk' 'euk']


In [9]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)

# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

X_train, X_test, y_train, y_test = train_test_split(X, dummy_y, test_size=0.20, stratify = dummy_y)

In [10]:
classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

sklearn.metrics.roc_auc_score(y_test, y_pred, multi_class='ovr')

0.9509593612385394

In [40]:
# Wrapper function with 5-fold cross-validation

def train_knn(dataset, features_to_remove=[], k=3):
    X = dataset[:,1:].astype('float')
    np.delete(X, features_to_remove, axis=1)

    y = dataset[:,0].astype('str')
    
    
    # encode class values as integers
    encoder = LabelEncoder()
    encoder.fit(y)
    encoded_Y = encoder.transform(y)

    # convert integers to dummy variables (i.e. one hot encoded)
    dummy_y = np_utils.to_categorical(encoded_Y)
    
    classifier = KNeighborsClassifier(n_neighbors=k)
    roc_scores = cross_val_score(
        classifier,
        X, 
        y, 
        cv=5, 
        scoring='roc_auc_ovo', 
    )
    
    roc_average = sum(roc_scores)/len(roc_scores)
    
    f1_micro_scores = cross_val_score(
        classifier,
        X, 
        y, 
        cv=5, 
        scoring='f1_micro', 
    )
    
    f1_micro_average = sum(f1_micro_scores)/len(f1_micro_scores)
    
    f1_macro_scores = cross_val_score(
        classifier,
        X, 
        y, 
        cv=5, 
        scoring='f1_macro', 
    )
    
    f1_macro_average = sum(f1_macro_scores)/len(f1_macro_scores)
    
    return roc_average, f1_micro_average, f1_macro_average

In [39]:
for k in [1,2,3,4,5]:
    roc_average, f1_micro_average, f1_macro_average = train_knn(dataset, k=k)
    print('scores for', k, ':', roc_average, f1_micro_average, f1_macro_average)

scores for 1 : 0.9303958688422387 0.9431158003735925 0.8704629293414918
scores for 2 : 0.9533098371132785 0.9338141098099502 0.8335897874487894
scores for 3 : 0.9610614614248553 0.9391184612079371 0.867450903806995
scores for 4 : 0.9639147408269018 0.9350439625745552 0.8593840991613122
scores for 5 : 0.9675258844624643 0.9364276909992343 0.8633037218236309


In [41]:
for k in [1,2,3,4,5]:
    roc_average, f1_micro_average, f1_macro_average = train_knn(dataset, k=k)
    print('scores for', k, ':', roc_average, f1_micro_average, f1_macro_average)

scores for 1 : 0.9234897472528795 0.9431158003735925 0.8704629293414918
scores for 2 : 0.9483630819613846 0.9338141098099502 0.8335897874487894
scores for 3 : 0.9563593553356389 0.9391184612079371 0.867450903806995
scores for 4 : 0.9589661681867385 0.9350439625745552 0.8593840991613122
scores for 5 : 0.9623565468131066 0.9364276909992343 0.8633037218236309
