In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

import keras
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Input, UpSampling1D, Reshape
from keras import Model, Sequential

K = keras.backend

from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Load the arcene labels and feature matrix (both files have no header row;
# the feature file is space-separated).
labels = pd.read_csv("arcene_labels.tsv", header=None)

data = (
    pd.read_csv("arcene_data.tsv", header=None, sep=" ")
    .fillna(0)          # missing entries become 0
    .iloc[:, 0:2501]    # keep only the first 2501 columns
)

# Features and labels are glued into one array so that a single shuffle in
# train_test_split keeps every row paired with its label (the last column).
joined = pd.concat((data, labels), axis=1).values

X_train_, X_test_ = train_test_split(joined, test_size=0.33, random_state=42)

In [3]:
# Undo the join: the last column is the label, everything before it is features.
X_train, Y_train = X_train_[:, :-1], X_train_[:, -1]

X_test, Y_test = X_test_[:, :-1], X_test_[:, -1]

array([78,  0,  0,  0,  0,  0, 19,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int64)

In [4]:
# The network's Input layer is declared as (batch, 2501, 1), so each sample
# needs a trailing channel axis of size 1. The values themselves are unchanged.
#
# A reshape to (rows, cols, 1) is used instead of np.expand_dims so that
# re-running this cell is a no-op; expand_dims would append another axis on
# every execution and break the model's expected input shape.
X_train = X_train.reshape(X_train.shape[:2] + (1,))

X_test = X_test.reshape(X_test.shape[:2] + (1,))

In [5]:
# Sanity check: the held-out set should now be 3-D with a trailing axis of size 1.
X_test.shape

(33, 2501, 1)

In [6]:
# ENCODER
# Two Conv1D + MaxPooling1D stages. With padding='valid' and kernel size 3 each
# convolution trims 2 steps, and each pooling layer halves the length.
input_sig = Input(batch_shape=(None,2501,1))
x = Conv1D(16,3, activation='relu', padding='valid')(input_sig)
x1 = MaxPooling1D(2)(x)
x2 = Conv1D(2,3, activation='relu', padding='valid')(x1)
x3 = MaxPooling1D(2)(x2)

# NOTE(review): `encoded` is only used for the shape printout below. The decoder
# branches off x3, so this Flatten/Dense pair is not on the path from input_sig
# to decoded and is therefore not part of `autoencoder`.
flat = Flatten()(x3)
encoded = Dense(32,activation = 'relu')(flat)

print("shape of encoded {}".format(K.int_shape(encoded)))

# DECODER
# Starts from the pooled feature map x3 (the effective bottleneck), upsamples
# twice, then uses a Dense layer to get back to exactly 2501 values per sample.
x2_ = Conv1D(32, 3, activation='relu', padding='valid')(x3)
x1_ = UpSampling1D(2)(x2_)
x_ = Conv1D(64, 3, activation='relu', padding='valid')(x1_)
upsamp = UpSampling1D(2)(x_)
# Separate name so the encoder-side `flat` tensor is not silently overwritten.
decoder_flat = Flatten()(upsamp)
decoded = Dense(2501,activation = 'relu')(decoder_flat)
decoded = Reshape((2501,1))(decoded)

print("shape of decoded {}".format(K.int_shape(decoded)))

autoencoder = Model(input_sig, decoded)
# No 'accuracy' metric: it is a classification metric and is not meaningful
# for MSE reconstruction of real-valued signals. The reported loss is the
# quantity to watch.
autoencoder.compile(optimizer='adadelta', loss='mse')

autoencoder.summary()

shape of encoded (None, 32)
shape of decoded (None, 2501, 1)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2501, 1)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2499, 16)          64        
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1249, 16)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1247, 2)           98        
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 623, 2)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 621, 32)           224       
_________________________________________________________________
up_sampling1d_1

In [7]:
# Train the autoencoder to reproduce its own input; the held-out rows are
# passed as validation data (input and target are the same array there too).
autoencoder.fit(
    x=X_train,
    y=X_train,
    epochs=10,
    validation_data=(X_test, X_test),
)

Train on 67 samples, validate on 33 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2856149aa90>

In [8]:
# Extract the compressed representation (loosely analogous to principal
# components). Layer index 4 is the second max-pooling layer; its output holds
# roughly half as many values per sample as the original input.
compressed_layer = 4
get_3rd_layer_output = K.function(
    [autoencoder.layers[0].input],
    [autoencoder.layers[compressed_layer].output],
)

# K.function returns a list with one array per requested output.
bottleneck_outputs = get_3rd_layer_output([X_test])
compressed = bottleneck_outputs[0]

compressed.shape

(33, 623, 2)

In [9]:
# Flatten each sample's bottleneck feature map into a single row vector.
# The row count comes from the array and -1 lets NumPy infer the width, so this
# keeps working if the test-set size or the encoder's output shape changes
# (previously hard-coded as 33 x 623*2).
representation = compressed.reshape(len(compressed), -1)

In [10]:
#Training K-Means
from tensorflow.contrib.factorization.python.ops import clustering_ops
import tensorflow as tf
 
def train_input_fn():
    """Input function for the k-means estimator.

    Returns a (features, labels) pair: the flattened bottleneck features as a
    float32 constant tensor, and None for labels since clustering is
    unsupervised.
    """
    features = tf.constant(representation, dtype=tf.float32)
    return features, None
 
# Two-cluster k-means over the flattened bottleneck features, using randomly
# chosen initial centres and squared Euclidean distance.
unsupervised_model = tf.contrib.learn.KMeansClustering(
    num_clusters=2,
    initial_clusters=tf.contrib.learn.KMeansClustering.RANDOM_INIT,
    distance_metric=clustering_ops.SQUARED_EUCLIDEAN_DISTANCE,
)

unsupervised_model.fit(input_fn=train_input_fn, steps=1000)

Instructions for updating:
Please use tf.contrib.factorization.KMeansClustering instead of tf.contrib.learn.KMeansClustering. It has a similar interface, but uses the tf.estimator.Estimator API instead of tf.contrib.learn.Estimator.
Instructions for updating:
Please replace uses of any Estimator from tf.contrib.learn with an Estimator from tf.estimator.*
Instructions for updating:
When switching to tf.estimator.Estimator, use tf.estimator.RunConfig instead.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000285639B9748>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_s

KMeansClustering(params={'num_clusters': 2, 'training_initial_clusters': 'random', 'distance_metric': 'squared_euclidean', 'random_seed': 0, 'use_mini_batch': True, 'mini_batch_steps_per_iteration': 1, 'kmeans_plus_plus_num_retries': 2, 'relative_tolerance': None})

In [11]:
# Getting clusters for data points.
# predict() yields one dict per input row; 'cluster_idx' is the assigned cluster.
# (The previous loop also tracked an index and looked up X_test rows, but never
# used either, so that bookkeeping has been removed.)
clusters = unsupervised_model.predict(input_fn=train_input_fn)
predicted = [row['cluster_idx'] for row in clusters]

INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\Michael\AppData\Local\Temp\tmp_hldudjb\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [12]:
# Re-encode cluster ids to match the label convention: cluster 1 stays 1,
# anything else becomes -1.
# NOTE(review): k-means cluster ids are arbitrary (centres are randomly
# initialised), so which cluster corresponds to the +1 class is not guaranteed
# -- confirm before trusting the accuracy figure.
predicted = np.asarray([label if label == 1 else -1 for label in predicted])

In [13]:
# Display the ground-truth labels of the held-out rows (encoded as 1 / -1).
Y_test

array([ 1, -1, -1, -1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1,
        1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1, -1],
      dtype=int64)

In [17]:
# Count the held-out rows whose mapped cluster label equals the true label,
# then express that as a percentage of all predictions.
score = int(np.sum(predicted == Y_test))

percent = (score / len(predicted))*100

print("CAE is {}% accurate".format(percent))

CAE is 60.60606060606061% accurate
