In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
from keras.layers import Input, Dense, Lambda, Flatten, Reshape
from keras.layers import Conv1D,UpSampling1D
from keras.models import Model
from keras import backend as K

n_features = 4
boat_csv = pd.read_csv("Data/Boat_nominal_data/Boat_sequences_mix.csv")
boat_csv = boat_csv.drop(columns=["Unnamed: 0", "M0C", "M1C", "Acceleration","Speed"])
scaler = StandardScaler()
normal_data = scaler.fit_transform(boat_csv)
print(normal_data.shape)

boat_val = pd.read_csv("Data/Boat_nominal_data/Boat_sequence_mix_val.csv")
boat_val = boat_val.drop(columns=["Unnamed: 0", "M0C", "M1C", "Acceleration","Speed"])
scaler = StandardScaler()
val_nom_data = scaler.fit_transform(boat_val)

def prepare_sequences(data, batch_size):
    samples = []
    for i in range(0,data.shape[0], batch_size):
        sample = data[i:i+batch_size]	
        samples.append(sample)
    sequences = np.array(samples)
    trainX = np.reshape(sequences, (len(sequences), batch_size, n_features))
    return trainX


def prepare_data():    
    trainX_nominal = prepare_sequences(normal_data,1024) 
    print(trainX_nominal.shape)
    
    valX_nominal = prepare_sequences(val_nom_data,1024)
    print(valX_nominal.shape)

    return trainX_nominal, valX_nominal

trainX_nominal, valX_nominal = prepare_data()


(307200, 4)
(300, 1024, 4)
(30, 1024, 4)


In [None]:
from keras import Sequential
from keras.layers import MaxPooling1D, RepeatVector, LSTM
from keras_preprocessing.sequence import TimeseriesGenerator

input_shape = (1024, n_features)
kernel_size = 3
latent_dim = 10
use_mse = True   
load_weights = False

def create_vae():
    units = len(trainX_nominal)

    inputs = Input(shape=input_shape, name='encoder_input')
    x = inputs
    encoder= Sequential()
    for i in range(3):
        encoder.add(Dense(units, activation='relu'))
    
    encoder.add(Dense(latent_dim, name="embeddings")) 
    

    decoder = Sequential() 
    for i in range(3):
        decoder.add(Dense(units, activation='relu'))
    decoder.add(Dense(4))

    vae = Sequential()
    vae.add(encoder)
    vae.add(decoder)
    vae.compile(optimizer='rmsprop', loss='mse', metrics= ['accuracy'])
    return (vae, encoder, decoder)


In [16]:
from keras.callbacks import ModelCheckpoint

vae, encoder, decoder = create_vae()
checkpointer = ModelCheckpoint(filepath="Models/Weights/Nominal_weights_ae.hdf5",
                               verbose=1, save_best_only=True)


In [17]:

vae.fit(x=trainX_nominal,
        y=trainX_nominal,
        epochs=90,
        validation_data=(valX_nominal,valX_nominal),
        batch_size=1024,
        callbacks=[checkpointer])
vae.load_weights('Models/Weights/Nominal_weights_ae.hdf5')


Train on 300 samples, validate on 30 samples
Epoch 1/90





Epoch 00001: val_loss improved from inf to 0.92240, saving model to Models/Weights/Nominal_weights_ae.hdf5


Epoch 2/90





Epoch 00002: val_loss improved from 0.92240 to 0.53539, saving model to Models/Weights/Nominal_weights_ae.hdf5
Epoch 3/90





Epoch 00003: val_loss improved from 0.53539 to 0.34848, saving model to Models/Weights/Nominal_weights_ae.hdf5
Epoch 4/90





Epoch 00004: val_loss did not improve from 0.34848
Epoch 5/90





Epoch 00005: val_loss did not improve from 0.34848
Epoch 6/90





Epoch 00006: val_loss improved from 0.34848 to 0.22299, saving model to Models/Weights/Nominal_weights_ae.hdf5
Epoch 7/90





Epoch 00007: val_loss did not improve from 0.22299
Epoch 8/90





Epoch 00008: val_loss did not improve from 0.22299
Epoch 9/90





Epoch 00009: val_loss improved from 0.22299 to 0.16565, saving model to Models/Weights/Nominal_weights_ae.hdf5
Epoch 10/90





Epoch 00010: val_loss improved from 0.16565 to 0.11406, saving model to Models/Weights/Nominal_weights_ae.hdf5
Epoch 11/90





Epoch 00011: val_loss did not improve from 0.11406
Epoch 12/90





Epoch 00012: val_loss did not improve from 0.11406
Epoch 13/90





Epoch 00013: val_loss improved from 0.11406 to 0.07453, saving model to Models/Weights/Nominal_weights_ae.hdf5
Epoch 14/90





Epoch 00014: val_loss improved from 0.07453 to 0.05231, saving model to Models/Weights/Nominal_weights_ae.hdf5
Epoch 15/90





Epoch 00015: val_loss did not improve from 0.05231
Epoch 16/90





Epoch 00016: val_loss did not improve from 0.05231
Epoch 17/90





Epoch 00017: val_loss did not improve from 0.05231
Epoch 18/90





Epoch 00018: val_loss did not improve from 0.05231
Epoch 19/90





Epoch 00019: val_loss improved from 0.05231 to 0.04029, saving model to Models/Weights/Nominal_weights_ae.hdf5
Epoch 20/90





Epoch 00020: val_loss improved from 0.04029 to 0.03600, saving model to Models/Weights/Nominal_weights_ae.hdf5
Epoch 21/90





Epoch 00021: val_loss did not improve from 0.03600
Epoch 22/90





Epoch 00022: val_loss did not improve from 0.03600
Epoch 23/90





Epoch 00023: val_loss did not improve from 0.03600
Epoch 24/90





Epoch 00024: val_loss did not improve from 0.03600
Epoch 25/90





Epoch 00025: val_loss did not improve from 0.03600
Epoch 26/90





Epoch 00026: val_loss did not improve from 0.03600
Epoch 27/90





Epoch 00027: val_loss did not improve from 0.03600
Epoch 28/90





Epoch 00028: val_loss did not improve from 0.03600
Epoch 29/90





Epoch 00029: val_loss improved from 0.03600 to 0.03242, saving model to Models/Weights/Nominal_weights_ae.hdf5
Epoch 30/90





Epoch 00030: val_loss improved from 0.03242 to 0.01654, saving model to Models/Weights/Nominal_weights_ae.hdf5
Epoch 31/90





Epoch 00031: val_loss improved from 0.01654 to 0.01317, saving model to Models/Weights/Nominal_weights_ae.hdf5
Epoch 32/90





Epoch 00032: val_loss did not improve from 0.01317
Epoch 33/90





Epoch 00033: val_loss did not improve from 0.01317
Epoch 34/90





Epoch 00034: val_loss did not improve from 0.01317
Epoch 35/90





Epoch 00035: val_loss did not improve from 0.01317
Epoch 36/90





Epoch 00036: val_loss did not improve from 0.01317
Epoch 37/90





Epoch 00037: val_loss did not improve from 0.01317
Epoch 38/90





Epoch 00038: val_loss did not improve from 0.01317
Epoch 39/90





Epoch 00039: val_loss did not improve from 0.01317
Epoch 40/90





Epoch 00040: val_loss did not improve from 0.01317
Epoch 41/90





Epoch 00041: val_loss did not improve from 0.01317
Epoch 42/90





Epoch 00042: val_loss did not improve from 0.01317
Epoch 43/90





Epoch 00043: val_loss did not improve from 0.01317
Epoch 44/90





Epoch 00044: val_loss did not improve from 0.01317
Epoch 45/90





Epoch 00045: val_loss did not improve from 0.01317
Epoch 46/90





Epoch 00046: val_loss did not improve from 0.01317
Epoch 47/90





Epoch 00047: val_loss did not improve from 0.01317
Epoch 48/90





Epoch 00048: val_loss did not improve from 0.01317
Epoch 49/90





Epoch 00049: val_loss did not improve from 0.01317
Epoch 50/90





Epoch 00050: val_loss did not improve from 0.01317
Epoch 51/90





Epoch 00051: val_loss did not improve from 0.01317
Epoch 52/90





Epoch 00052: val_loss did not improve from 0.01317
Epoch 53/90





Epoch 00053: val_loss did not improve from 0.01317
Epoch 54/90





Epoch 00054: val_loss did not improve from 0.01317
Epoch 55/90





Epoch 00055: val_loss did not improve from 0.01317
Epoch 56/90





Epoch 00056: val_loss did not improve from 0.01317
Epoch 57/90





Epoch 00057: val_loss did not improve from 0.01317
Epoch 58/90





Epoch 00058: val_loss did not improve from 0.01317
Epoch 59/90





Epoch 00059: val_loss did not improve from 0.01317
Epoch 60/90





Epoch 00060: val_loss did not improve from 0.01317
Epoch 61/90





Epoch 00061: val_loss did not improve from 0.01317
Epoch 62/90





Epoch 00062: val_loss did not improve from 0.01317
Epoch 63/90





Epoch 00063: val_loss did not improve from 0.01317
Epoch 64/90





Epoch 00064: val_loss did not improve from 0.01317
Epoch 65/90





Epoch 00065: val_loss did not improve from 0.01317
Epoch 66/90





Epoch 00066: val_loss did not improve from 0.01317
Epoch 67/90





Epoch 00067: val_loss did not improve from 0.01317
Epoch 68/90





Epoch 00068: val_loss did not improve from 0.01317
Epoch 69/90





Epoch 00069: val_loss did not improve from 0.01317
Epoch 70/90





Epoch 00070: val_loss did not improve from 0.01317
Epoch 71/90





Epoch 00071: val_loss did not improve from 0.01317
Epoch 72/90





Epoch 00072: val_loss did not improve from 0.01317
Epoch 73/90





Epoch 00073: val_loss did not improve from 0.01317
Epoch 74/90





Epoch 00074: val_loss did not improve from 0.01317
Epoch 75/90





Epoch 00075: val_loss did not improve from 0.01317
Epoch 76/90





Epoch 00076: val_loss did not improve from 0.01317
Epoch 77/90





Epoch 00077: val_loss did not improve from 0.01317
Epoch 78/90





Epoch 00078: val_loss did not improve from 0.01317
Epoch 79/90





Epoch 00079: val_loss did not improve from 0.01317
Epoch 80/90





Epoch 00080: val_loss did not improve from 0.01317
Epoch 81/90





Epoch 00081: val_loss did not improve from 0.01317
Epoch 82/90





Epoch 00082: val_loss did not improve from 0.01317
Epoch 83/90





Epoch 00083: val_loss did not improve from 0.01317
Epoch 84/90





Epoch 00084: val_loss did not improve from 0.01317
Epoch 85/90





Epoch 00085: val_loss did not improve from 0.01317
Epoch 86/90





Epoch 00086: val_loss did not improve from 0.01317
Epoch 87/90





Epoch 00087: val_loss did not improve from 0.01317
Epoch 88/90





Epoch 00088: val_loss did not improve from 0.01317
Epoch 89/90





Epoch 00089: val_loss did not improve from 0.01317
Epoch 90/90





Epoch 00090: val_loss did not improve from 0.01317


In [18]:
vaes = []
vaes.append(vae)

def return_mask(num, labels):
    return np.squeeze(np.argwhere(labels == num))

labels = pd.read_csv("Data/Boat_nominal_data/Boat_mix_labels.csv")
labels = labels.drop(columns="Unnamed: 0")
labels = np.array(labels)

masks = [return_mask(num,labels)[:,0] for num in range(0,9)]


from sklearn.decomposition import PCA

model_index = 0
# 
# for i in vaes:
#     encodings.append(i[1].predict(trainX_nominal))

encodings = []
encodings.append(np.average(encoder.predict(trainX_nominal), axis=1))
print(encodings[0].shape)

def check_z_sampling(encoded_values):
    m = encoded_values[0]
    var = np.exp(0.5*encoded_values[1])
    eps = np.random.normal(0,1,latent_dim)
    
    sampled = []
    index = 0
    var_zero = np.zeros(10)
    for means in m:
        sample = means+var[index]*eps
        #sample = means+var_zero*eps 
        sampled.append(sample)
        index += 1
    
    sampled = np.array(sampled)
    return sampled

def plot_pca(title, type): 
    x_val = []
    y_val= []
    for i in range(principalComponents.shape[0]):
        x_val.append(principalComponents[i][0])
        y_val.append(principalComponents[i][1])
    x_val = np.array(x_val)
    y_val = np.array(y_val)
    
    for mask in masks:
        plt.scatter(x=x_val[mask], y=y_val[mask], alpha=0.5)

    plt.legend(labels=np.arange(0,9))
    plt.title(str(title)+""+type)
    plt.show()
    
#print(encodings[0][0][0], encodings[0][1][0], encodings[0][2][0])
for i, encod in enumerate(encodings):
        latent_values = check_z_sampling(encod)
        scaler = StandardScaler()
        enc_input = scaler.fit_transform(encod) 
        pca = PCA(2)
        principalComponents = pca.fit_transform(enc_input)
        #print(pca.explained_variance_ratio_)
        plot_pca('?', '?')


(300, 10)


In [29]:
runs = []
for mask in masks:
    run_for_class = trainX_nominal[mask]
    print(run_for_class.shape)
    runs.append(run_for_class)
    
for i in runs[0]:
    run = np.reshape(i, (1, 1024,4))
    rec = vaes[0].predict(run)
    rec = np.reshape(rec, (len(trainX_nominal[0]), n_features))
    reconstruction_df = pd.DataFrame(rec, columns=boat_csv.columns)
    
    plt.plot(reconstruction_df["Lon"], reconstruction_df["Lat"])
    #plt.savefig("Imgs/Latent_reconstruction/"+str(title)+".png")
    plt.show()
    break
    


(192, 1024, 4)
(15, 1024, 4)
(11, 1024, 4)
(12, 1024, 4)
(14, 1024, 4)
(11, 1024, 4)
(13, 1024, 4)
(16, 1024, 4)
(16, 1024, 4)


In [13]:

encodings = encoder.predict(trainX_nominal)


labels = pd.read_csv("Data/Boat_nominal_data/Boat_mix_labels.csv")
labels = labels.drop(columns="Unnamed: 0")
labels = np.array(labels)

mat_mask = np.array([labels for i in range(latent_dim)])
print(mat_mask.shape)



<class 'numpy.ndarray'> <class 'numpy.int64'>


In [16]:
from sklearn.decomposition import PCA

labels = np.array(pd.read_csv("Data/Boat_nominal_data/Boat_mix_labels.csv")['label']) 
nominals = np.squeeze(np.argwhere(labels==1))
anomalous = np.squeeze(np.argwhere(labels==0))
print(type(nominals), type(nominals[0]))

model_index = 0

titles = ["Mean", "Std", "Sampled"]

scaler = StandardScaler()
enc_input = scaler.fit_transform(encodings)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(enc_input)
x_val = []
y_val=[]
for i in range(principalComponents.shape[0]):
    x_val.append(principalComponents[i][0])
    y_val.append(principalComponents[i][1])
x_val = np.array(x_val)
y_val = np.array(y_val)


plt.scatter(x=x_val[nominals],y=y_val[nominals], alpha=0.5)
plt.scatter(x=x_val[anomalous],y=y_val[anomalous], alpha=0.5)
plt.show()


from sklearn.manifold import TSNE

def tsne(data, title):
    tsne = TSNE(n_components=2, random_state=0)
    
    tsne_obj= tsne.fit_transform(data)
    tsne_df = pd.DataFrame({'X':tsne_obj[:,0],
                            'Y':tsne_obj[:,1],
                            })
    
    plt.scatter(x=tsne_df["X"][nominals],
                y=tsne_df["Y"][nominals], alpha=0.5)
    plt.scatter(x=tsne_df["X"][anomalous],
                y=tsne_df["Y"][anomalous], alpha=0.5)
    plt.title(title)
    plt.show()
    
    return tsne_df

tsne_enc_nom_df = tsne(encodings, "Values")


<class 'numpy.ndarray'> <class 'numpy.int64'>


In [23]:
def sample_from_latent_space(xCoord, yCoord):
    point = np.array([xCoord,yCoord])
    latent_point = pca.inverse_transform(point)
    return np.reshape(latent_point,(1, latent_dim))

def visualize_reconstruction(reconstructed_run, title):
    plt.plot(reconstructed_run["Lon"], reconstructed_run["Lat"])
    plt.title(title)
    plt.show()
    # plt.plot(reconstructed_run["Sin"])
    # plt.plot(reconstructed_run["Cosin"])
    # plt.show()

X,Y = np.mgrid[-5:5.1:0.5, -5:5.1:0.5]
XY = np.vstack((X.flatten(), Y.flatten())).T

point = [0,0]

for i in XY:
    point = sample_from_latent_space(i[0],i[1])
    reconstructed = np.reshape(decoder.predict(point), (len(trainX_nominal[0]),n_features))
    visualize_reconstruction(pd.DataFrame(reconstructed, columns=boat_csv.columns), 
                         title="Reconstruction")


In [20]:
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering

plt.figure(figsize=(10, 7))
plt.title("Customer Dendograms")
dend_nom = shc.dendrogram(shc.linkage(tsne_dec_nom_df, method='ward'))
dend_anom = shc.dendrogram(shc.linkage(tsne_dec_anom_df, method='ward'))



In [21]:

n_clusters = 4
cluster = AgglomerativeClustering(n_clusters=n_clusters, 
                                  affinity='euclidean',
                                  linkage='ward')
cl_nom = cluster.fit_predict(tsne_dec_nom_df)
cl_anom = cluster.fit_predict(tsne_dec_anom_df)
plt.plot(cl_nom)
plt.title("NOMINAL CLUSTERS")
plt.show()

plt.plot(cl_anom)
plt.title("Anomalous_clusters")
plt.show()

In [22]:

df_nom = pd.DataFrame(normal_data, columns=boat_csv.columns)
df_anom = pd.DataFrame(anomalous_data, columns=boat_anom_csv.columns)
def plot_clusters(cl, df, nominal):
    for i in range(n_clusters):
        cluster = np.squeeze(np.argwhere(cl==i))    
        if nominal:
            plt.scatter(x=df['Lon'][cluster],y=df["Lat"][cluster],s=5)
        else:
            plt.scatter(x=df['lon'][cluster],y=df["lat"][cluster],s=5)
    plt.show()


plot_clusters(cl_nom, df_nom, True)
plot_clusters(cl_anom, df_anom, False)


In [32]:
df_nominal = pd.DataFrame(normal_data, columns=boat_csv.columns)
df = pd.DataFrame(nom_enc[0])

plt.figure(1)
axis_list = []

for i in range(df.shape[1]):
    plt.plot(df[i])

plt.show()    

In [23]:
from sklearn.cluster import KMeans
cluster_comp = [0,1,2,3]# print(tsne_enc_df)
# print(tsne_dec_df)

kmeans_normal = KMeans(n_clusters=2, random_state=0).fit(normal_data)
clusters_normal = kmeans_normal.predict(np.average(trainX_nominal,axis=0))

kmeans_enc = KMeans(n_clusters=2, random_state=0).fit(tsne_enc_df)
clusters_enc = kmeans_enc.predict(tsne_enc_df)
plt.plot(clusters_enc)
plt.show()

kmeans_dec = KMeans(n_clusters=2, random_state=0).fit(tsne_dec_df)
clusters_dec = kmeans_dec.predict(tsne_dec_df)

NameError: name 'tsne_enc_df' is not defined

In [25]:
def transform_to_mask(cl_label, clust_obj):
    mask = []
    part = []
    for i, elem in enumerate(clust_obj):
        if elem == cl_label:
            part.append(i)
        else:
            if part:
                mask.append(part)
                part = []
            else:
                pass
    return mask


def get_mask_list(clust_obj):
    mask_list = []
    for i in cluster_comp:
        mask_list.append(transform_to_mask(i, clust_obj))
    return mask_list


masks_normal = np.array(get_mask_list(clusters_normal))

masks_enc = np.array(get_mask_list(clusters_enc))

masks_dec = np.array(get_mask_list(clusters_dec))

print(masks_normal)
masks = (masks_normal, masks_enc,masks_dec)
print(masks_normal.shape, masks_enc.shape, masks_enc.shape)


NameError: name 'clusters_enc' is not defined

In [41]:
def plot_cl(cl, color):
    plt.plot(cl["lon"], cl["lat"], color=color)
        
    
def plot_clusters_on_map():
    titles = ['Nominal Normal','Nominal Encoded','Nominal Decoded']
    color_list = ['blue','green','red','black']
    map = anomalous_data[:6620]
    print(map.shape)
    for k,mask in enumerate(masks):
        for i, elem in enumerate(mask):
            for j in elem:
                cl = pd.DataFrame(map[j], columns=boat_anom_csv.columns)
                plot_cl(cl, color_list[i]) 
        plt.title(titles[k])
        plt.show()
      
               
plot_clusters_on_map()


(6620, 7)
