In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
from keras.layers import Input, Dense, Lambda, Flatten, Reshape
from keras.layers import Conv1D
from keras.models import Model
from keras import backend as K
from sklearn.utils import resample

n_features = 4
n_runs = 200
labels = pd.read_csv("Data/Boat_nominal_data/Boat_mix_len_labels.csv")
labels = labels.drop(columns="Unnamed: 0") 
labels = np.array(labels)
max_len = 0



def prepare_training(path, n_runs):
    def closest_4(n, m):
        q = n / m
        n1 = m * q
        if (n * m) > 0:
            n2 = m * (q + 1)
        else:
            n2 = m * (q - 1)
        if abs(n-n1) < abs(n-n2):
            return int(n1)
        return int(n2)
    
    
    def extend_line(run, max_len):
        difference = abs(len(run) - max_len)
        extension = np.array([run[-1]]*difference)
        if difference != 0:
            run = np.vstack([run, extension])
        return run
    
    def get_max_len(sequence_list):
        max_len = 0
        min_len = 1000
        for seq in sequence_list:
            if len(seq) > max_len:
                max_len = len(seq)
            if len(seq) < min_len:
                min_len = len(seq)
        return max_len, min_len
    
    def construct_matrix(sequence_list):
        max_len, min_len = get_max_len(sequence_list)
        len = closest_4(max_len,4)
        len = 420
        train_matrix = np.zeros(shape=(n_runs, len, n_features))
        for index, run in enumerate(sequence_list):
            line = extend_line(run, len)
            train_matrix[index] = line
        return train_matrix
        
        
    def stadard_sequences(seqs):
        for i, seq in enumerate(seqs):
            seqs[i] = MinMaxScaler(feature_range=[0, 1]).fit_transform(seq)
        return seqs       
    
    
    def read_sequences():
        run_list_mix = []
        for index in range(n_runs):
            run_csv = pd.read_csv(path+str(index))
            run_csv = run_csv.drop(columns=['Unnamed: 0'])
            run_list_mix.append(run_csv)
        stands = stadard_sequences(run_list_mix)
        padded_matrix = construct_matrix(stands)
        return padded_matrix
    
    return read_sequences()
    

train_matrix = prepare_training("Mix_sequences_var_length/run^", n_runs=n_runs) 
print(train_matrix.shape)

Using TensorFlow backend.


ValueError: could not broadcast input array from shape (686,4) into shape (420,4)

In [23]:

from keras.layers import Input, LSTM, RepeatVector, Conv2DTranspose, MaxPooling1D, UpSampling1D, AveragePooling1D
from keras.losses import mse
from keras.models import Model


filters = 50
intermediate_dimension = 10 
latent_dim = 10

def Conv1DTranspose(input_tensor, filters, kernel_size,strides=2, padding='same'):
        x = Lambda(lambda x: K.expand_dims(x, axis=2))(input_tensor)
        x = Conv2DTranspose(filters=filters, kernel_size=(kernel_size, 1), 
                            activation='relu',strides=strides, padding='same')(x)
        x = Lambda(lambda x: K.squeeze(x, axis=2))(x)
        return x



def sampling(args):
    z_mean, z_log_var = args
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    # by default, random_normal has mean=0 and std=1.0
    epsilon = K.random_normal(shape=(batch, dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon


def repeat(x):
    steps_matrix = K.ones_like(x[0][:, :, :1])
    
    latent_matrix = K.expand_dims(x[1], axis=1)
    return K.batch_dot(steps_matrix, latent_matrix)


def create_vae():
    print(max_len)
    inputs = Input(shape=(train_matrix.shape[1], n_features))
    x = inputs
    
    for i in range(2):
        x = Conv1D(filters=filters,
                   kernel_size=20,
                   strides=2,
                   padding='valid')(x)
    
    shape = K.int_shape(x)
    encoder_outputs, state_h, state_c = LSTM(intermediate_dimension, return_state=True)(x)
    encoder_states = [state_h, state_c]
    
    z_mean = Dense(latent_dim, name='z_mean',)(encoder_outputs)
    z_log_var = Dense(latent_dim, name='z_log_var')(encoder_outputs)
    z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])
    
    encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
    
    latent_inputs = Input(shape=(latent_dim,), name='latent_inputs')
    #x = Lambda(repeat)([before_flattening, z])
    x = Dense(shape[1]*shape[2])(latent_inputs)
    x = Reshape((shape[1], shape[2]))(x)
    
    # 
    # decoder_inputs = Reshape((shape[1],shape[2]))(x)
    # decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    # decoder_out, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    # x = decoder_out

    for i in range(2):
        x = Conv1DTranspose(input_tensor=x,
                            filters=filters,
                            kernel_size=20,
                            strides=2,
                            padding='valid')
    
    
    output = Dense(n_features)(x)
    
    encoder.summary()
    decoder = Model(latent_inputs, output)
    decoder.summary()
    outputs = decoder(encoder.outputs[2])
    reconstruction_loss = mse(K.flatten(inputs), K.flatten(outputs))
    kl_loss = - 0.5 * K.mean(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), 
                             axis=-1)
    loss = K.mean(reconstruction_loss+kl_loss)
    vae = Model(inputs, outputs, name='vae')
    vae.add_loss(loss)
    #vae.summary()
    vae.compile(optimizer='adam')
    return vae, encoder, decoder

# 
# def create_ae():
#     inputs = Input(shape=(train_matrix.shape[1], n_features))
#     x = inputs
#     for i in range(2):
#         x = Conv1D(filters=filters, kernel_size=20,
#                    activation='relu',
#                    padding='same')(x)
#         x = AveragePooling1D(pool_size=2)(x)
#     shape = K.int_shape(x)   
#     print(shape)
#     _,x,_ = LSTM(intermediate_dimension, return_state=True)(x)
#     #x = Flatten()(x)
#     encoded = Dense(latent_dim)(x)
#     #x = Lambda(repeat)([before_flattening, encoded])
#     
#     latent_inputs = Input(shape=(latent_dim,), name='latent_inputs')
#     x = Dense(shape[1]*shape[2])(latent_inputs)
#     x = Reshape((shape[1],shape[2]))(x)
#     
#     for i in range(2):
#         x = Conv1DTranspose(input_tensor=x, filters=filters,
#                             kernel_size=20,strides=2, padding='same')
#         x = UpSampling1D(size=2)(x)
#        
#     #decoded = LSTM(n_features, return_sequences=True)(x)
#     
#     output = Dense(n_features)(x)
#     
#     encoder = Model(inputs, encoded)
#     encoder.summary()
#     decoder = Model(latent_inputs, output)
#     decoder.summary()
#     output = decoder(encoder.output)
#     sequence_autoencoder = Model(inputs, output)
#     #sequence_autoencoder.summary()
#     sequence_autoencoder.compile(optimizer='adam', 
#                                  loss='mse')
#     return sequence_autoencoder, encoder, shape


model, encoder, decoder = create_vae()


0


ValueError: Can not squeeze dim[2], expected a dimension of 1, got 2 for 'lambda_44/Squeeze_1' (op: 'Squeeze') with input shapes: [?,182,2,50].

In [6]:
from keras.callbacks import ModelCheckpoint


def train():
    
    print(train_matrix.shape)
    model.fit(train_matrix,train_matrix, epochs=50, verbose=1)
    model.save_weights("Models/Weights/AE_CONV_LSTM_Diff_len_dist_MATRIX_LEN.hdf5")


train()


(300, 420, 4)


Epoch 1/50


 32/300 [==>...........................] - ETA: 26s - loss: 0.3430

 64/300 [=====>........................] - ETA: 12s - loss: 0.3306

















Epoch 2/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0570

 64/300 [=====>........................] - ETA: 2s - loss: 0.0548

















Epoch 3/50


 32/300 [==>...........................] - ETA: 2s - loss: 0.0371

 64/300 [=====>........................] - ETA: 2s - loss: 0.0366

















Epoch 4/50


 32/300 [==>...........................] - ETA: 2s - loss: 0.0294

 64/300 [=====>........................] - ETA: 2s - loss: 0.0299

















Epoch 5/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0254

 64/300 [=====>........................] - ETA: 2s - loss: 0.0294

















Epoch 6/50


 32/300 [==>...........................] - ETA: 2s - loss: 0.0276

 64/300 [=====>........................] - ETA: 2s - loss: 0.0258

















Epoch 7/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0296

 64/300 [=====>........................] - ETA: 2s - loss: 0.0314



















Epoch 8/50


 32/300 [==>...........................] - ETA: 2s - loss: 0.0263

 64/300 [=====>........................] - ETA: 2s - loss: 0.0299

















Epoch 9/50


 32/300 [==>...........................] - ETA: 2s - loss: 0.0255

 64/300 [=====>........................] - ETA: 2s - loss: 0.0265

















Epoch 10/50


 32/300 [==>...........................] - ETA: 2s - loss: 0.0273

 64/300 [=====>........................] - ETA: 2s - loss: 0.0258

















Epoch 11/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0225

 64/300 [=====>........................] - ETA: 3s - loss: 0.0233



















Epoch 12/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0221

 64/300 [=====>........................] - ETA: 2s - loss: 0.0214

















Epoch 13/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0199

 64/300 [=====>........................] - ETA: 3s - loss: 0.0206

















Epoch 14/50


 32/300 [==>...........................] - ETA: 4s - loss: 0.0203

 64/300 [=====>........................] - ETA: 3s - loss: 0.0205

















Epoch 15/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0184

 64/300 [=====>........................] - ETA: 2s - loss: 0.0179

















Epoch 16/50


 32/300 [==>...........................] - ETA: 6s - loss: 0.0195

 64/300 [=====>........................] - ETA: 4s - loss: 0.0185

















Epoch 17/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0169

 64/300 [=====>........................] - ETA: 2s - loss: 0.0181

















Epoch 18/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0173

 64/300 [=====>........................] - ETA: 3s - loss: 0.0180

















Epoch 19/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0168

 64/300 [=====>........................] - ETA: 3s - loss: 0.0167

















Epoch 20/50


 32/300 [==>...........................] - ETA: 2s - loss: 0.0175

 64/300 [=====>........................] - ETA: 2s - loss: 0.0174

















Epoch 21/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0176

 64/300 [=====>........................] - ETA: 3s - loss: 0.0168

















Epoch 22/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0153

 64/300 [=====>........................] - ETA: 2s - loss: 0.0158

















Epoch 23/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0168

 64/300 [=====>........................] - ETA: 2s - loss: 0.0168

















Epoch 24/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0159

 64/300 [=====>........................] - ETA: 3s - loss: 0.0158

















Epoch 25/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0150

 64/300 [=====>........................] - ETA: 2s - loss: 0.0148

















Epoch 26/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0169

 64/300 [=====>........................] - ETA: 3s - loss: 0.0174

















Epoch 27/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0190

 64/300 [=====>........................] - ETA: 3s - loss: 0.0175

















Epoch 28/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0178

 64/300 [=====>........................] - ETA: 2s - loss: 0.0173

















Epoch 29/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0166

 64/300 [=====>........................] - ETA: 2s - loss: 0.0161

















Epoch 30/50


 32/300 [==>...........................] - ETA: 4s - loss: 0.0152

 64/300 [=====>........................] - ETA: 3s - loss: 0.0161

















Epoch 31/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0169

 64/300 [=====>........................] - ETA: 3s - loss: 0.0162

















Epoch 32/50


 32/300 [==>...........................] - ETA: 5s - loss: 0.0180

 64/300 [=====>........................] - ETA: 4s - loss: 0.0161

















Epoch 33/50


 32/300 [==>...........................] - ETA: 7s - loss: 0.0157

 64/300 [=====>........................] - ETA: 6s - loss: 0.0157

















Epoch 34/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0206

 64/300 [=====>........................] - ETA: 3s - loss: 0.0203

















Epoch 35/50


 32/300 [==>...........................] - ETA: 4s - loss: 0.0168

 64/300 [=====>........................] - ETA: 3s - loss: 0.0168

















Epoch 36/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0154

 64/300 [=====>........................] - ETA: 2s - loss: 0.0157

















Epoch 37/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0134

 64/300 [=====>........................] - ETA: 3s - loss: 0.0143

















Epoch 38/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0141

 64/300 [=====>........................] - ETA: 2s - loss: 0.0154

















Epoch 39/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0160

 64/300 [=====>........................] - ETA: 3s - loss: 0.0155

















Epoch 40/50


 32/300 [==>...........................] - ETA: 4s - loss: 0.0157

 64/300 [=====>........................] - ETA: 3s - loss: 0.0165

















Epoch 41/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0155

 64/300 [=====>........................] - ETA: 3s - loss: 0.0152

















Epoch 42/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0148

 64/300 [=====>........................] - ETA: 2s - loss: 0.0158

















Epoch 43/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0160

 64/300 [=====>........................] - ETA: 2s - loss: 0.0169

















Epoch 44/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0175

 64/300 [=====>........................] - ETA: 2s - loss: 0.0164

















Epoch 45/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0151

 64/300 [=====>........................] - ETA: 3s - loss: 0.0166

















Epoch 46/50


 32/300 [==>...........................] - ETA: 4s - loss: 0.0145

 64/300 [=====>........................] - ETA: 3s - loss: 0.0155

















Epoch 47/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0149

 64/300 [=====>........................] - ETA: 3s - loss: 0.0162

















Epoch 48/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0169

 64/300 [=====>........................] - ETA: 3s - loss: 0.0158

















Epoch 49/50


 32/300 [==>...........................] - ETA: 3s - loss: 0.0152

 64/300 [=====>........................] - ETA: 3s - loss: 0.0148

















Epoch 50/50


 32/300 [==>...........................] - ETA: 4s - loss: 0.0159

 64/300 [=====>........................] - ETA: 5s - loss: 0.0159

















In [7]:

def return_mask(num, labs):
    arg = np.squeeze(np.argwhere(labs == num))
    return arg

masks = [return_mask(num, np.array(labels))[:, 0] for num in range(0, 9)]

from sklearn.decomposition import PCA


encodings = encoder.predict(train_matrix)

#enc_mean, enc_var, z_enc = encodings[0], encodings[1], encodings[2]
enc_mean, enc_var, z_enc = encodings, encodings, encodings

print(enc_mean.shape, enc_var.shape, z_enc.shape)


from mpl_toolkits.mplot3d import Axes3D  



def plot_pca(title, i): 
    
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    markers = ['o', 'o', 'o', 'o', '^', '^', '^', '^', '^', '^']
    for index, mask in enumerate(masks):
        
        ax.scatter(principalComponents[:, 0][mask], 
                   principalComponents[:, 1][mask],  
                   principalComponents[:, 2][mask], marker=markers[index])
    # for mask in unseen_mask:
    #     ax.scatter(unseen_encoding[0][:,0][mask],
    #                unseen_encoding[0][:,1][mask],
    #                unseen_encoding[0][:,2][mask])
    plt.legend(labels=np.arange(0, 9))
    plt.title(str(title))
    plt.show()
    
    for mask in masks:
        plt.scatter(x=principalComponents[:, 0][mask], 
                    y=principalComponents[:, 1][mask],
                    alpha=0.5)
    # for mask in unseen_mask:
    #     plt.scatter(unseen_encoding[0][:,0][mask],
    #            unseen_encoding[0][:,1][mask])
    #     
        #break
    
    plt.legend(labels=np.arange(0, 9))
    plt.title(str(title))
    plt.show()


enc_list = [enc_mean, enc_var, z_enc]
titles = ["MEAN","LOG_VAR","SAMPLED"]
for i,enc in enumerate(enc_list):
    scaler = StandardScaler()
    enc_input = scaler.fit_transform(enc) 
    pca = PCA(3)
    principalComponents = pca.fit_transform(enc_input)
    print(principalComponents.shape)
    print(pca.explained_variance_ratio_)
    plot_pca('Sequences'+titles[i], 0)
    
    # principalComponents = enc
    # plot_pca('Sequences_Not_Pca'+titles[i], 0)
    



(300, 10) (300, 10) (300, 10)
(300, 3)
[0.7783707  0.1761033  0.04431508]


(300, 3)
[0.7783707  0.1761033  0.04431508]


(300, 3)
[0.7783707  0.1761033  0.04431508]


In [107]:

unseen_labs = pd.read_csv("Data/Boat_nominal_data/Boat_unseen_labels_mix.csv")
unseen_labs = unseen_labs.drop(columns="Unnamed: 0") 
unseen_labs = np.array(unseen_labs)
unseen_mask = [return_mask(num, np.array(unseen_labs))[:, 0] for num in range(0, 9)]

unseen_sequences_matrix = prepare_training("Mix_sequences_var_length/run_unseen^", 
                                           n_runs=300)

unseen_encoding = encoder.predict(unseen_sequences_matrix)

In [108]:
reconstruction = model.predict(train_matrix)
print(reconstruction.shape)
#RECONSTRUCTION
def reconstruct_sequence(seq_index):
    run = train_matrix[seq_index]
    #mask_seq = np.squeeze(np.argwhere(np.mean(run, axis=1) != 0))
    rec_run = reconstruction[seq_index]#[mask_seq]
    #print(rec_run[:,0])
    df = pd.DataFrame(rec_run[:-15], columns=["Timestep","Sin", "Cosin", "Lat", "Lon"])
    df_original = pd.DataFrame(run[:-15], columns=["Timestep","Sin", "Cosin", "Lat", "Lon"])
    plt.plot(df_original['Lon'], df_original['Lat'])
    plt.plot(df['Lon'], df['Lat'])
    plt.show()

for i in range(len(train_matrix)):
    reconstruct_sequence(i)
    if i == 5:
        break


(300, 420, 5)


In [3]:

model.load_weights("Models/Weights/VAE_CONV_LSTM_Diff_len_dist_MATRIX_LEN.hdf5")




















In [32]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=0)

tsne_obj= tsne.fit_transform(enc_input)

print(tsne_obj.shape)

for mask in masks:
    plt.scatter(x=tsne_obj[:, 0][mask], 
                y=tsne_obj[:, 1][mask],
                alpha=0.5)
plt.show()


(200, 2)


In [146]:
#RECONSTRUCTION
def reconstruct_sequence(seq_index):
    run = train_matrix[seq_index]
    #mask_seq = np.squeeze(np.argwhere(np.mean(run, axis=1) != 0))
    
    reconstr_run = model.predict(np.reshape(run, (1, run.shape[0], run.shape[1])))
    reconstr_run = np.reshape(reconstr_run, (run.shape[0], run.shape[1]))
    reconstr_run = reconstr_run
    df = pd.DataFrame(reconstr_run, columns=["Timestep","Sin", "Cosin", "Lat", "Lon"])
    plt.plot(df['Lon'], df['Lat'])
    plt.show()

for i in range(len(train_matrix)):
    reconstruct_sequence(i)
    if i > 5:
        break
    
reconstruction = model.predict(train_matrix)
reconstruction_unseen = model.predict(unseen_sequences_matrix)



In [11]:
#RECONSTRUCTION ERROR
def get_reconstructed_matrix(input_matrix, reconstrut):
    return_matrix = np.zeros(shape=input_matrix.shape)
    for i,run in enumerate(input_matrix):
        mask_seq = np.squeeze(np.argwhere(np.mean(run, axis=1) != 0))
        return_matrix[i][mask_seq] = reconstrut[i][mask_seq]    
    return reconstrut


train_error = abs(train_matrix-get_reconstructed_matrix(train_matrix,reconstruction))
unseen_runs_error = abs(unseen_sequences_matrix-get_reconstructed_matrix(train_matrix,reconstruction_unseen))

train_error_avg = np.mean(train_error, axis=2)
unseen_error_avg = np.mean(unseen_runs_error, axis=2)
train_error_avg = np.mean(train_error_avg, axis=1)
unseen_error_avg = np.mean(unseen_error_avg, axis=1)
print(train_error_avg.shape, unseen_error_avg.shape)

for mask in masks:
    plt.scatter(np.linspace(1,200,200)[mask],train_error_avg[mask])
plt.title('ERROR ON TRAIN')
plt.show()

for mask in unseen_mask:
    plt.scatter(np.linspace(1,200,200)[mask],unseen_error_avg[mask])
plt.title('ERROR ON Unseen')
plt.show()

(200,) (200,)


In [18]:
import ipyvolume as ipv
import numpy as np
x, y, z = unseen_encoding[:,0], unseen_encoding[:,1], unseen_encoding[:,2]

for mask in unseen_mask:
    ipv.scatter(x[mask], y[mask], z[mask], size=0.3, marker="sphere")
ipv.show()

VBox(children=(Figure(camera=PerspectiveCamera(fov=46.0, position=(0.0, 0.0, 2.0), quaternion=(0.0, 0.0, 0.0, …

[[1.69657601e-04 2.90231335e-03 2.66825905e-01 5.29824561e-01
  8.35837121e-01]
 [1.69657601e-04 2.90231335e-03 3.53738663e-01 5.49122807e-01
  7.10645313e-01]
 [9.93304461e-03 5.59014899e-02 3.40350877e-01 7.50188854e-01
  9.98851458e-01]
 ...
 [0.00000000e+00 6.14035088e-02 3.62061103e-01 4.36881822e-01
  1.00000000e+00]
 [4.80214197e-03 2.02577052e-01 6.96491228e-01 7.76227370e-01
  9.99448401e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00]]
