<a href="https://colab.research.google.com/github/nina-prog/DataAnalysis_VAE/blob/main/VAE_v1_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
sns.set()

In [3]:
### Load ecg5000 data using read_csv
### The separator is a regular expression (one or more whitespace characters).
### It is written as a raw string because '\s' inside a normal string literal
### is an invalid escape sequence and triggers a warning on current Python.
ecg5000 = pd.read_csv('ECG5000_ALL.txt', sep=r'\s+', header=None)

### Delete label-column first (column 0)
### Reassign the result instead of using inplace=True, so the cell builds the
### label-free frame in one explicit step.
ecg5000 = ecg5000.drop(columns=ecg5000.columns[[0]])

In [4]:
### Optional sanity check: print type, shape, element count and the first
### ten rows of the loaded data set.
ecg5000_report = (
    ("Get type of ecg5000: {} \n", type(ecg5000)),
    ("Get dimensions of ecg5000: {} \n", ecg5000.shape),
    ("Get number of elements of ecg5000: {} \n", ecg5000.size),
    ("Display first 10 rows of ecg5000: \n {}", ecg5000.head(10)),
)
for report_template, report_value in ecg5000_report:
    print(report_template.format(report_value))

Get type of ecg5000: <class 'pandas.core.frame.DataFrame'> 

Get dimensions of ecg5000: (5000, 140) 

Get number of elements of ecg5000: 700000 

Display first 10 rows of ecg5000: 
         1         2         3    ...       138       139       140
0 -0.112522 -2.827204 -3.773897  ...  0.123431  0.925286  0.193137
1 -1.100878 -3.996840 -4.285843  ...  0.773820  1.119621 -1.436250
2 -0.567088 -2.593450 -3.874230  ...  0.321097  0.904227 -0.421797
3  0.490473 -1.914407 -3.616364  ...  1.086798  1.403011 -0.383564
4  0.800232 -0.874252 -2.384761  ...  0.971020  1.614392  1.421456
5 -1.507674 -3.574550 -4.478011  ...  1.634990  1.493366 -0.783134
6 -0.297161 -2.766635 -4.102185  ...  1.110407  1.288165 -0.823386
7  0.446769 -1.507397 -3.187468  ...  1.258433  0.961215 -0.999476
8  0.087631 -1.753490 -3.304473  ...  0.192971 -0.648683 -2.441068
9 -0.832281 -1.700368 -2.257301  ...  2.126852  1.679299  0.965814

[10 rows x 140 columns]


In [5]:
### Min-max normalization of the dataframe into the range [-0.8, 0.8]
### General rule for a target range [a, b]: x = (b-a)((x-min)/(max-min)) + a
### DataFrame.min() / DataFrame.max() reduce along the rows by default, so
### every column is scaled with its own minimum and maximum.
column_min = ecg5000.min()
column_max = ecg5000.max()
normalized_ecg5000 = (0.8 + 0.8) * (ecg5000 - column_min) / (column_max - column_min) - 0.8

In [6]:
### Split Data into 80/20 Training, Test
### A fixed random_state makes the shuffle reproducible: re-running the
### notebook yields the same training and test rows.
trainDF, testDF = train_test_split(normalized_ecg5000, test_size=0.2, random_state=42)
### Flatten Values to 1D NP Array
x_train = trainDF.values.flatten()
x_test = testDF.values.flatten()

print("Shape of Training DataFrame: \t {}".format(trainDF.shape))
print("Shape of Test DataFrame: \t {}".format(testDF.shape))


Shape of Training DataFrame: 	 (4000, 140)
Shape of Test DataFrame: 	 (1000, 140)


In [8]:
### For better understanding visit: https://towardsdatascience.com/autoencoders-for-the-compression-of-stock-market-data-28e8c1a2da3e
### or for code: https://gist.github.com/GerardBCN/40349b39bc45d4550141aff6966d1619#file-stock_price_autoencoding-ipynb

### Hyperparameters
window_length = 10   # number of time steps per input sequence
encoding_dim = 3     # size of the latent vector produced by the encoder
epochs = 100
test_samples = 2000

### Input size: the model consumes sequences of shape (window_length, 1), so
### the samples are cut into consecutive, non-overlapping windows of
### window_length values. reshape(-1, ...) accepts the flattened 1D arrays as
### well as 2D arrays and raises a ValueError when the number of values is not
### a multiple of window_length.
x_train_windows = np.asarray(x_train).reshape(-1, window_length, 1)
x_test_windows = np.asarray(x_test).reshape(-1, window_length, 1)

### Encoder: an LSTM compresses each window into encoding_dim values
inputs = keras.Input(shape=(window_length, 1))
encoded = layers.LSTM(encoding_dim, activation='tanh')(inputs)

### Decoder: repeat the latent vector once per time step and let a second
### LSTM emit one value per step
decoded = layers.RepeatVector(window_length)(encoded)
decoded = layers.LSTM(1, return_sequences=True, activation='tanh')(decoded)

sequence_autoencoder = keras.Model(inputs, decoded)
encoder = keras.Model(inputs, encoded)
sequence_autoencoder.summary()

### Mean squared error instead of binary cross-entropy: the targets are
### normalized to [-0.8, 0.8] and the tanh output lies in [-1, 1], whereas
### binary cross-entropy is only defined for values in [0, 1].
sequence_autoencoder.compile(optimizer='adam', loss='mse')
history = sequence_autoencoder.fit(x_train_windows, x_train_windows,
                epochs=epochs,
                batch_size=1024,
                shuffle=True,
                validation_data=(x_test_windows, x_test_windows))

### Reconstructed test windows, shape (number of windows, window_length, 1)
decoded_ecg5000 = sequence_autoencoder.predict(x_test_windows)

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 10, 1)]           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 3)                 60        
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 10, 3)             0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 10, 1)             20        
Total params: 80
Trainable params: 80
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch