<a href="https://colab.research.google.com/github/nina-prog/DataAnalysis_VAE/blob/main/VAE_v1_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.utils.vis_utils import plot_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Data Preprocessing

---



## Load Data

In [43]:
### Load ecg5000 data using read_csv
ecg5000 = pd.read_csv('ECG5000_ALL.txt', sep='\s+', header=None)

### Delete label-column first (column 0)
ecg5000.drop(ecg5000.columns[[0]], axis=1, inplace=True)

In [None]:
### Optional test and info about data set
print("Type of ecg5000: \t \t {}".format(type(ecg5000)))
print("Dimensions of ecg5000: \t \t {}".format(ecg5000.shape))
print("Number of elements of ecg5000: \t {}".format((ecg5000.size)))
print("Display first 10 rows of ecg5000: \n {}".format(ecg5000.head(10)))

## Scale Data

In [None]:
### Normalize dataframe with min-max-normalization to range between [-0.8, 0.8]
# (1) For any [a, b] normalization use: x = (b-a)((x-min)/(max-min))+ a
scaled_ecg5000 = (0.8 + 0.8)*(ecg5000 - ecg5000.min())/(ecg5000.max() - ecg5000.min())-0.8
print(normalized_ecg5000)

In [None]:
# Or (2) Use sklearn MinMaxScaler
min_max_scaler = MinMaxScaler(feature_range=(-0.8,0.8))
scaled_ecg5000 = pd.DataFrame(min_max_scaler.fit_transform(ecg5000))
print(scaled_ecg5000)

## Split Data

In [45]:
### Split Data into 80/20 Training, Test
trainDF, testDF = train_test_split(scaled_ecg5000, test_size=0.2)
# Optional test and info about new data sets
print("Shape of Training DataFrame: \t {}".format(trainDF.shape))
print("Shape of Test DataFrame: \t {}".format(testDF.shape))

Shape of Training DataFrame: 	 (4000, 140)
Shape of Test DataFrame: 	 (1000, 140)


## Reshape Data

In [None]:
# (1) 1D Array
### Flatten Values to 1D NP Array
x_train = trainDF.values.flatten()
x_test = testDF.values.flatten()

### Reshape input into [samples, timesteps, features]
n_train = len(x_train)
n_test = len(x_test)
x_train = x_train.reshape((1, n_train, 1)) # samples 1+, time steps n_train (4000*140), features 1  
x_test = x_test.reshape((1, n_test, 1)) # samples 1+, time steps n_test (1000*140), features 1

### Properties
print("Time steps of reshaped train dataset: {}".format(n_train))
print("Time steps of reshaped test dataset: {}".format(n_test))

In [77]:
# Or (2) Leave Dataframe Dimesions as they are
### Convert to array
x_train = trainDF.to_numpy()
x_test = testDF.to_numpy()

### Reshape input into [samples, timesteps, features]
n_train = len(trainDF.index) # time steps
n_test = len(testDF.index) # time steps
f_train = len(trainDF.columns) # features
f_test = len(testDF.columns) # features
x_train = x_train.reshape(1, n_train, f_train)
x_test = x_test.reshape(1, n_test, f_test)

### Properties
print("Time steps of reshaped train dataset: {}".format(n_train))
print("Time steps of reshaped test dataset: {}".format(n_test))

Time steps of reshaped train dataset: 4000
Time steps of reshaped test dataset: 1000


# Build Autoencoder (AE)

---



In [92]:
### For better understanding visit: https://towardsdatascience.com/autoencoders-for-the-compression-of-stock-market-data-28e8c1a2da3e
### or for code: https://gist.github.com/GerardBCN/40349b39bc45d4550141aff6966d1619#file-stock_price_autoencoding-ipynb
### For Reshaping Issues: https://machinelearningmastery.com/reshape-input-data-long-short-term-memory-networks-keras/

### fit model
encoding_dim = 3
epochs = 50

### define model
inputs = keras.Input(shape=(n_train, f_train), name='Input_layer')
encoded = layers.LSTM(encoding_dim, activation='tanh', name='Encode_1')(inputs)
decoded = layers.RepeatVector(n_train, name='Bootleneck')(encoded)
decoded = layers.LSTM(1, return_sequences=True, activation='tanh', name='Decode_1')(decoded)
decoded = TimeDistributed(Dense(140, activation='tanh'),name='Output_Layer')(decoded)

sequence_autoencoder = keras.Model(inputs, decoded)
encoder = keras.Model(inputs, encoded)

sequence_autoencoder.summary()

### train AE
sequence_autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
history = sequence_autoencoder.fit(x_train, x_train,
                epochs=epochs,
                batch_size=1024,
                shuffle=True,
                validation_data=(x_train, x_train))

### recreation


Model: "functional_65"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input_layer (InputLayer)     [(None, 4000, 140)]       0         
_________________________________________________________________
Encode_1 (LSTM)              (None, 3)                 1728      
_________________________________________________________________
Bootleneck (RepeatVector)    (None, 4000, 3)           0         
_________________________________________________________________
Decode_1 (LSTM)              (None, 4000, 1)           20        
_________________________________________________________________
Output_Layer (TimeDistribute (None, 4000, 140)         280       
Total params: 2,028
Trainable params: 2,028
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 

# Plot Results

---



In [93]:
print(x_train.shape)
print(decoded_ecg5000.shape)

print(x_train)
print(decoded_ecg5000)

# container = {"ecg5000": ecg5000, "decoded ecg5000": decoded_ecg5000}
# values = pd.DataFrame(data=container)
# plt.plot(val, linewidth = 1, figsize=(25,6), title='Autoencoder Result')

(1, 4000, 140)
(1, 4000, 140)
[[[-0.1110686  -0.17686869 -0.4458685  ... -0.0369936  -0.22963038
   -0.47184174]
  [-0.03321507 -0.12827784 -0.45126996 ...  0.21151635  0.18217461
   -0.19577157]
  [ 0.20717933  0.23120738  0.00732081 ... -0.16530639 -0.08224348
   -0.04169195]
  ...
  [ 0.15005455  0.11401331 -0.18378076 ... -0.46430011 -0.34326688
   -0.26210482]
  [-0.03674064 -0.11266574 -0.49163153 ...  0.27470306  0.28476384
   -0.04472052]
  [ 0.10788321  0.043425   -0.22025965 ... -0.25423979 -0.1526599
   -0.28085446]]]
[[[ 0.00769053 -0.01822397 -0.02795841 ... -0.02164804  0.02115758
   -0.01978468]
  [ 0.0129608  -0.03916315 -0.04317983 ... -0.03903757  0.03501612
   -0.02421514]
  [ 0.02004946 -0.05995862 -0.05959188 ... -0.05216741  0.04540386
   -0.02774159]
  ...
  [ 0.10942466 -0.18857788 -0.2499218  ... -0.04889083  0.04199147
   -0.04706485]
  [ 0.10942466 -0.18857788 -0.2499218  ... -0.04889083  0.04199147
   -0.04706485]
  [ 0.10942466 -0.18857788 -0.2499218  ... -