<a href="https://colab.research.google.com/github/nina-prog/DataAnalysis_VAE/blob/main/VAE_v1_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Activate seaborn's default plot styling for every figure drawn below
# (in recent seaborn releases this is an alias of `sns.set_theme()`).
sns.set()

# Data Preprocessing

---



## Load Data

In [3]:
### Load ecg5000 data using read_csv
# The separator is a regular expression (one or more whitespace characters).
# It is written as a raw string so that `\s` is not treated as an (invalid)
# Python string escape, which raises a warning on newer Python versions.
ecg5000 = pd.read_csv('ECG5000_ALL.txt', sep=r'\s+', header=None)

### Delete label-column first (column 0)
# Reassign rather than dropping in place, so the cell reads as a plain
# "load -> drop" pipeline and never mutates a frame another cell may hold.
ecg5000 = ecg5000.drop(columns=ecg5000.columns[0])

In [None]:
### Optional sanity check: print a short summary of the loaded data set
# Each entry pairs a message template with the value to report.
ecg5000_summary = [
    ("Type of ecg5000: \t \t {}", type(ecg5000)),
    ("Dimensions of ecg5000: \t \t {}", ecg5000.shape),
    ("Number of elements of ecg5000: \t {}", ecg5000.size),
    ("Display first 10 rows of ecg5000: \n {}", ecg5000.head(10)),
]
for template, value in ecg5000_summary:
    print(template.format(value))

## Scale Data

In [None]:
### Normalize dataframe with min-max-normalization to range between [-0.8, 0.8]
# (1) For any [a, b] normalization use: x = (b-a)((x-min)/(max-min))+ a
# The minimum and maximum are taken per column, so every column is scaled
# independently.
a, b = -0.8, 0.8
col_min = ecg5000.min()
col_max = ecg5000.max()
scaled_ecg5000 = (b - a) * (ecg5000 - col_min) / (col_max - col_min) + a
# Print the variable that was actually assigned above (the previous name
# `normalized_ecg5000` was never defined and raised a NameError).
print(scaled_ecg5000)

In [None]:
# Or (2) Use sklearn MinMaxScaler
# Fit the scaler on the data and map it into the range [-0.8, 0.8]; the
# transformed array is wrapped in a DataFrame again for the following cells.
# NOTE(review): the scaler is fit on the full data set before the
# train/test split below -- confirm that this is intended.
min_max_scaler = MinMaxScaler(feature_range=(-0.8, 0.8))
scaled_values = min_max_scaler.fit_transform(ecg5000)
scaled_ecg5000 = pd.DataFrame(scaled_values)
print(scaled_ecg5000)

## Split Data

In [None]:
### Split Data into 80/20 Training, Test
# A fixed random_state makes the shuffle reproducible, so re-running the
# notebook yields the same training and test rows every time.
trainDF, testDF = train_test_split(scaled_ecg5000, test_size=0.2, random_state=42)
# Optional test and info about new data sets
print("Shape of Training DataFrame: \t {}".format(trainDF.shape))
print("Shape of Test DataFrame: \t {}".format(testDF.shape))

## Reshape Data

In [None]:
# (1) 1D Array
### Flatten each frame into one long series and give it the 3D layout
### (samples, time steps, features) = (1, number of values, 1)
x_train = trainDF.to_numpy().flatten().reshape(1, -1, 1)
x_test = testDF.to_numpy().flatten().reshape(1, -1, 1)

### The number of time steps equals the number of values in each flattened frame
n_train = x_train.shape[1]
n_test = x_test.shape[1]

### Properties
for message, steps in (
    ("Time steps of reshaped train dataset: {}", n_train),
    ("Time steps of reshaped test dataset: {}", n_test),
):
    print(message.format(steps))

In [None]:
# Or (2) Leave Dataframe Dimensions as they are
### convert to array
x_train = trainDF.to_numpy()
x_test = testDF.to_numpy()

### Reshape
# Rows become time steps and columns become features; a leading axis of
# size 1 is added so each array is a single sample of shape
# (1, time steps, features).
n_train = len(trainDF.index) # time steps
n_test = len(testDF.index) # time steps
f_train = len(trainDF.columns) # features
f_test = len(testDF.columns) # features
x_train = x_train.reshape(1, n_train, f_train)
# Assign to x_test here: the previous code stored the reshaped test data in
# x_train, overwriting the training set and leaving x_test two-dimensional.
x_test = x_test.reshape(1, n_test, f_test)

###Properties
print("Time steps of reshaped train dataset: {}".format(n_train))
print("Time steps of reshaped test dataset: {}".format(n_test))

# Build Autoencoder (AE)

---



In [None]:
### For better understanding visit: https://towardsdatascience.com/autoencoders-for-the-compression-of-stock-market-data-28e8c1a2da3e
### or for code: https://gist.github.com/GerardBCN/40349b39bc45d4550141aff6966d1619#file-stock_price_autoencoding-ipynb
### For Reshaping Issues: https://machinelearningmastery.com/reshape-input-data-long-short-term-memory-networks-keras/

encoding_dim = 3  # number of units in the encoder LSTM (size of the encoding)
epochs = 50

# Take the sequence length and feature count from x_train itself, so the
# model matches whichever reshape option produced it: (1, n, 1) for the
# flattened series or (1, rows, columns) when the frame layout is kept.
timesteps, n_features = x_train.shape[1:]

# Encoder: compress the whole input sequence into one vector of size encoding_dim.
inputs = keras.Input(shape=(timesteps, n_features))
encoded = layers.LSTM(encoding_dim, activation='tanh')(inputs)

# Decoder: repeat the encoding once per time step and map it back to a
# sequence with the same number of features as the input.
decoded = layers.RepeatVector(timesteps)(encoded)
decoded = layers.LSTM(n_features, return_sequences=True, activation='tanh')(decoded)

sequence_autoencoder = keras.Model(inputs, decoded)
encoder = keras.Model(inputs, encoded)
sequence_autoencoder.summary()

# The data was scaled to [-0.8, 0.8] and the decoder ends in tanh, so the
# targets and predictions are real-valued and can be negative. Binary
# cross-entropy is only defined for values in [0, 1]; mean squared error is
# the appropriate reconstruction loss here.
sequence_autoencoder.compile(optimizer='adam', loss='mse')
# NOTE(review): validation reuses the training data, so val_loss is not a
# held-out estimate. x_test cannot be passed as-is because its number of
# time steps differs from the fixed input length of this model.
history = sequence_autoencoder.fit(x_train, x_train,
                epochs=epochs,
                batch_size=1024,
                shuffle=True,
                validation_data=(x_train, x_train))

decoded_ecg5000 = sequence_autoencoder.predict(x_train)

# Plot Results

---

