In [None]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM , Embedding, BatchNormalization
from tensorflow.keras.layers import Dense, Activation, Input,Dropout
# Hyperparameters shared by the cells below.
vocab_size = 10000      # forwarded to imdb.load_data as num_words
embedding_dim = 64
seqnc_length = 10000    # width of the model's Input and output layers

# Load the IMDB train/test splits with the loader options used in this notebook.
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=vocab_size,
    skip_top=20,
)


In [None]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler 
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers

In [None]:
# Largest word index found in each individual training review.
max_index_per_review = [max(review) for review in x_train]
print(type(max_index_per_review))

# Overall largest index across all reviews (shown as the cell output).
max(max_index_per_review)

In [None]:
# Decode one review back into readable text.

# 1. Fetch the word -> integer index mapping that ships with the dataset.
word_index = imdb.get_word_index()

# 2. Invert it so that integer indices map back to their words.
reverse_word_index = {index: word for word, index in word_index.items()}

# 3. Translate the first training review. Indices are shifted by 3 because
#    0, 1 and 2 are reserved for "padding", "start of sequence" and "unknown";
#    any index without a matching word is rendered as '?'.
decoded_words = [reverse_word_index.get(i - 3, '?') for i in x_train[0]]
decoded_review = ' '.join(decoded_words)

decoded_review

We cannot feed lists of integers directly into our deep neural network; we first need to convert them into tensors.

To prepare our data, we will one-hot encode our lists, turning them into vectors of 0s and 1s. This blows up each of our sequences into a 10,000-dimensional vector that contains 1 at every index corresponding to an integer present in that sequence, and 0 at every index whose integer is not present in the sequence.

Simply put, in the 10,000-dimensional vector corresponding to each review:

- Every index corresponds to a word.
- Every index with value 1 is a word that is present in the review, denoted by its integer counterpart.
- Every index containing 0 is a word that is not present in the review.

We will vectorize our data manually for maximum clarity. This will result in tensors of shape (25000, 10000).

In [None]:
import numpy as np

def vectorize_sequences(sequences, dimension=10000):
  """Multi-hot encode integer sequences into a 2-D NumPy array.

  Args:
    sequences: sized iterable of integer index sequences, one per sample.
    dimension: number of columns in the result; every index in every
      sequence must be a valid column index.

  Returns:
    A float array of shape (len(sequences), dimension) holding 1 at each
    (row, index) pair that occurs in the matching sequence and 0 elsewhere.
    Repeated indices within a sequence still produce a single 1.
  """
  shape = (len(sequences), dimension)
  multi_hot = np.zeros(shape)
  for row, word_indices in enumerate(sequences):
    # Fancy indexing switches on every listed column of this row at once.
    multi_hot[row, word_indices] = 1.0
  return multi_hot
# Multi-hot encode both splits, replacing the raw integer sequences.
# NOTE: x_train / x_test are overwritten here, so re-running this cell would
# feed the already-encoded arrays back into vectorize_sequences.
x_train, x_test = vectorize_sequences(x_train), vectorize_sequences(x_test)

In [None]:
# Convert both label collections to float32 NumPy arrays.
y_train = np.array(y_train, dtype='float32')
y_test = np.array(y_test, dtype='float32')

In [None]:
# Symbolic model input: one flat vector of length seqnc_length per sample.
input_shape = (seqnc_length,)
inpt_vec = Input(shape=input_shape)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Activation, Input
from tensorflow.keras.layers import BatchNormalization, Dropout
from tensorflow.keras.layers import Embedding, LSTM
from tensorflow.keras.layers import RepeatVector, TimeDistributed
from tensorflow.keras.datasets import mnist
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

In [None]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler 
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers

In [None]:
# Encoder: progressively narrower tanh layers, each with an L1 activity
# penalty, followed by a 6-unit ReLU bottleneck without a penalty.
encoded = inpt_vec
for width in (100, 50, 25, 12):
    encoded = Dense(
        width,
        activation='tanh',
        activity_regularizer=regularizers.l1(10e-5),
    )(encoded)
encoded = Dense(6, activation='relu')(encoded)

In [None]:
# Decoder: widen the bottleneck representation back out through tanh layers.
decoded = encoded
for width in (12, 25, 50, 100):
    decoded = Dense(width, activation='tanh')(decoded)

In [None]:
# Output layer: seqnc_length sigmoid units, matching the width of the input.
output_layer = Dense(units=seqnc_length, activation='sigmoid')(decoded)

In [None]:
# Assemble the end-to-end model and configure training (RMSprop optimizer,
# mean-squared-error loss, accuracy reported as an extra metric).
autoencoder = Model(inputs=inpt_vec, outputs=output_layer)
autoencoder.compile(
    optimizer="rmsprop",
    loss="mse",
    metrics=['accuracy'],
)

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import matplotlib.pyplot as plt
# Training callbacks, both watching the validation loss:
#  - reduce_lr halves the learning rate after 3 epochs without an improvement
#    of at least 1e-4;
#  - stop_alg ends training after 7 epochs without improvement and restores
#    the best weights seen so far.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.5,
    patience=3,
    min_delta=1e-4,
    verbose=1,
)
stop_alg = EarlyStopping(
    monitor='val_loss',
    patience=7,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
hist = autoencoder.fit(x_train, y_train, batch_size=100, epochs=100,callbacks=[stop_alg, reduce_lr], shuffle=True, validation_data=(x_test, y_test))

In [None]:
scores = autoencoder.evaluate(x_test, y_test, verbose=0)

In [None]:
# Show the raw evaluation results, then the second entry as a percentage.
print(scores)
accuracy_percent = scores[1] * 100
print("Accuracy: {:.2f}%".format(accuracy_percent))

In [None]:
# Persist the trained weights, then plot the per-epoch training and
# validation loss recorded by fit().
autoencoder.save_weights("autoencoder.hdf5")

fig = plt.figure(figsize=(10,6))
plt.plot(hist.history['loss'], color='#785ef0')
plt.plot(hist.history['val_loss'], color='#dc267f')
plt.title('Model Loss Progress')
# The model is compiled with loss="mse", so label the axis with that loss.
plt.ylabel('Mean Squared Error Loss')
plt.xlabel('Epoch')
# The validation curve comes from the test split passed as validation_data.
plt.legend(['Training Set', 'Test Set'], loc='upper right')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Run the trained model on the test inputs.
y_hat = autoencoder.predict(x_test)

# gets the ROC
# NOTE(review): presumably disabled because y_hat has one column per output
# unit, while roc_curve expects a single score per sample -- confirm.
##fpr, tpr, thresholds = roc_curve(y_test, y_hat)
##roc_auc = auc(fpr, tpr)

# plots ROC
## fig = plt.figure(figsize=(10,6))
## plt.plot(fpr, tpr, color='#785ef0', label='ROC curve (AUC = %0.2f)' % roc_auc)
## plt.plot([0, 1], [0, 1], color='#dc267f', linestyle='--')
## plt.xlim([0.0, 1.0])
## plt.ylim([0.0, 1.05])
## plt.xlabel('False Positive Rate')
## plt.ylabel('True Positive Rate')
## plt.title('Receiver Operating Characteristic Curve')
## plt.legend(loc="lower right")
## plt.show()

In [None]:
# Copy the predictions into a flat array with one slot per test sample.
# NOTE(review): each y_pred[i] can hold only a single scalar, so this assignment
# works only if every `score` contains exactly one value. The output layer is
# built with seqnc_length units, in which case this would presumably raise --
# confirm the intended per-sample reduction before relying on y_pred.
y_pred = np.zeros(len(y_hat))
for i, score in enumerate(y_hat):
    y_pred[i] = np.array([score])