In [45]:
def create_dataset(X, y, time_steps_x, time_steps_y):
    """
    Slice a predictor frame and a target series into aligned rolling windows

    Arguments :

    X : pandas object supporting len() and .iloc row slicing; source of the input windows
    y : pandas object supporting .iloc slicing; source of the target windows
    time_steps_x : number of consecutive rows in every input window
    time_steps_y : number of consecutive rows in every target window; each target
                   window starts on the row right after its input window ends

    Returns:

    (np.array(Xs), np.array(ys)) : the stacked input windows and the stacked target
                                   windows, one entry per window position. Both are
                                   empty when X has fewer than
                                   time_steps_x + time_steps_y rows

    """
    # Number of start positions for which both the input window and the
    # target window that follows it fit inside X (non-positive -> no windows).
    n_windows = len(X) - time_steps_x - time_steps_y + 1

    input_windows = [
        X.iloc[start:start + time_steps_x].values
        for start in range(n_windows)
    ]
    target_windows = [
        y.iloc[start + time_steps_x:start + time_steps_x + time_steps_y].values
        for start in range(n_windows)
    ]
    return np.array(input_windows), np.array(target_windows)

In [None]:
# Defining the neural network structure

# The kernel initializer sets the initial (random) weights of the neural net. Here we use Glorot normal.
# Passing a seed to the kernel initializer makes the random initialization reproducible.
# https://keras.io/api/layers/initializers/

# return_sequences=True makes an LSTM layer output its hidden state at every time step
# instead of only at the last one. This is required when stacking LSTMs, because the
# next LSTM layer needs a full sequence as its input.

# A kernel regularizer (L2) is used on the first LSTM layer

# ReLU activation is used

# Activation functions
# https://machinelearningmastery.com/choose-an-activation-function-for-deep-learning/#:~:text=Activation%20functions%20are%20a%20critical,design%20of%20a%20neural%20network.&text=Activation%20functions%20are%20a%20key,the%20type%20of%20prediction%20problem.

def autoencoder_model(array):
    """
    Build the (uncompiled) stacked-LSTM auto-encoder

    Arguments:

    array : array whose shape[1] and shape[2] size the network: shape[1] is used as
            the sequence length (input shape and RepeatVector) and shape[2] as the
            number of features (input shape and output Dense units). The values of
            the array are not read.

    Returns:

    model : keras Model going from the input window through a 128-64-32 LSTM encoder,
            a RepeatVector, and a 32-64-128 LSTM decoder to a time-distributed Dense layer

    Note : seed_no and reg_lambda are not parameters; they are read from module scope.
           The L2 kernel regulariser is applied to the first LSTM layer only.

    """
    n_steps, n_features = array.shape[1], array.shape[2]

    def lstm(units, return_sequences=True, kernel_regularizer=None):
        # Every LSTM layer shares the same seeded Glorot-normal initialiser and ReLU activation.
        return LSTM(
            units,
            kernel_initializer=initializers.glorot_normal(seed_no),
            activation='relu',
            return_sequences=return_sequences,
            kernel_regularizer=kernel_regularizer,
        )

    inputs = Input(shape=(n_steps, n_features))

    # Encoder: the last layer returns only its final state (the bottleneck vector).
    encoded = lstm(128, kernel_regularizer=regularizers.l2(reg_lambda))(inputs)
    encoded = lstm(64)(encoded)
    bottleneck = lstm(32, return_sequences=False)(encoded)

    # Decoder: repeat the bottleneck once per time step and unroll it again.
    decoded = RepeatVector(n_steps)(bottleneck)
    decoded = lstm(32)(decoded)
    decoded = lstm(64)(decoded)
    decoded = lstm(128)(decoded)

    projection = Dense(n_features, kernel_initializer=initializers.glorot_normal(seed_no))
    output = TimeDistributed(projection)(decoded)
    return Model(inputs=inputs, outputs=output)


def hvd_train(train_list, learning_rate, reg_lambda, batch_size, epochs):
    """
    Train the auto-encoder on the training CSV with Horovod

    Arguments :

    train_list : list of training wells; only train_list[0] is used, as the key of timestamp_dict
    learning_rate : base learning rate; multiplied by hvd.size() before being given to SGD
    reg_lambda : regularisation parameter (hyperparameter).
                 NOTE(review): this argument is never read in the body; autoencoder_model
                 takes reg_lambda from module scope instead - confirm this is intended
    batch_size : batch size passed to fit
    epochs : number of epochs passed to fit

    Returns:

    AE_model : the fitted auto encoder model
    timestamp_dict : {train_list[0]: str(maximum TimeStamp found in the training CSV)}
    History : object returned by AE_model.fit; also stored in the module-level name History

    """
    global History

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=session_config))

    # The training file has been cleaned of anomalies based on LPO codes
    training_frame = read_csv('filename.csv', data_path)
    latest_timestamp = str(training_frame.TimeStamp.max())
    timestamp_dict = {train_list[0]: latest_timestamp}

    # Pre-processing: keep the gauge columns only, scale them to [0, 1], then window them
    scaled_gauges = MinMaxScaler().fit_transform(training_frame[gauge_cols])
    # NOTE(review): two arguments are passed here, while the create_dataset visible in this
    # file takes (X, y, time_steps_x, time_steps_y) - confirm which definition is meant
    windows = create_dataset(scaled_gauges, window_size)

    # The middle 20% of the windows (from 40% to 60%) is held out for validation;
    # everything before and after it is used for training.
    holdout_start = math.floor(0.4 * len(windows))
    holdout_stop = math.floor(0.6 * len(windows))
    x_test = windows[holdout_start:holdout_stop]
    x_train = np.concatenate(
        (windows[:holdout_start], windows[holdout_stop:]), axis=0, out=None
    )

    AE_model = autoencoder_model(x_train)

    # Learning rate is scaled by the number of workers, and the optimizer is wrapped
    # in Horovod's distributed optimizer.
    optimizer = hvd.DistributedOptimizer(
        keras.optimizers.SGD(lr=learning_rate * hvd.size())
    )
    AE_model.compile(optimizer=optimizer, loss='mae')

    # Broadcast initial variable states from rank 0 to all other processes so that
    # every worker starts from the same weights.
    callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]

    # Checkpoints are written by worker 0 only, so other workers cannot corrupt them.
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

    # Auto-encoder: the input windows are also the targets.
    History = AE_model.fit(
        x_train,
        x_train,
        batch_size=batch_size,
        callbacks=callbacks,
        epochs=epochs,
        verbose=2,
        validation_data=(x_test, x_test),
    )
    return AE_model, timestamp_dict, History

In [51]:
def indices(indices):
    """
    Getting start and end index from a range of indices

    The input is scanned for runs of consecutive integers (each value exactly one
    more than the previous one). Every run longer than one element contributes
    its first and last value to the result; isolated values are dropped.

    Arguments :

    indices : iterable of integers, in the order in which runs should be detected

    Returns:

    flat list of ints [start_0, end_0, start_1, end_1, ...]; empty when the input
    has no run of two or more consecutive values

    """
    from itertools import count, groupby

    # Within a run of consecutive integers, value - position is constant, so
    # grouping on that difference splits the input into its runs.
    position = count()
    bounds = []
    for _, run in groupby(indices, key=lambda value: value - next(position)):
        run = list(run)
        if len(run) > 1:
            # Endpoints are taken directly from the run instead of being parsed
            # back out of a "start-end" string, so negative values work too.
            bounds.append(int(run[0]))
            bounds.append(int(run[-1]))
    return bounds

def plots_with_ranges():
    """
    Plotting every column against time together with the risk factor and the build-up ranges

    Takes no arguments; reads the module-level names columns, concat_data, anomaly,
    risk_factor and bu_index.

    For each entry of columns one figure is drawn:
      - concat_data[column] as a line and anomaly[column] as red points on the left axis
      - risk_factor as a black line on a twin right axis
      - for every consecutive (start, end) pair of bu_index a green span from start to end
        and a red span covering the last 144 x-units before end
        (a trailing unpaired entry of bu_index is ignored)

    Returns:

    None

    NOTE(review): every figure is saved to the same path 'fig.png', so after the loop the
    file holds only the figure of the last column - confirm this is intended.

    """
    for column in columns:
        fig, value_axis = plt.subplots(figsize=(18, 5))

        # Left axis: the column values, with the anomalous points highlighted
        value_axis.set_xlabel('Time (10 Minutes)')
        value_axis.set_ylabel(column, color='tab:red')
        value_axis.plot(concat_data.index, concat_data[column], 'blue', label='non-anomalous pred')
        value_axis.scatter(anomaly.index, anomaly[column], color='red', s=10, label='anomalous pred')
        value_axis.tick_params(axis='y', labelcolor='tab:red')

        # Right axis shares the x-axis, so the x-label set above is reused
        risk_axis = value_axis.twinx()
        risk_axis.set_ylabel('risk_factor', color='tab:blue')
        risk_axis.plot(risk_factor.index, risk_factor, color='black')
        risk_axis.tick_params(axis='y', labelcolor='tab:blue')
        plt.title(str(column) + ' vs Time')

        # bu_index is a flat sequence start_0, end_0, start_1, end_1, ...
        for first in range(0, len(bu_index) - 1, 2):
            span_start = bu_index[first]
            span_end = bu_index[first + 1]
            plt.axvspan(span_start, span_end, color='green', alpha=0.2, label='buildup')
            plt.axvspan(span_end - 144, span_end, color='red', alpha=0.2, label='buildup')

        plt.savefig('fig.png')

In [50]:
# Example call: every run of consecutive integers longer than one element yields
# its first and last value; the isolated 13 yields nothing.
indices([1,2,3,5,6,7,10,11,13,15,16])

# Recorded cell output:
[1, 3, 5, 7, 10, 11, 15, 16]

In [41]:
import numpy as np
# import tensorflow as tf
# from tensorflow import keras
import pandas as pd
# import seaborn as sns
# from pylab import rcParams
# import matplotlib.pyplot as plt
# from matplotlib import rc
# from pandas.plotting import register_matplotlib_converters



# importing 
import mlflow
import mlflow.sklearn
from numpy.random import seed
seed(seed_no)
from tensorflow import set_random_seed
set_random_seed(seed_no)
import random
random.seed(seed_no)

import os
import pandas as pd
import numpy as np
import math
import pickle
import matplotlib.pyplot as plt
import itertools as it
from datetime import date,datetime
from time import time

# Keras libs
import keras
from keras.models import Sequential
from keras.layers import Dropout, Dense, Flatten, Input, LSTM, TimeDistributed, RepeatVector
from keras import backend as K
from keras.models import Model
from keras import regularizers
from keras import initializers

from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import horovod.keras as hvd

import datetime

hvd.init()

# Toy frame (4 rows, 3 columns) used to try out create_dataset
a = [[1,2,3], [4,5,6], [7,8,9], [10,11,12]]
df = pd.DataFrame(a, columns=['A', 'B', 'C'])

from sklearn.preprocessing import StandardScaler

# Fit the scaler on all three columns, then rebuild a labelled frame
# from the scaled values
scaler = StandardScaler().fit(df[['A', 'B', 'C']])
sc = scaler.transform(df[['A', 'B', 'C']])
df_scaled = pd.DataFrame(sc, columns=['A', 'B', 'C'])

# Not referenced by the create_dataset call below, which passes its window lengths literally
time_steps = 2

# Window the predictors (A, B) and the target (C):
# 1 input row per window, followed by 3 target rows
X_train, y_train = create_dataset(
    df_scaled[['A', 'B']],
    df_scaled['C'],
    1,
    3,
)

  return self.partial_fit(X, y)
