In [1]:
import pandas as pd
from hmmlearn import hmm
import numpy as np
from sklearn import preprocessing
from scipy.stats import norm
import pomegranate

# Getting data and diving them into unique unit numbers

We need to divide data into unique numbers, because the state restes as the unit number changes, so we need to find Gaussian distribution for different unit numbers

In [3]:
data = pd.read_csv('~/Documents/hitachi/CMAPSS/train_FD001.txt', sep=" ", header=None)
unique_unit_values = data[0].unique() #Number of units
data_cycles = []
for unit_num in unique_unit_values:
    data_cycles.append(data[data[0] == unit_num])

# Removing operational settings and normalize the data column wise

In [4]:
def normalize(data):
    x = data.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    dataNew = pd.DataFrame(x_scaled)
    return dataNew
#Remove the operation settings
dataT = data[data.columns[5:26]]
dataT.columns = range(21)
dataT = normalize(dataT)

# Dividing data for each unit

I think this is why my transitional matrix previously was not working properly as in each unit the state resets and start from good condition

In [5]:
dataT_cycles = []
for unit_num in unique_unit_values:
    dataT_cycles.append(dataT[data[0] == unit_num])

# Identifying and removing non variable data columns

Removing the columns where the data does not vary

In [6]:
for dataT_cycle in dataT_cycles:
    print(dataT_cycle.columns[dataT_cycle.std() == 0])
"""
Here we can see 0,4,9,15,17,18 but also 5 at many places so we drop column number 5 as well
"""
dataT.drop(data.columns[[0, 3, 4, 5, 9, 15, 17, 18]],axis=1,inplace=True)
dataT.columns = range(13)
dataT_cycles = []
for unit_num in unique_unit_values:
    dataT_cycles.append(dataT[data[0] == unit_num])

Int64Index([0, 4, 5, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 5, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 5, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 5, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 5, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 5, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 5, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 5, 9, 15, 17, 18], dtype='int64')
Int64Index([0, 4, 5, 9, 15, 17, 18], dtype='i

In [33]:
# Right now only using the first data frame (i.e Machine 1) to train the VAE, but we can combine all the dataframes
# and train the VAE jointly on the entire data for better performance 

x_train = dataT_cycles[0].values[:150]
x_test = dataT_cycles[0].values[151:198]
x_train.shape
# x_test.shape

(150, 13)

# Variational AutoEncoders to find Latent State Space Distribution



In [35]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from keras.layers import Lambda, Input, Dense
from keras.models import Model
from keras.datasets import mnist
from keras.losses import mse, binary_crossentropy
from keras.utils import plot_model
from keras import backend as K

import numpy as np
import matplotlib.pyplot as plt
import argparse
import os

In [36]:
# Data preparation
x_train = dataT_cycles[0].values[:100]
x_test = dataT_cycles[0].values[101:198]
x_train.shape
x_test.shape
original_dim = x_train[0].shape[0]

In [37]:
# network parameters
input_shape = (original_dim, )
intermediate_dim = 9
batch_size = 10
latent_dim = 5
epochs = 50

In [38]:
# Sampling function
# reparameterization trick
# instead of sampling from Q(z|X), sample eps = N(0,I)
# z = z_mean + sqrt(var)*eps
def sampling(args):
    """Reparameterization trick by sampling fr an isotropic unit Gaussian.

    # Arguments
        args (tensor): mean and log of variance of Q(z|X)

    # Returns
        z (tensor): sampled latent vector
    """
    z_mean, z_log_var = args
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    # by default, random_normal has mean=0 and std=1.0
    epsilon = K.random_normal(shape=(batch, dim))
    
    return z_mean + K.exp(0.5 * z_log_var) * epsilon


In [39]:
# VAE Model Encoder + Decoder 

# Building the Encoder
inputs = Input(shape=input_shape, name='encoder_input')
x = Dense(intermediate_dim, activation='relu')(inputs)
z_mean = Dense(latent_dim, name='z_mean')(x)
z_log_var = Dense(latent_dim, name='z_log_var')(x)

# use reparameterization trick to push the sampling out as input
# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

# instantiate encoder model
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
encoder.summary()

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 13)           0                                            
__________________________________________________________________________________________________
dense_5 (Dense)                 (None, 9)            126         encoder_input[0][0]              
__________________________________________________________________________________________________
z_mean (Dense)                  (None, 5)            50          dense_5[0][0]                    
__________________________________________________________________________________________________
z_log_var (Dense)               (None, 5)            50          dense_5[0][0]                    
____________________________________________________________________________________________

In [40]:
#build decoder model 
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
x = Dense(intermediate_dim, activation='relu')(latent_inputs)
outputs = Dense(original_dim, activation='sigmoid')(x)

# instantiate decoder model
decoder = Model(latent_inputs, outputs, name='decoder')
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
z_sampling (InputLayer)      (None, 5)                 0         
_________________________________________________________________
dense_6 (Dense)              (None, 9)                 54        
_________________________________________________________________
dense_7 (Dense)              (None, 13)                130       
Total params: 184
Trainable params: 184
Non-trainable params: 0
_________________________________________________________________


In [41]:
#instantiate VAE model
outputs = decoder(encoder(inputs)[2])
vae = Model(inputs, outputs, name='vae_mlp')

In [42]:
def main(args):
    parser = argparse.ArgumentParser()
    help_ = "Load h5 model trained weights"
    parser.add_argument("-w", "--weights", help=help_)
    help_ = "Use mse loss instead of binary cross entropy (default)"
    parser.add_argument("-m", "--mse", help=help_, action='store_true')
    
    models = (encoder, decoder)
    data = (x_test, None)
    
    # VAE loss = mse_loss or xent_loss + kl_loss
    if args.mse:
        reconstruction_loss = mse(inputs, outputs)
    else:
        reconstruction_loss = binary_crossentropy(inputs, outputs)
        
    reconstruction_loss *= original_dim
    kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
    kl_loss = K.sum(kl_loss, axis= -1)
    kl_loss *= 0.5 
    vae_loss = K.mean(reconstruction_loss + kl_loss)
    vae.add_loss(vae_loss)
    vae.compile(optimizer='adam')
    vae.summary()
    
    if args.weights:
        vae.load_weights(args.weights)
    else:
        # Train the autoencoder
        vae.fit(x_train, epochs=epochs, batch_size= batch_size, validation_data=(x_test, None))
        vae.save_weights('vae_mlp_CMAPSS.h5')

In [43]:
class Args:
    mse = None
    weights = None
    
args = Args()

if __name__ == '__main__':
    main(args)
    

Model: "vae_mlp"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_input (InputLayer)   (None, 13)                0         
_________________________________________________________________
encoder (Model)              [(None, 5), (None, 5), (N 226       
_________________________________________________________________
decoder (Model)              (None, 13)                184       
Total params: 410
Trainable params: 410
Non-trainable params: 0
_________________________________________________________________
Train on 100 samples, validate on 91 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 3

In [65]:
# Once the VAE has been trained, we can use the encoder to sample the latent space

#predicting the latent space for first 13 observation values for machine 1
test = np.asarray(x_train[0:13])  

# indexing on 2 because the encoder predicts z_mean, z_log_var and sampled vector z (we are interested in z only)
latent_space = encoder.predict(test)[2]  

In [66]:
# Each list is a 5 dimension latent state space for that observation value
latent_space

array([[ 27.251003, -20.169312,  22.938713, -29.368395,  19.887693],
       [ 28.004028, -20.717808,  23.52812 , -30.190313,  20.401749],
       [ 26.674105, -19.824026,  22.48051 , -28.758377,  19.549667],
       [ 26.771925, -19.89632 ,  22.58438 , -28.8585  ,  19.634975],
       [ 27.21871 , -20.129292,  22.902697, -29.331566,  19.845766],
       [ 25.162264, -18.688475,  21.239565, -27.113237,  18.45327 ],
       [ 27.825731, -20.64691 ,  23.433723, -29.999489,  20.355967],
       [ 25.292551, -18.82446 ,  21.323738, -27.268307,  18.56439 ],
       [ 25.6682  , -18.988995,  21.635544, -27.64892 ,  18.744232],
       [ 26.223043, -19.250334,  22.048712, -28.224167,  18.99528 ],
       [ 26.14768 , -19.279438,  22.037632, -28.152092,  19.042107],
       [ 25.701624, -19.079872,  21.717144, -27.687557,  18.856134],
       [ 26.871473, -19.978725,  22.61129 , -28.98185 ,  19.678198]],
      dtype=float32)

In [67]:
# Reconstruct the raw observation from the learned latent space 
sample = decoder.predict(latent_space)

In [70]:
# Compare the with the real x_train value
x_train[0]

array([0.18373494, 0.40680183, 0.72624799, 0.24242424, 0.109755  ,
       0.36904762, 0.63326226, 0.20588235, 0.1996078 , 0.36398615,
       0.33333333, 0.71317829, 0.7246617 ])

In [69]:
# Right now they are not same as we trained the VAE on very less amount of data 
sample[0]

array([0.31233034, 0.32204977, 0.68397444, 0.24470285, 0.13417616,
       0.27928537, 0.69099057, 0.25210077, 0.17723036, 0.3619346 ,
       0.32728148, 0.63931286, 0.6711073 ], dtype=float32)

# Using HMM to find out transitional matrices

Here we first define transmatrix as [[0.5, 0.5, 0.0, 0.0],[0.0, 0.5, 0.5, 0.0],[0.0, 0.0, 0.5, 0.5],[0.0,0.0,0.0,1.0]] which means there is half chance for each state to go to next state and half to remain in the current state itself.

Then we train for each unit for transmatrix as well as state means and we will take average of each unit transmatrices and states as the transmatrix and state

*Note*: Here state '0' means the perfect health and '3' means weakest health 

In [31]:
lr = hmm.GaussianHMM(n_components=4, covariance_type="diag",init_params="cm", params="mtc")
lr.startprob_ = np.array([1.0, 0.0, 0.0, 0.0])
transmats = []
statemeans = []
for i in range(100):
    lr.transmat_ = np.array([[0.5, 0.5, 0.0, 0.0],[0.0, 0.5, 0.5, 0.0],[0.0, 0.0, 0.5, 0.5],[0.0,0.0,0.0,1.0]])
    lr.fit(dataT_cycles[i])
    transmat = lr.transmat_
    transmats.append(transmat)
    statemeans.append(lr.means_)

In [30]:
lr = hmm.GMMHMM(n_components=4, n_mix=4, covariance_type="diag",init_params="cm", params="mt")
lr.startprob_ = np.array([1.0, 0.0, 0.0, 0.0])
transmats = []
statemeans = []
for i in range(100):
    lr.transmat_ = np.array([[0.5, 0.5, 0.0, 0.0],[0.0, 0.5, 0.5, 0.0],[0.0, 0.0, 0.5, 0.5],[0.0,0.0,0.0,1.0]])
    lr.fit(dataT_cycles[i])
    transmat = lr.transmat_
    transmats.append(transmat)
    statemeans.append(lr.means_)

  new_cov = new_cov_numer / new_cov_denom
  new_cov = new_cov_numer / new_cov_denom
  + np.dot(X ** 2, (1.0 / covars).T))
  + np.dot(X ** 2, (1.0 / covars).T))
  + np.dot(X ** 2, (1.0 / covars).T))


ValueError: mixture weights must sum up to 1

In [32]:
transmat = np.array(transmats).mean(axis=0)
statemean = np.array(statemeans).mean(axis=0)

In [33]:
transmat

array([[0.51835732, 0.48164268, 0.        , 0.        ],
       [0.        , 0.7002123 , 0.2997877 , 0.        ],
       [0.        , 0.        , 0.87145676, 0.12854324],
       [0.        , 0.        , 0.        , 0.98      ]])

In [10]:
pd.DataFrame(statemean)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.366748,0.3605,0.648286,0.24855,0.159554,0.315721,0.675591,0.266175,0.197184,0.374635,0.351366,0.605654,0.633334
1,0.436753,0.415611,0.571987,0.290103,0.197851,0.399325,0.588434,0.308757,0.228241,0.44589,0.427247,0.527394,0.550402
2,0.536725,0.506111,0.471711,0.360246,0.247191,0.525837,0.477683,0.375745,0.268967,0.550086,0.517278,0.435316,0.449778
3,0.679415,0.624325,0.330025,0.472313,0.315224,0.68962,0.312354,0.488682,0.325557,0.68827,0.639782,0.296632,0.302288


In [11]:
t_prob = np.array([transmat, transmat])

In [12]:
rewards = np.array([[100, 50, 0, -50],[-50, 0, 50, 100]])

In [13]:
e_prob = np.array([norm.pdf(statemean), norm.pdf(statemean)])

In [14]:
e_prob

array([[[0.3729949 , 0.37384322, 0.32333181, 0.38680786, 0.39389644,
         0.37954642, 0.31754041, 0.38505735, 0.39126148, 0.3719059 ,
         0.37506058, 0.3320907 , 0.3264447 ],
        [0.36265064, 0.36593308, 0.33873974, 0.38250316, 0.3912099 ,
         0.36836958, 0.33552258, 0.38037257, 0.38868521, 0.36119137,
         0.364143  , 0.34714575, 0.34286804],
        [0.34542646, 0.3509846 , 0.35693762, 0.37387749, 0.3869382 ,
         0.34743041, 0.35592727, 0.37175108, 0.38476971, 0.34292754,
         0.34898477, 0.362878  , 0.36056297],
        [0.31671883, 0.32829941, 0.37779757, 0.35683622, 0.3796059 ,
         0.31451413, 0.37994797, 0.35404071, 0.37835124, 0.31480677,
         0.32510764, 0.38177117, 0.38112516]],

       [[0.3729949 , 0.37384322, 0.32333181, 0.38680786, 0.39389644,
         0.37954642, 0.31754041, 0.38505735, 0.39126148, 0.3719059 ,
         0.37506058, 0.3320907 , 0.3264447 ],
        [0.36265064, 0.36593308, 0.33873974, 0.38250316, 0.3912099 ,
         

In [16]:
pd.DataFrame(e_prob[0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.372995,0.373843,0.323332,0.386808,0.393896,0.379546,0.31754,0.385057,0.391261,0.371906,0.375061,0.332091,0.326445
1,0.362651,0.365933,0.33874,0.382503,0.39121,0.36837,0.335523,0.380373,0.388685,0.361191,0.364143,0.347146,0.342868
2,0.345426,0.350985,0.356938,0.373877,0.386938,0.34743,0.355927,0.371751,0.38477,0.342928,0.348985,0.362878,0.360563
3,0.316719,0.328299,0.377798,0.356836,0.379606,0.314514,0.379948,0.354041,0.378351,0.314807,0.325108,0.381771,0.381125


In [17]:
# 0: no-repair, 1: repair
actions = ('0', '1')
# 0: failing, 1: low health, 2: good health, 3: perfect health
states = ('0', '1', '2', '3')

gamma = 0.95

In [18]:
"""
First we define an MDP. We also represent a policy
as a dictionary of {state: action} pairs, and a utility function as a
dictionary of {state: number} pairs. We then define the value_iteration
and policy_iteration algorithms."""


import random
import numpy as np
from collections import defaultdict

class MDP:

    """A Markov Decision Process, defined by an initial state, transition model,
    and reward function. We also keep track of a gamma value, for use by
    algorithms. The transition model is represented somewhat differently from
    the text. Instead of P(s' | s, a) being a probability number for each
    state/state/action triplet, we instead have T(s, a) return a
    list of (p, s') pairs. We also keep track of the possible states,
    terminal states, and actions for each state."""

    def __init__(self, init, actlist, terminals, transitions=None, reward=None, states=None, gamma=0.9):
        if not (0 < gamma <= 1):
            raise ValueError("An MDP must have 0 < gamma <= 1")

        # collect states from transitions table if not passed.
        self.states = states or self.get_states_from_transitions(transitions)
            
        self.init = init
        
        if isinstance(actlist, list):
            # if actlist is a list, all states have the same actions
            self.actlist = actlist

        elif isinstance(actlist, dict):
            # if actlist is a dict, different actions for each state
            self.actlist = actlist
        
        self.terminals = terminals
        self.transitions = transitions or {}
        if not self.transitions:
            print("Warning: Transition table is empty.")

        self.gamma = gamma

        self.reward = reward or {s: 0 for s in self.states}

        # self.check_consistency()

    def R(self, state):
        """Return a numeric reward for this state."""

        return self.reward[state]

    def T(self, state, action):
        """Transition model. From a state and an action, return a list
        of (probability, result-state) pairs."""

        if not self.transitions:
            raise ValueError("Transition model is missing")
        else:
            return self.transitions[state][action]

    def actions(self, state):
        """Return a list of actions that can be performed in this state. By default, a
        fixed list of actions, except for terminal states. Override this
        method if you need to specialize by state."""

        if state in self.terminals:
            return [None]
        else:
            return self.actlist

    def get_states_from_transitions(self, transitions):
        if isinstance(transitions, dict):
            s1 = set(transitions.keys())
            s2 = set(tr[1] for actions in transitions.values()
                     for effects in actions.values()
                     for tr in effects)
            return s1.union(s2)
        else:
            print('Could not retrieve states from transitions')
            return None

    def check_consistency(self):

        # check that all states in transitions are valid
        assert set(self.states) == self.get_states_from_transitions(self.transitions)

        # check that init is a valid state
        assert self.init in self.states

        # check reward for each state
        assert set(self.reward.keys()) == set(self.states)

        # check that all terminals are valid states
        assert all(t in self.states for t in self.terminals)

        # check that probability distributions for all actions sum to 1
        for s1, actions in self.transitions.items():
            for a in actions.keys():
                s = 0
                for o in actions[a]:
                    s += o[0]
                assert abs(s - 1) < 0.001

class POMDP(MDP):

    """A Partially Observable Markov Decision Process, defined by
    a transition model P(s'|s,a), actions A(s), a reward function R(s),
    and a sensor model P(e|s). We also keep track of a gamma value,
    for use by algorithms. The transition and the sensor models
    are defined as matrices. We also keep track of the possible states
    and actions for each state."""

    def __init__(self, actions, transitions=None, evidences=None, rewards=None, states=None, gamma=0.95):
        """Initialize variables of the pomdp"""

        if not (0 < gamma <= 1):
            raise ValueError('A POMDP must have 0 < gamma <= 1')

        self.states = states
        self.actions = actions

        # transition model cannot be undefined
        self.t_prob = transitions
        if not self.t_prob.any():
            print('Warning: Transition model is undefined')
        
        # sensor model cannot be undefined
        self.e_prob = evidences
        if not self.e_prob.any():
            print('Warning: Sensor model is undefined')
        
        self.gamma = gamma
        self.rewards = rewards

    def remove_dominated_plans(self, input_values):
        """
        Remove dominated plans.
        This method finds all the lines contributing to the
        upper surface and removes those which don't.
        """

        values = [val for action in input_values for val in input_values[action]]
        values.sort(key=lambda x: x[0], reverse=True)

        best = [values[0]]
        y1_max = max(val[1] for val in values)
        tgt = values[0]
        prev_b = 0
        prev_ix = 0
        while tgt[1] != y1_max:
            min_b = 1
            min_ix = 0
            for i in range(prev_ix + 1, len(values)):
                if values[i][0] - tgt[0] + tgt[1] - values[i][1] != 0:
                    trans_b = (values[i][0] - tgt[0]) / (values[i][0] - tgt[0] + tgt[1] - values[i][1])
                    if 0 <= trans_b <= 1 and trans_b > prev_b and trans_b < min_b:
                        min_b = trans_b
                        min_ix = i
            prev_b = min_b
            prev_ix = min_ix
            tgt = values[min_ix]
            best.append(tgt)

        return self.generate_mapping(best, input_values)

    def remove_dominated_plans_fast(self, input_values):
        """
        Remove dominated plans using approximations.
        Resamples the upper boundary at intervals of 100 and
        finds the maximum values at these points.
        """

        values = [val for action in input_values for val in input_values[action]]
        values.sort(key=lambda x: x[0], reverse=True)

        best = []
        sr = 100
        for i in range(sr + 1):
            x = i / float(sr)
            maximum = (values[0][1] - values[0][0]) * x + values[0][0]
            tgt = values[0]
            for value in values:
                val = (value[1] - value[0]) * x + value[0]
                if val > maximum:
                    maximum = val
                    tgt = value

            if all(any(tgt != v) for v in best):
                best.append(np.array(tgt))

        return self.generate_mapping(best, input_values)

    def generate_mapping(self, best, input_values):
        """Generate mappings after removing dominated plans"""

        mapping = defaultdict(list)
        for value in best:
            for action in input_values:
                if any(all(value == v) for v in input_values[action]):
                    mapping[action].append(value)

        return mapping

    def max_difference(self, U1, U2):
        """Find maximum difference between two utility mappings"""

        for k, v in U1.items():
            sum1 = 0
            for element in U1[k]:
                sum1 += sum(element)
            sum2 = 0
            for element in U2[k]:
                sum2 += sum(element)
        return abs(sum1 - sum2)

        
class Matrix:
    """Matrix operations class"""

    @staticmethod
    def add(A, B):
        """Add two matrices A and B"""

        res = []
        for i in range(len(A)):
            row = []
            for j in range(len(A[0])):
                row.append(A[i][j] + B[i][j])
            res.append(row)
        return res

    @staticmethod
    def scalar_multiply(a, B):
        """Multiply scalar a to matrix B"""

        for i in range(len(B)):
            for j in range(len(B[0])):
                B[i][j] = a * B[i][j]
        return B

    @staticmethod
    def multiply(A, B):
        """Multiply two matrices A and B element-wise"""

        matrix = []
        for i in range(len(B)):
            row = []
            for j in range(len(B[0])):
                row.append(B[i][j] * A[j][i])
            matrix.append(row)

        return matrix

    @staticmethod
    def matmul(A, B):
        """Inner-product of two matrices"""

        return [[sum(ele_a*ele_b for ele_a, ele_b in zip(row_a, col_b)) for col_b in list(zip(*B))] for row_a in A]

    @staticmethod
    def transpose(A):
        """Transpose a matrix"""
        
        return [list(i) for i in zip(*A)]


def pomdp_value_iteration(pomdp, epsilon=0.1):
    """Solving a POMDP by value iteration."""

    U = {'':[[0]* len(pomdp.states)]}
    count = 0
    while True:
        count += 1
        prev_U = U
        values = [val for action in U for val in U[action]]
        value_matxs = []
        for i in values:
            for j in values:
                value_matxs.append([i, j])

        U1 = defaultdict(list)
        for action in pomdp.actions:
            for u in value_matxs:
                u1 = Matrix.matmul(Matrix.matmul(pomdp.t_prob[int(action)], Matrix.multiply(pomdp.e_prob[int(action)], Matrix.transpose(u))), [[1], [1]])
                u1 = Matrix.add(Matrix.scalar_multiply(pomdp.gamma, Matrix.transpose(u1)), [pomdp.rewards[int(action)]])
                U1[action].append(u1[0])

        U = pomdp.remove_dominated_plans_fast(U1)
        # replace with U = pomdp.remove_dominated_plans(U1) for accurate calculations
        
        if count > 10:
            if pomdp.max_difference(U, prev_U) < epsilon * (1 - pomdp.gamma) / pomdp.gamma:
                return U



In [19]:
states

('0', '1', '2', '3')

In [20]:
pomdp = POMDP(actions, t_prob, e_prob, rewards, states, gamma)

In [21]:
utility = pomdp_value_iteration(pomdp, epsilon=3)
utility

defaultdict(list,
            {'0': [array([ 209.88702446,   90.76156902,  -34.01230831, -185.57933297])]})

# ADQRN

In [55]:
from __future__ import division

import gym
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [56]:
#These lines establish the feed-forward part of the network used to choose actions
inputs1 = tf.placeholder(shape=[1,13],dtype=tf.float32)
W = tf.Variable(tf.random_uniform([13,2],0,0.01))
Qout = tf.matmul(inputs1,W)
predict = tf.argmax(Qout,1)

#Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
nextQ = tf.placeholder(shape=[1,2],dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)

In [107]:
def get_state(obs):
    state = 0
    diff = 16
    for i in range(len(statemean)):
        stateDiff = obs - statemean[i]
        stateDiffVal = np.sqrt(np.mean(stateDiff**2))
        if stateDiffVal < diff:
            diff = stateDiffVal
            state = i
    return state

In [130]:
# Getting the next step after an action is done

def getStepDetails(i,j,action):
    unitData = dataT_cycles[i]
    d = False
    if action == 1:
        newJ = 0
    else:
        newJ = j+1
    obsNext = unitData.values[newJ]
    if newJ >= len(unitData) - 1:
        d = True
    s1 = get_state(obsNext)
    r1 = rewards[action][s1]
    return r1,newJ,s1,obsNext,d

In [196]:
# Set learning parameters
init = tf.global_variables_initializer()
y = gamma
e = 0.1
num_episodes = len(dataT_cycles)
#create lists to contain total rewards and steps per episode
jList = []
rList = []
D = np.empty([0,5]) # Replay memory
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        #Reset environment and get first new observation for new unit
        rAll = 0
        d = False
        j = 0
        k = 0
        unitData = dataT_cycles[i]
        #The Q-Network
        while j < len(unitData):
            #Choose an action by greedily (with e chance of random action) from the Q-network
            a,allQ = sess.run([predict,Qout],feed_dict={inputs1:unitData.values[j].reshape(1,13)})
            if np.random.rand(1) < e:
                a[0] = np.random.randint(0,2)
            #Get new state and reward from environment
            r,j,s1,o1,d = getStepDetails(i,j,a[0])
            D = np.vstack([D, [a[0],unitData.values[j-1].reshape(1,13),r,o1,s1]])
            if len(D) > 20:
                lastInd = np.random.randint(15,len(D))
                randomSample = D[lastInd-15:lastInd]
                finalO = D[lastInd,3].reshape(1,13)
                Reward = np.sum(D[lastInd-15:lastInd,2])
            else:
                finalO = o1.reshape(1,13)
                Reward = r
            # We take batch size of 15 (j in algorithm)
            #Obtain the Q' values by feeding the new state through our network
            Q1 = sess.run(Qout,feed_dict={inputs1:finalO})
            #Obtain maxQ' and set our target value for chosen action.
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0,a[0]] = Reward + y*maxQ1
            #Train our network using target and predicted Q values
            _,W1 = sess.run([updateModel,W],feed_dict={inputs1:unitData.values[j-1].reshape(1,13),nextQ:targetQ})
            rAll += r
            s = s1
            k += 1
            if d == True or k >= 1000:
                #Reduce chance of random action as we train the model.
                e = 1./((i/50) + 10)
                break
        jList.append(j)
        rList.append(rAll)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


# Prediction

In [197]:
a = dataT_cycles[5].values[160].reshape(-1,13)

In [198]:
W1

array([[ 2127.6558 ,  6986.8765 ],
       [ 2248.1982 ,  5232.41   ],
       [ 3317.4592 ,  3077.835  ],
       [ 4824.3174 ,   -45.76535],
       [ 3286.811  ,  2219.9832 ],
       [ 2962.1672 ,  4778.8364 ],
       [ 5146.6387 ,  1414.4443 ],
       [ 4606.688  , -1017.36865],
       [ 8425.082  ,  1503.2651 ],
       [ 2555.8647 ,  2504.4    ],
       [ 1596.5271 ,  3793.9429 ],
       [ 3868.3757 ,   737.1053 ],
       [ 3159.9434 ,  1868.2078 ]], dtype=float32)

In [199]:
np.dot(a,W1)

array([[17942.35635803, 16792.32532451]])