%% LyX 2.2.1 created this file. For more info, see http://www.lyx.org/.
%% Do not edit unless you really know what you are doing.
\documentclass[english,12pt]{article}
\usepackage[T1]{fontenc}
\usepackage[latin9]{inputenc}
\usepackage{mathtools}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{amssymb}
\makeatletter
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Textclass specific LaTeX commands.
\theoremstyle{plain}
\newtheorem{thm}{\protect\theoremname}[section]
\theoremstyle{definition}
\newtheorem{defn}[thm]{\protect\definitionname}
\theoremstyle{plain}
\newtheorem{prop}[thm]{\protect\propositionname}
\ifx\proof\undefined
\newenvironment{proof}[1][\protect\proofname]{\par
\normalfont\topsep6\p@\@plus6\p@\relax
\trivlist
\itemindent\parindent
\item[\hskip\labelsep\scshape #1]\ignorespaces
}{%
\endtrivlist\@endpefalse
}
\providecommand{\proofname}{Proof}
\fi
\theoremstyle{remark}
\newtheorem{rem}[thm]{\protect\remarkname}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% User specified LaTeX commands.
\usepackage[margin=1in]{geometry}
\makeatother
\usepackage{babel}
\providecommand{\definitionname}{Definition}
\providecommand{\propositionname}{Proposition}
\providecommand{\remarkname}{Remark}
\providecommand{\theoremname}{Theorem}
\begin{document}
\title{Math 525: Lecture 21}
\date{April 03, 2018}
\maketitle
So far, we have worked primarily with (stationary) Markov chains whose
transition matrices are ``constant''. In this lecture, we explore
the following question: what if we could ``control'' the transition
matrix? In this context, we will have a transition matrix $P(\pi)$
that depends on some quantity $\pi$ which we, the ``controller'',
get to choose.
\section{Markov decision processes}
For this lecture, our setting is as follows:
\begin{itemize}
\item $S=\{1,\ldots,m\}$ is a finite state space.
\item To each state $i$ in $S$ is associated a nonempty countable set
$\mathcal{A}_{i}$, which we can intuitively think of as the set of
``actions'' available at state $i$.
\end{itemize}
\begin{defn}
A \emph{stationary policy} $\pi_{0}$ is a function whose domain is
$S$ and which satisfies $\pi_{0}(i)\in\mathcal{A}_{i}$ for all $i$.
The set of all stationary policies is denoted $\Pi_{0}$.
\end{defn}
%
\begin{defn}
A \emph{randomized policy} $(\pi_{n})_{n\geq0}$ is a sequence in
which each $\pi_{n}(i)$ is a random variable satisfying
\begin{itemize}
\item $\pi_{n}(i)$ takes values in $\mathcal{A}_{i}$ a.s. and
\item $\{\pi_{n}(i)=a\}\in\mathcal{F}_{n}$ for each $a$ in $\mathcal{A}_{i}$.
\end{itemize}
The set of all randomized policies is denoted $\Pi$.
\end{defn}
For each state $i$ in $S$ and action $a$ in $\mathcal{A}_{i}$,
let $p_{i}(a)$ denote a nonnegative column vector satisfying $p_{i}(a)^{\intercal}e=1$,
where $e$ denotes the vector of ones (and $e_{j}$ below denotes the
$j$-th standard basis vector).
Given a randomized policy $\pi$, let $(X_{n}^{\pi})_{n\geq0}$ denote
a Markov chain satisfying
\[
\mathbb{P}(X_{n+1}^{\pi}=j\mid X_{n}^{\pi}=i)=p_{i}(\pi_{n}(i))^{\intercal}e_{j}.
\]
That is, the transition matrix at time $n$ is
\[
P(\pi_{n})=\begin{pmatrix}p_{1}(\pi_{n}(1))^{\intercal}\\
p_{2}(\pi_{n}(2))^{\intercal}\\
\vdots\\
p_{m}(\pi_{n}(m))^{\intercal}
\end{pmatrix}.
\]
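To make this row-by-row construction of $P(\pi_{n})$ concrete, here
is a minimal sketch in Python; the two states, the action names,
and the probabilities are hypothetical, chosen purely for illustration.
\begin{verbatim}
import numpy as np

# Hypothetical example: p[i][a] is the row p_i(a)^T, i.e., the
# distribution of the next state when action a is taken in state i.
p = {
    0: {"stay": np.array([0.9, 0.1]), "move": np.array([0.2, 0.8])},
    1: {"stay": np.array([0.3, 0.7]), "move": np.array([0.5, 0.5])},
}

def transition_matrix(pi0):
    """Stack the rows p_i(pi0(i))^T to form P(pi0)."""
    return np.vstack([p[i][pi0[i]] for i in sorted(p)])

pi0 = {0: "move", 1: "stay"}  # a stationary policy
P = transition_matrix(pi0)    # each row sums to one
\end{verbatim}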
Now, let $c:S\rightarrow\mathbb{R}$, $0\leq d<1$, and
\begin{equation}
J(i,\pi)=\mathbb{E}\left[\sum_{n\geq0}d^{n}c(X_{n}^{\pi})\middle|X_{0}^{\pi}=i\right].\label{eq:objective}
\end{equation}
We can think of
\begin{itemize}
\item $c(X_{n}^{\pi})$ as the cost incurred at time $n$ and
\item $d^{n}$ as a discount factor which attempts to capture the fact that
costs incurred in the ``future'' are not as bad as costs incurred
``today''.
\end{itemize}
Our objective is to pick $\pi$ so as to minimize $J(i,\pi)$. That
is, we are interested in the quantity
\begin{equation}
\boxed{v(i)=\inf_{\pi\in\Pi}J(i,\pi)}\label{eq:mdp}
\end{equation}
We call (\ref{eq:mdp}) a \emph{Markov decision process} (MDP).
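Before analyzing (\ref{eq:mdp}), note that (\ref{eq:objective}) is
easy to evaluate for a \emph{fixed} stationary policy $\pi_{0}$:
conditioning on the first transition (as in the next section) shows
that $J(\cdot,\pi_{0})$ solves the linear system $(I-dP(\pi_{0}))J=c$,
which has a unique solution since $d<1$. A minimal sketch, reusing
the hypothetical example above:
\begin{verbatim}
d, c = 0.9, np.array([1.0, 0.0])  # hypothetical discount and costs

def evaluate(pi0):
    """Solve (I - d P(pi0)) J = c for the cost J(., pi0)."""
    P = transition_matrix(pi0)
    return np.linalg.solve(np.eye(len(c)) - d * P, c)

J = evaluate(pi0)  # J[i] is the expected discounted cost from state i
\end{verbatim}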
\begin{prop}
$v(i)$ is bounded for each $i$.
\end{prop}
\begin{proof}
This is a trivial consequence of the discount factor being strictly
less than one:
\[
\left|v(i)\right|\leq\sum_{n\geq0}d^{n}\max_{j}\left|c(j)\right|=\frac{1}{1-d}\max_{j}\left|c(j)\right|.\qedhere
\]
\end{proof}
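For example, with $d=0.9$ and $\max_{j}\left|c(j)\right|=1$, the
bound gives $\left|v(i)\right|\leq1/(1-0.9)=10$ for every state $i$.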
\begin{rem}
We glossed over defining $\mathcal{F}_{n}$ earlier, so we return
to that now. Given a particular randomized policy $\pi$, we define
\[
\mathcal{F}_{n}=\sigma(X_{0}^{\pi},\ldots,X_{n}^{\pi}).
\]
This definition seems, at first glance, circular: $\pi_{n}$ appears
to depend on $\mathcal{F}_{n}$ and vice versa. However, if we look
a bit closer at the definition of $X_{n}^{\pi}$, we note that it
depends only on $\pi_{0},\ldots,\pi_{n-1}$. In light
of this, we can write $X_{n}^{\pi}\equiv X_{n}^{\pi_{0},\ldots,\pi_{n-1}}$.
Therefore,
\[
\mathcal{F}_{n}=\sigma(X_{0},X_{1}^{\pi_{0}},X_{2}^{\pi_{0},\pi_{1}},\ldots,X_{n}^{\pi_{0},\pi_{1},\ldots,\pi_{n-1}})
\]
and hence $\mathcal{F}_{n}$, and with it $\pi$, is well-defined.
\end{rem}
\section{Dynamic programming}
By the Markov property,
\begin{multline*}
J(i,\pi)=\mathbb{E}^{i}\left[c(X_{0}^{\pi})+\sum_{n\geq1}d^{n}c(X_{n}^{\pi})\right]=c(i)+d\mathbb{E}^{i}\left[\sum_{n\geq0}d^{n}c(X_{n+1}^{\pi})\right]\\
=c(i)+d\mathbb{E}^{i}\left[J(X_{1}^{\pi},(\pi_{n})_{n\geq1})\right]\geq c(i)+d\sum_{j}(P(\pi_{0}))_{ij}v(j)
\end{multline*}
where $\mathbb{E}^{i}$ denotes expectation conditional on $X_{0}^{\pi}=i$
and $\pi_{0}$ is the time-zero component of $\pi$ (which, being
$\sigma(X_{0}^{\pi})$-measurable, may be identified with a stationary
policy). Taking infima over $\pi\in\Pi$ on both sides of this inequality,
and noting that the right-hand side depends on $\pi$ only through
$\pi_{0}$,
\begin{equation}
v(i)\geq\inf_{\pi_{0}\in\Pi_{0}}\left\{ c(i)+d\sum_{j}(P(\pi_{0}))_{ij}v(j)\right\} .\label{eq:ldp}
\end{equation}
Now, fix $\epsilon>0$. For each $i$, let $\pi^{i}=(\pi_{n}^{i})_{n\geq0}$
be a randomized policy which satisfies
\[
v(i)\geq J(i,\pi^{i})-\epsilon.
\]
Let $\pi_{0}$ be an arbitrary stationary policy, and define a new
randomized policy $\pi^{\epsilon}=(\pi_{n}^{\epsilon})_{n\geq0}$
which follows $\pi_{0}$ for one step and thereafter follows the
$\epsilon$-optimal policy associated with whichever state the chain
lands in:
\[
\pi_{n}^{\epsilon}=\begin{cases}
\pi_{0} & \text{if }n=0\\
\sum_{i}\mathbf{1}_{\{X_{1}^{\pi_{0}}=i\}}\pi_{n-1}^{i} & \text{if }n>0.
\end{cases}
\]
Note that
\[
v(i)\leq J(i,\pi^{\epsilon})=c(i)+d\sum_{j}(P(\pi_{0}))_{ij}J(j,\pi^{j})\leq c(i)+d\sum_{j}(P(\pi_{0}))_{ij}v(j)+\epsilon.
\]
Since the stationary policy $\pi_{0}$ was arbitrary, we may take
the infimum over $\pi_{0}\in\Pi_{0}$ on the right-hand side to get
\begin{equation}
v(i)\leq\inf_{\pi_{0}\in\Pi_{0}}\left\{ c(i)+d\sum_{j}(P(\pi_{0}))_{ij}v(j)\right\} +\epsilon.\label{eq:udp}
\end{equation}
We can take $\epsilon\downarrow0$ and combine (\ref{eq:ldp}) and
(\ref{eq:udp}) to arrive at
\begin{equation}
v(i)=\inf_{\pi_{0}\in\Pi_{0}}\left\{ c(i)+d\sum_{j}(P(\pi_{0}))_{ij}v(j)\right\} .\label{eq:dp}
\end{equation}
The implications of this are amazing! We started out with an objective
function (\ref{eq:objective}) that was daunting: minimizing it would
require picking a stationary policy for each time $n$. However, we
were able to use the Markov property to reduce this to a ``local''
problem that only involves minimizing over all stationary policies
$\pi_{0}$. In fact, we can simplify (\ref{eq:dp}) even further.
First, we need some notation:
\[
\text{for a family }\{y_{\alpha}\}_{\alpha}\text{ of vectors in }\mathbb{R}^{m}\text{, }\inf_{\alpha}y_{\alpha}\text{ is the vector whose }i\text{-th entry is }\inf_{\alpha}(y_{\alpha})_{i}\text{ (and similarly for }\sup\text{)}.
\]
\begin{thm}[Dynamic programming]
Let $v=(v(1),\ldots,v(m))^{\intercal}$ and $c=(c(1),\ldots,c(m))^{\intercal}$
where $v(i)$ is the quantity defined by (\ref{eq:mdp}). Then,
\[
\boxed{\sup_{\pi_{0}\in\Pi_{0}}\left\{ \left(I-dP(\pi_{0})\right)v-c\right\} =0}
\]
\end{thm}
\begin{proof}
We can rewrite (\ref{eq:dp}) as
\begin{equation}
v=\inf_{\pi_{0}\in\Pi_{0}}\left\{ c+dP(\pi_{0})v\right\} .\label{eq:fixpoint}
\end{equation}
Subtracting $v$ from both sides and using $\inf_{\alpha}(-y_{\alpha})=-\sup_{\alpha}y_{\alpha}$,
\[
0=\inf_{\pi_{0}\in\Pi_{0}}\left\{ c+dP(\pi_{0})v-v\right\} =-\sup_{\pi_{0}\in\Pi_{0}}\left\{ \left(I-dP(\pi_{0})\right)v-c\right\} ,
\]
which is the desired result.
\end{proof}
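Equation (\ref{eq:fixpoint}) also suggests a way of computing $v$:
one can check that the map $v\mapsto\inf_{\pi_{0}\in\Pi_{0}}\left\{ c+dP(\pi_{0})v\right\} $
is a contraction with constant $d$ in the sup norm, so iterating
it from any starting point converges to its unique fixed point. Below
is a minimal sketch of this iteration (commonly called \emph{value
iteration}), reusing the hypothetical example above; it is offered
as an illustration, not as a method developed in this lecture.
\begin{verbatim}
def value_iteration(tol=1e-10):
    """Iterate v <- min_a { c + d P v } componentwise to a fixed point."""
    v = np.zeros(len(c))
    while True:
        # For each state i, minimize over the actions available there.
        v_new = np.array([
            min(c[i] + d * p[i][a] @ v for a in p[i]) for i in sorted(p)
        ])
        if np.max(np.abs(v_new - v)) < tol:
            return v_new
        v = v_new

v = value_iteration()  # approximates the value of the MDP
\end{verbatim}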
In fact, the situation is much more general than we have let on. We
can allow for more general discount factors and costs:
\[
J(i,\pi)=\mathbb{E}\left[\sum_{n\geq0}\left(\prod_{k=0}^{n-1}d(\pi_{k}(X_{k}^{\pi}),X_{k}^{\pi})\right)c(\pi_{n}(X_{n}^{\pi}),X_{n}^{\pi})\middle|X_{0}^{\pi}=i\right],
\]
where the empty product (the $n=0$ term) is taken to be one.
However, in this case, it is no longer necessarily the case that $v(\cdot)$
is bounded. When it is, the corresponding dynamic programming equation
is
\begin{equation}
\sup_{\pi_{0}\in\Pi_{0}}\left\{ \left(I-D(\pi_{0})P(\pi_{0})\right)v-c(\pi_{0})\right\} =0\label{eq:dp_general}
\end{equation}
where $c(\pi_{0})=(c(\pi_{0}(1),1),\ldots,c(\pi_{0}(m),m))^{\intercal}$
and
\[
D(\pi_{0})=\operatorname{diag}(d(\pi_{0}(1),1),\ldots,d(\pi_{0}(m),m))=\begin{pmatrix}d(\pi_{0}(1),1)\\
& d(\pi_{0}(2),2)\\
& & \ddots\\
& & & d(\pi_{0}(m),m)
\end{pmatrix}.
\]
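As a sanity check, if $d(a,i)\equiv d$ is constant and $c(a,i)=c(i)$
depends only on the state, then $D(\pi_{0})=dI$ and $c(\pi_{0})=c$,
and (\ref{eq:dp_general}) reduces to the boxed equation of the dynamic
programming theorem above.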
In light of this generality, the remainder of the lecture focuses
on (\ref{eq:dp_general}). In particular, we would like to know: if
an arbitrary vector $v$ satisfies (\ref{eq:dp_general}), is it necessarily
the value function (\ref{eq:mdp}) of the MDP? Moreover, can we use
(\ref{eq:dp_general}) to compute that value?
\end{document}