problems/pr2.tex

\documentclass{article}
% Include macros here
\input{macros}
\usepackage{fancyhdr}
%\include{macros}
\usepackage{pifont}

% Number of problem sheet
\newcounter{problemSheetNumber}
\setcounter{problemSheetNumber}{2}
\newcommand{\matlabprob}{\ding{100} \ }
\newcommand{\examprob}{\ding{80} \ }
%\setcounter{section}{\theproblemSheetNumber}  
%\renewcommand{\theparagraph}{(\thesection.\arabic{paragraph})}
\newcounter{problems}
\setcounter{problems}{0}
%\setlength{\parindent}{0cm}
\renewcommand{\problem}{\paragraph{(\theproblemSheetNumber.\theproblems)}\addtocounter{problems}{1}}

%\theoremstyle{remark}
%\newtheorem{problem}[problemSheetNumber]{}

\pagestyle{fancy}
\lhead{MATH36061}
\chead{Convex Optimization}
\rhead{October 9, 2016}

\begin{document} 
\begin{center}
{\Large {\bf Problem Sheet \theproblemSheetNumber}}
\end{center}

Problems in Part A will be discussed in class.
Problems in Part B come with solutions and should be tried at home. 


\section*{Part A}

\problem Let $\{\vct{x}_k\}_{k\geq 0}$ be a sequence of vectors in $\R^n$ and assume that this sequence converges linearly to some $\vct{x}^*\in \R^n$ with respect to a norm $\norm{\cdot}$,
\begin{equation*}
 \norm{\vct{x}_{k+1}-\vct{x}^*} \leq r\cdot \norm{\vct{x}_k-\vct{x}^*}, \quad k\geq 0,
\end{equation*}
for some constant $r\in (0,1)$. 
\begin{itemize}
 \item[(a)] Show that for $M=\norm{\vct{x}_0-\vct{x}^*}$, 
 \begin{equation*}
  \norm{\vct{x}_k-\vct{x}^*} \leq r^k \cdot M.
 \end{equation*}
\item[(b)] Let $\e>0$ be given. Show that if $N$ is an integer such that
\begin{equation*}
 N> \frac{1}{1-r}\left(\ln(M)+\ln\left(\frac{1}{\e}\right)\right),
\end{equation*}
then $\norm{\vct{x}_N-\vct{x}^*}\leq \e$. In words, $N$ iterations are sufficient to reach accuracy $\e$.
\item[(c)] Now assume that the sequence converges quadratically,
\begin{equation*}
 \norm{\vct{x}_k-\vct{x}^*}\leq C\norm{\vct{x}_k-\vct{x}^*}^2
\end{equation*}
for some constant $C>0$. Show that
\begin{equation*}
 \norm{\vct{x}_{k+1}-\vct{x}^*}\leq C^k \cdot M^{2^k}.
\end{equation*}
How many steps will guarantee a solution up to an error $\e$?
\end{itemize}

 \problem Consider the function
 \begin{equation*}
  f(x) = \sqrt{x^2+1}.
 \end{equation*}
Determine a value $\overline{x}\in \R$ such that
\begin{itemize}
 \item for $x_0<\overline{x}$, Newton's method converges to a minimizer,
 \item for $x_0>\overline{x}$, Newton's method does not converge.
\end{itemize}
What happens if $x_0=\overline{x}$ is chosen as starting point?
 
 \problem Describe a steepest descent method with respect to the norm 
 \begin{equation*}
  \norm{\vct{x}}_{\infty} = \max_{1\leq i\leq n} |x_i|.
 \end{equation*}
What are the descent directions? How can the best descent direction be found?
 
\section*{Part B}

\problem Iterative algorithms will never find the exact solution, so it is important to have suitable criteria for stopping an algorithm. Given a tolerance $\e>0$, consider the following candidates for stopping criteria:
\begin{itemize}
 \item[(a)] $\norm{\vct{x}_{k+1}-\vct{x}_k}<\e$;
 \item[(b)] $\norm{\nabla f(\vct{x}_k)}<\e$;
 \item[(c)] $k=100$.
\end{itemize}
Apply Newton's method for minimizing the function
\begin{equation*}
 f(x) = (x-1)^6
\end{equation*}
(in Python, MATLAB, or by hand) with each of the given stopping criteria. Which of these is the most efficient? In general, describe the benefits or disadvantages of each of these stopping criteria. 

\problem When implementing descent algorithms one often uses {\em backtracking} to select a good step length. Given a descent direction $\vct{p}_k$, one first tries the step length $1$ and then successively scales it down until the {\em sufficient decrease} condition (Lecture 4) is satisfied.

Fixing a decrease parameter $c\in (0,1/2)$ and a scaling parameter $s\in (0,1)$, backtracking works as follows.
\begin{itemize}
\item $\alpha=1$;
\item while $f(\vct{x}_k+\alpha \vct{p}_k) \geq f(\vct{x}_k)+\alpha \cdot c\cdot \ip{\vct{p}_k}{\nabla f(\vct{x}_k)}$: 
     $\displaystyle \alpha = \alpha\cdot s$;
\item $\displaystyle \vct{x}_{k+1} = \vct{x}_k+\alpha \vct{p}_k$
\end{itemize}

Consider the function
\begin{equation*}
 f(\vct{x}) = e^{x_1+3x_2-0.1}+e^{x_1-3x_2-0.1}+e^{-x_1-0.1}
\end{equation*}
on $\R^2$, with level sets given in the contour plot in Figure~\ref{fig:contour}.
Using the starting point $\vct{x}_0=(-1,0.7)^{\trans}$, plot the trajectory of
\begin{itemize}
 \item[(a)] Gradient descent with step length $0.1$;
 \item[(b)] Gradient descent with backtracking, using $c=0.1$ and $s=0.5$;
 \item[(c)] Newton's method.
\end{itemize}

\begin{figure}[h!]
 \centering
 \includegraphics[width=0.6\textwidth]{images/contour_cropped.pdf}
 \caption{Contour plot}\label{fig:contour}
\end{figure}

\problem (Logistic Regression.) Let $Y$ be a random variable that satisfies
 \begin{equation*}
  \Prob\{Y=1\} = p, \quad \Prob\{Y=0\} = 1-p,
 \end{equation*}
 where $\Prob\{\}$ denotes the probability of an event.
This random variable models an event with two possible outcomes. The {\em logistic model} for $p$ has the form
\begin{equation}\label{eq:logprob}\tag{1}
 p = \frac{e^{\ip{\vct{a}}{\vct{u}}+b}}{1+e^{\ip{\vct{a}}{\vct{u}}+b}},
\end{equation}
$\vct{u}\in \R^n$ is a vector of {\em explanatory variables}, and $\vct{a}\in \R^n, b\in \R$ are the {\em model parameters} that explain how $p$ depends on the variables. For example, the variable $Y$ could represent a choice in an election and the vector $\vct{u}$ encodes demographic information, or the variable $Y$ could indicate the presence of a disease and $\vct{u}$ encodes data about a patient.

Given vectors $\vct{u}_1,\dots,\vct{u}_m$ and corresponding observed outcomes $y_1,\dots,y_m$, we would like to estimate the parameters $\vct{a}$ and $b$. Assuming the observed outcomes $y_1=\cdots =y_k=1$ and $y_{k+1}=\cdots y_m=0$, the {\em log likelihood} function is defined as
\begin{equation}\label{eq:mlprod}\tag{2}
 \sum_{i=1}^k \ln(p_i) +\sum_{i=k+1}^m \ln(1-p_i),
\end{equation}
where $p_i$ is the probability~\eqref{eq:logprob} computed using $\vct{u}_i$,
and the goal is to find parameters $\vct{a},b$ that maximize this function.

\begin{itemize}
 \item[(a)] Show that the negative of the log likelihood function is given by
\begin{equation*}
 f(\vct{a},b) = -\sum_{i=1}^k (\ip{\mtx{a}}{\vct{u}_i}+b)+\sum_{i=1}^m\ln\left(1+e^{\ip{\vct{a}}{\vct{u}_i}+b}\right)
 \end{equation*}
 Show that the sum of convex functions and the composition of a convex function with a linear one are convex, and deduce that the log likelihood function is convex.
 \item[(b)] Consider the following table, in which the first row represents the hours of prepration and the second row whether an exam was passed (1) or not (0).
 \begin{figure}[h!]
 \centering
 \begin{tabular}{l|c|c|c|c|c|c|c|c|c|c|c|c|}
  Hours & 0.5 & 1 & 1.5 & 2 & 2.5 & 3 & 3 & 4.5 & 4 &4.5 & 4.75 & 5 \\
  \hline
  Pass  & 0   & 0 & 0   & 1 & 0   & 1 & 0   & 1 & 1 & 1  & 1    & 1
 \end{tabular}
\end{figure}
Find the (negative) log likelihood function $f(a,b)$. Solve the corresponding minimization problem 
\begin{equation*}
 \minimize f(a,b)
\end{equation*}
in two dimensions using either gradiend descent or Newton's method, and plot the resulting probability function~\eqref{eq:logprob}, giving the relationship of hours of study to probability of success.
\end{itemize}

\end{document}