# Tarea 1 <br/> CC6204 Deep Learning, Universidad de Chile  <br/> Hoja de respuestas partes 1 y 2 
## Nombre: Martin Cornejo Saavedra
Fecha sugerida para completar esta parte: 23 de marzo de 2018

In [0]:
# Install the required packages

from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Build the "<impl><version>-<abi>" tag that is substituted into the PyTorch
# wheel URL of the pip command below.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

# Accelerator segment of the wheel URL ('cu80' presumably selects the
# CUDA 8.0 build -- confirm against the PyTorch download index).
accelerator = 'cu80'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision
!pip install -q ipdb

# para manejar archivos de Drive

!pip install -U -q PyDrive
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

## Authenticate as the current Colab user
auth.authenticate_user()
# Give PyDrive the application-default credentials and build the Drive client.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

import glob
import torch
import numpy
import pdb


# Parte 1: Funciones de activación, derivadas y función de salida

## 1a) Funciones de activación

In [0]:
def relu(T):
    """Apply the rectified linear unit element-wise.

    Args:
        T: input tensor.

    Returns:
        A new tensor equal to ``T`` with every negative entry replaced by 0.
        ``T`` itself is not modified.
    """
    # clamp allocates a fresh tensor. A masked in-place assignment would
    # overwrite the caller's data, which breaks callers (such as celu) that
    # keep using their input after calling relu.
    return torch.clamp(T, min=0)

def sig(T):
    """Element-wise logistic sigmoid, ``1 / (1 + exp(-T))``."""
    exp_neg = torch.exp(T * -1)
    return (exp_neg + 1).reciprocal()

def swish(T, beta):
    """Swish activation: ``T * sig(beta * T)``, computed element-wise."""
    gate = sig(T * beta)
    return T * gate

def celu(T, alfa):
    """Apply the CELU activation element-wise.

    ``celu(x) = x`` for ``x >= 0`` and ``alfa * (exp(x / alfa) - 1)`` otherwise.

    Args:
        T: input tensor (left unmodified).
        alfa: scale of the negative branch; must be non-zero because the
            input is divided by it.

    Returns:
        A new tensor with the same shape as ``T``.
    """
    # Split T into its non-negative and non-positive parts without touching
    # T itself; each part is exactly 0 wherever the other one is active.
    positive = torch.clamp(T, min=0)
    negative = torch.clamp(T, max=0)
    # exp(0) - 1 == 0, so the exponential term vanishes where T >= 0 and the
    # two branches can simply be added.
    return positive + alfa * (torch.exp(negative / alfa) - 1)

## 1b) Derivando las funciones de activación


\begin{equation}
\frac{\partial\ \text{relu}(x)}{\partial x} =
\left\{
	\begin{array}{ll}
		1  & \mbox{si } x \geq 0 \\
		0  & \mbox{si } x < 0
	\end{array}
\right. 
\end{equation}
<br>

Dado $\sigma(x) = \text{sigmoid}(x) = \frac{1}{1 + e^{-x}}$, tenemos que:

\begin{eqnarray}
\frac{\partial\ \text{swish}(x, \beta)}{\partial x} & = \sigma (\beta x) + \beta x \cdot \sigma (\beta x)(1-\sigma (\beta x)) \\
& = \sigma (\beta x) + \beta x \cdot \sigma (\beta x) - \beta x \cdot \sigma (\beta x)^{2}  \\
&= \beta \cdot swish(x, \beta) + \sigma (\beta x)(1 - \beta \cdot swish(x, \beta))\\
\\
\frac{\partial\ \text{swish}(x, \beta)}{\partial \beta} & =  
x^2 \sigma (\beta x)(1 - \sigma (\beta x))\\
\end{eqnarray}
<br><br>

\begin{eqnarray}
\frac{\partial\ \text{celu}(x, \alpha)}{\partial x} & =  
\left\{
	\begin{array}{ll}
		1  & \mbox{si } x \geq 0 \\
		\exp\left(\frac{x}{\alpha}\right)  & \mbox{si } x < 0
	\end{array}
\right. \\
\\
\frac{\partial\ \text{celu}(x, \alpha)}{\partial \alpha} & = 
\left\{
	\begin{array}{ll}
		0  & \mbox{si } x \geq 0 \\
		\exp\left(\frac{x}{\alpha}\right)\left(1 - \frac{x}{\alpha}\right) - 1  & \mbox{si } x < 0
	\end{array}
\right. \\
\end{eqnarray}

## 1c) Softmax

Dada la función `softmax` sabemos que cada elemento de la secuencia $\text{softmax}(x_1,\ldots,x_n)$ tiene la forma

\begin{equation}
s_i = \frac{e^{x_i}}{\sum_{j=1}^{n}e^{x_j}}
\end{equation}

Luego, para cada elemento de la secuencia $\text{softmax}(x_1-M,\ldots,x_n-M)$ se tiene

\begin{equation}
s_i = \frac{e^{x_i-M}}{\sum_{j=1}^{n}e^{x_j-M}} = \frac{e^{-M}e^{x_i}}{\sum_{j=1}^{n}e^{-M}e^{x_j}} = \frac{e^{-M}e^{x_i}}{e^{-M}\sum_{j=1}^{n}e^{x_j}} = \frac{e^{x_i}}{\sum_{j=1}^{n}e^{x_j}}
\end{equation}

Demostrando que $\text{softmax}(x_1-M,\ldots,x_n-M) = \text{softmax}(x_1,\ldots,x_n)$.

In [0]:
# softmax normalizes along the last axis (each row of a 2-D tensor)
def softmax(T, dim=0, estable=True):
    """Softmax over the last axis of ``T``.

    Args:
        T: tensor of scores; for a 2-D tensor each row is normalized
            independently.
        dim: accepted for interface compatibility but not consulted --
            normalization always runs along the last axis, which is the
            behaviour existing 2-D callers get when they omit ``dim``.
        estable: when true, subtract each row's maximum before
            exponentiating. Softmax is invariant to that shift, and it
            keeps ``exp`` from overflowing on large scores.

    Returns:
        A tensor shaped like ``T`` whose entries along the last axis are
        non-negative and sum to 1.
    """
    last = T.dim() - 1
    if estable:
        row_max, _ = torch.max(T, last, keepdim=True)
        T = T - row_max  # rebinding only; the caller's tensor is untouched
    exps = torch.exp(T)
    return exps / torch.sum(exps, last, keepdim=True)

# Parte 2: Red neuronal y pasada hacia adelante (forward)

## 2a) Clase para red neuronal, 2b) Usando la GPU, 2c) Pasada hacia adelante

In [0]:
class FFNN():
    """Feed-forward network: two sigmoid hidden layers and a softmax output.

    The parameters are plain tensors stored as attributes:
    ``W_1``/``b_1`` (input -> hidden 1), ``W_2``/``b_2`` (hidden 1 ->
    hidden 2) and ``U``/``c_init`` (hidden 2 -> output).
    """

    # Names of the tensor attributes that gpu()/cpu() move between devices.
    _PARAM_NAMES = ('W_1', 'b_1', 'W_2', 'b_2', 'U', 'c_init')

    def __init__(self, F, l_h, l_a, C, params=None):
        """Create the network, either randomly initialized or from ``params``.

        Args:
            F: number of input features.
            l_h: hidden layer sizes; ``l_h[0]`` and ``l_h[1]`` are read when
                no ``params`` are supplied.
            l_a: activation names; stored but not consulted -- ``forward``
                always applies ``sig``.
            C: number of output classes.
            params: optional ``[[W_1, b_1], [W_2, b_2], [U, c]]``. When
                non-empty these tensors are used as-is and the sizes above
                are only recorded, not used to allocate anything.
        """
        # Recorded in both branches so the attributes always exist.
        self.F = F
        self.l_h = l_h
        self.l_a = l_a
        self.C = C

        if params is not None and len(params) > 0:
            self.W_1 = params[0][0]
            self.b_1 = params[0][1]
            self.W_2 = params[1][0]
            self.b_2 = params[1][1]
            self.U = params[2][0]
            self.c_init = params[2][1]
        else:
            self.W_1 = torch.randn(F, l_h[0])
            self.b_1 = torch.zeros(1, l_h[0])

            self.W_2 = torch.randn(l_h[0], l_h[1])
            self.b_2 = torch.zeros(1, l_h[1])

            self.U = torch.randn(l_h[1], C)
            self.c_init = torch.zeros(1, C)

    def _move_params(self, mover):
        """Replace every parameter tensor with ``mover(tensor)``."""
        for name in self._PARAM_NAMES:
            setattr(self, name, mover(getattr(self, name)))

    def gpu(self):
        """Move all parameters to the GPU; does nothing without CUDA."""
        if torch.cuda.is_available():
            self._move_params(lambda t: t.cuda())

    def cpu(self):
        """Move all parameters to the CPU."""
        self._move_params(lambda t: t.cpu())

    def forward(self, x):
        """Run a forward pass and return the softmax output for ``x``.

        When CUDA is available, ``x`` and (if needed) the parameters are
        moved to the GPU first, so the result lives on the GPU as well.

        Args:
            x: input tensor; it is multiplied by ``W_1`` with ``torch.mm``,
                so it must be 2-D with ``W_1.size(0)`` columns.
        """
        if torch.cuda.is_available():
            x = x.cuda()
            # Transfer the weights only when they are not on the GPU yet,
            # instead of re-issuing the transfer on every call.
            if not self.W_1.is_cuda:
                self.gpu()

        h_1 = sig(torch.mm(x, self.W_1) + self.b_1)
        h_2 = sig(torch.mm(h_1, self.W_2) + self.b_2)
        y = softmax(torch.mm(h_2, self.U) + self.c_init)

        return y

## 2d) Probando tu red con un modelo pre-entrenado

In [44]:
## Clone the course GitHub repository (uncomment on a fresh runtime)

#!git clone https://github.com/jorgeperezrojas/cc6204-DeepLearning-DCCUChile.git

# Work from the assignment's resource folder so the relative paths used by the
# following cells ("modelos/...", "data/...") resolve.
os.chdir("/content/cc6204-DeepLearning-DCCUChile/2018/tareas/tarea1/recursos/varita_magica")

data  modelos  varita.pdf


In [112]:
# Load the pre-trained parameters from text files into tensors, as
# [[W1, b1], [W2, b2], [U, c]] -- the layout FFNN expects for `params`.

local_download_path = "modelos/ejemplo"
params = [
    [
        torch.from_numpy(numpy.loadtxt(os.path.join(local_download_path, weights_file))),
        torch.from_numpy(numpy.loadtxt(os.path.join(local_download_path, bias_file))),
    ]
    for weights_file, bias_file in (
        ("W1.txt", "b1.txt"),
        ("W2.txt", "b2.txt"),
        ("U.txt", "c.txt"),
    )
]
#pdb.set_trace()


# Build the network from the known parameters and run one training example
# through it.

test_data_path = "data/train_set"
test_input = torch.from_numpy(
    numpy.loadtxt(os.path.join(test_data_path, "hechizo-0", "090.txt"))
).double().view(1, 4096)
# Only move the input to the GPU when one exists; an unconditional .cuda()
# raises on CPU-only runtimes.
if torch.cuda.is_available():
    test_input = test_input.cuda()

# The size arguments are placeholders: with `params` supplied, FFNN takes its
# tensors from there.
red_neuronal = FFNN(0, [], ['algo'], 10, params)
red_neuronal.forward(test_input)



Columns 0 to 5 
 9.9813e-01  1.5828e-03  2.7570e-04  6.2942e-10  4.2164e-10  3.0463e-07

Columns 6 to 9 
 1.3270e-12  1.0802e-08  1.2021e-05  2.3639e-11
[torch.cuda.DoubleTensor of size 1x10 (GPU 0)]

In [0]:
# Tu código visualizando los ejemplos incorrectos acá

# Parte 3: Más derivadas y back propagation

## 3a) Entropía Cruzada

In [1]:
def cross_ent_loss(Q, P, eps=1e-12):
    """Mean cross-entropy of predictions ``Q`` against targets ``P``.

    Computes ``-sum(P * log(Q)) / N`` where ``N = Q.size(0)``.

    Args:
        Q: predicted probabilities (the tensor the logarithm is taken of);
            its first dimension is used as the number of examples.
        P: target distribution, same shape as ``Q``.
        eps: lower bound applied to ``Q`` before the logarithm so that an
            exact 0 in ``Q`` yields a large finite penalty instead of
            ``-inf``/``nan``.

    Returns:
        The summed cross-entropy divided by ``Q.size(0)``.
    """
    log_q = torch.log(torch.clamp(Q, min=eps))
    # Cross-entropy multiplies the targets by log(Q) and negates the sum.
    return -torch.sum(P * log_q) / Q.size(0)

## 3b) Derivando la última capa

\begin{equation}
\begin{aligned}
\frac{\partial \mathcal{L}}{\partial u^{(L)}} &= \frac{\partial \mathcal{L}}{\partial \hat{y}} \cdot \frac{\partial \hat{y}}{\partial u^{(L)}} \\
\frac{\partial \mathcal{L}}{\partial \hat{y}} &= \frac{\partial CE(\hat{y}, y)}{\partial \hat{y}} = \frac{1}{N} \sum_i \frac{\partial CE(\hat{y}_i, y_i)}{\partial \hat{y}_i} = -\frac{1}{N} \sum_{i,x} \frac{y(x)}{\hat{y}(x)}
\end{aligned}
\end{equation}