# 01-Basic & Logistic Regression

In [1]:
import time
import copy
import math
import pickle
import numpy as np
import matplotlib.pyplot as plt

## 1.벡터화(Vectorization)를 사용해야하는 이유.

딥러닝에서 사용하는 데이터들은 스칼라(단일 상수)보다는 벡터나 행렬과 같이 다차원 값을 많이 사용한다.

벡터의 내적을 계산할 때, 벡터가 가진 원소별로 곱하고 그 값들을 모두 더해야하는데,  
일반적인 반복문을 통해 내적 연산을 구현하면 너무 비효율적이다.

In [2]:
x1 = [9, 2, 5, 0, 0, 7, 5, 0, 0, 0, 9, 2, 5, 0, 0]
x2 = [9, 2, 2, 9, 0, 9, 2, 5, 0, 0, 9, 2, 5, 0, 0]

# Pure-Python dot product: multiply the vectors element by element and
# accumulate the products. (A dot product is only defined when both vectors
# have the same dimension.)
start_time = time.process_time()
result = 0
for u, v in zip(x1, x2):
    result += u * v
end_time = time.process_time()

# process_time() is in seconds; scale to milliseconds for display.
print(f"{result}, Computation Time : {1000 * (end_time - start_time)}ms")

278, Computation Time : 0.032601000000020974ms


파이썬에서 다차원 값을 이용한 계산은 Numpy 라이브러리를 많이 이용한다.  
계산과정을 위한 코드가 훨씬 간단해지고, 데이터의 크기가 커질수록 속도도 훨씬 더 빠르다.  
(아래 예시처럼 원소가 15개뿐인 작은 벡터에서는 실행 시간 차이가 거의 드러나지 않는다.)

In [3]:
x1 = [9, 2, 5, 0, 0, 7, 5, 0, 0, 0, 9, 2, 5, 0, 0]
x2 = [9, 2, 2, 9, 0, 9, 2, 5, 0, 0, 9, 2, 5, 0, 0]

# Time only the vectorized dot product; np.dot is handed the plain Python
# lists directly, so the whole computation is a single call.
# process_time() is in seconds, hence the factor of 1000 for milliseconds.
start_time = time.process_time()
result = np.dot(x1, x2)
end_time = time.process_time()
print(f"{result}, Computation Time : {1000 * (end_time - start_time)}ms")

278, Computation Time : 0.028043000000033125ms


$$
Sigmoid(x) = \sigma(x) = \frac{1}{1 + e^{-x}}
$$


다음과 같이 스칼라를 입력 받는 형태의 함수 정의는 타입에러로 인해 다차원 값을 계산할 수 없다.

In [4]:
## Scalar-only implementation: the input must be a single number.
## Equivalently, this is a function of one independent (scalar) variable.
def basic_sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + e^(-x)) of a scalar.

    Args:
        x: A real number (int or float).

    Returns:
        The sigmoid of ``x`` as a float in [0, 1].

    Raises:
        TypeError: If ``x`` does not support unary minus (e.g. a list).
    """
    # Negate first so that non-numeric inputs such as lists fail right here
    # with "bad operand type for unary -".
    neg_x = -x

    if x >= 0:
        # exp(-x) <= 1 on this branch, so it cannot overflow.
        return 1 / (1 + math.exp(neg_x))

    # For very negative x, math.exp(-x) raises OverflowError. Use the
    # algebraically equal form e^x / (1 + e^x), where e^x <= 1.
    e = math.exp(x)
    return e / (1 + e)

In [5]:
print(basic_sigmoid(10))

0.9999546021312976


In [6]:
## A list input such as x=[1,2,3] is not accepted, i.e. multi-dimensional inputs like vectors or matrices cannot be evaluated.
## An implementation in the form of a vector-valued function is needed.

x = [1, 2, 3]
# basic_sigmoid(x) ## -> TypeError: bad operand type for unary -: 'list'

이럴 때도 Numpy를 이용하는 것이 훨씬 효과적이다.  
벡터(또는 행렬)을 입력하고 `np.exp(-x)`와 같은 연산을 적용하면 모든 원소에 지수함수 $e^{-x}$가 적용된다.

$$ \text{For } x \in \mathbb{R}^n \text{,     } sigmoid(x) = sigmoid\begin{pmatrix}
    x_1  \\
    x_2  \\
    ...  \\
    x_n  \\
\end{pmatrix} = \begin{pmatrix}
    \frac{1}{1+e^{-x_1}}  \\
    \frac{1}{1+e^{-x_2}}  \\
    ...  \\
    \frac{1}{1+e^{-x_n}}  \\
\end{pmatrix}\tag{1} $$

In [7]:
## NumPy is used so that multi-dimensional data such as vectors and matrices can be given as input.
def sigmoid(x):
    """Apply the logistic sigmoid 1 / (1 + e^(-x)) element-wise.

    Args:
        x: A scalar, a NumPy array, or any array-like (e.g. a list) of real
            numbers.

    Returns:
        An array with the same shape as ``x`` holding the sigmoid of each
        element (a NumPy scalar when ``x`` is a scalar).
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # Integer/bool inputs are promoted so the arithmetic is done in floats.
        x = x.astype(float)

    # exp(-|x|) is always <= 1, so it never overflows. For x >= 0 this is
    # exactly exp(-x); for x < 0 it equals exp(x), and the sigmoid is
    # rewritten as e^x / (1 + e^x).
    decay = np.exp(-np.abs(x))
    s = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays untouched.
    return s[()]

In [8]:
x = np.array([1, 2, 3])
print(sigmoid(x))

[0.73105858 0.88079708 0.95257413]


- $\text{for } x \in \mathbb{R}^{1\times n} \text{,     }$

\begin{align*}
 softmax(x) &= softmax\left(\begin{bmatrix}
    x_1  &&
    x_2 &&
    ...  &&
    x_n  
\end{bmatrix}\right) \\&= \begin{bmatrix}
    \frac{e^{x_1}}{\sum_{j}e^{x_j}}  &&
    \frac{e^{x_2}}{\sum_{j}e^{x_j}}  &&
    ...  &&
    \frac{e^{x_n}}{\sum_{j}e^{x_j}} 
\end{bmatrix} 
\end{align*}

In [9]:
def softmax(x):
    """Compute the row-wise softmax of a 2-D input.

    Each row is treated as one vector of scores and normalized along axis 1
    so that its entries are positive and sum to 1.

    Args:
        x: Array-like of real numbers with at least two dimensions; the
            normalization runs along axis 1.

    Returns:
        A float array with the same shape as ``x``.
    """
    x = np.asarray(x)

    # Subtracting the per-row maximum leaves the softmax unchanged (the
    # common factor cancels in the ratio) but keeps every exponent <= 0, so
    # np.exp cannot overflow to inf and produce NaN.
    shifted = x - np.max(x, axis=1, keepdims=True)
    x_exp = np.exp(shifted)
    x_sum = np.sum(x_exp, axis=1, keepdims=True)

    return x_exp / x_sum

In [10]:
x = np.array([[9, 2, 5, 0, 0],
               [7, 5, 0, 0 ,0]])
print(softmax(x))

[[9.80897665e-01 8.94462891e-04 1.79657674e-02 1.21052389e-04
  1.21052389e-04]
 [8.78679856e-01 1.18916387e-01 8.01252314e-04 8.01252314e-04
  8.01252314e-04]]


## 2.Derivative

딥러닝에서의 학습이란 prediction과 ground-truth간 오차, loss를 계산하고 이를 Backpropagation함으로써  
NN이 가진 Trainable parameter를 업데이트하는 방식으로 진행된다.

여기서 "업데이트"라는 것은 해당 변수에 대한 손실값의 미분을 계산해 "어떤 방향으로 변수의 값을 조정해야 손실값이 감소할까?"를 알아내고,  
손실값이 감소하는 방향으로 변수의 값을 조정하는 것이다.

$$sigmoid\_derivative(x) = \sigma'(x) = \sigma(x) (1 - \sigma(x))$$

In [11]:
def sigmoid_derivative(x):
    """Evaluate the derivative of the sigmoid, sigma(x) * (1 - sigma(x)).

    Args:
        x: A scalar or NumPy array of real numbers.

    Returns:
        The element-wise derivative, with the same shape as ``x``.
    """
    activation = 1.0 / (1.0 + np.exp(-x))
    return activation * (1.0 - activation)

In [12]:
x = np.array([1, 2, 3])
print(sigmoid_derivative(x))

[0.19661193 0.10499359 0.04517666]


## 3.Logistic Regression

### 3-1.Download & Load Dataset

아래 코드를 터미널에서 실행해 데이터셋을 다운로드 받습니다.

    wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
    tar -xvzf cifar-10-python.tar.gz

In [13]:
def unpickle(files):
    """Load one or more pickled batch files and merge them into one dataset.

    Each file is expected to unpickle to a dict with the byte-string keys
    ``b'data'`` (a 2-D array, one row per image) and ``b'labels'`` (a
    sequence of labels).

    Args:
        files: Iterable of paths to the pickled batch files.

    Returns:
        A dict with ``'data'`` (all image rows stacked in file order) and
        ``'labels'`` (all labels in the same order), both NumPy arrays.
        Both are empty arrays when ``files`` is empty.
    """
    image_chunks = []
    labels = []
    for file in files:
        with open(file, 'rb') as fo:
            # SECURITY: pickle.load can execute arbitrary code embedded in the
            # file. Only load batches obtained from a trusted source.
            data_dict = pickle.load(fo, encoding='bytes')
        image_chunks.append(np.asarray(data_dict[b'data']))
        labels.extend(data_dict[b'labels'])

    # Concatenate whole per-file arrays instead of collecting every image row
    # in a Python list first; the stacked result is the same.
    if image_chunks:
        data = np.concatenate(image_chunks, axis=0)
    else:
        data = np.array([])

    return {'data': data, 'labels': np.array(labels)}

        
dataset_dir = "/home/pervinco/Datasets/cifar-10-batches-py"  # Change this to your own dataset location.

# Full paths of the five training batches and of the single test batch.
train_data_files = [
    f"{dataset_dir}/{file}"
    for file in ("data_batch_1",
                 "data_batch_2",
                 "data_batch_3",
                 "data_batch_4",
                 "data_batch_5")
]
test_data_file = [f"{dataset_dir}/test_batch"]

# Each call merges the listed batch files into a {'data', 'labels'} dict.
trainset = unpickle(train_data_files)
testset = unpickle(test_data_file)

### 3-2. Data Processing

In [14]:
def vec2img(images):
    """Convert flat 3072-element image vectors into (32, 32, 3) uint8 images.

    Every flat vector is stored channel-first: the first 1024 values are the
    red channel, the next 1024 the green channel and the last 1024 the blue
    channel, each channel being a 32x32 plane.

    Args:
        images: Array-like of shape (N, 3072), one flat image per row.

    Returns:
        A C-contiguous uint8 array of shape (N, 32, 32, 3).
    """
    # Split every row into its three 32x32 channel planes in one shot:
    # (N, 3072) -> (N, 3, 32, 32).
    batch = np.reshape(np.asarray(images), (-1, 3, 32, 32))
    # Move the channel axis last: (N, 3, 32, 32) -> (N, 32, 32, 3).
    batch = np.transpose(batch, (0, 2, 3, 1))
    # order="C" materializes the transposed view as a regular contiguous
    # array, matching what stacking the per-image results would produce.
    return batch.astype(np.uint8, order="C")

def cat_filter(labels, target=3):
    """Turn class labels into binary targets: 1 for ``target``, 0 otherwise.

    Args:
        labels: Array-like of class labels.
        target: The label value mapped to 1. Defaults to 3.

    Returns:
        An integer array of shape (1, N), where N is the number of labels.
        An empty input yields an array of shape (1, 0).
    """
    flat_labels = np.ravel(np.asarray(labels))
    # One vectorized comparison replaces the per-label Python loop.
    is_target = (flat_labels == target).astype(int)
    # Add a leading axis so the targets form a single row.
    return is_target[np.newaxis, :]

In [15]:
train_set_x_orig, train_set_y = vec2img(trainset["data"]), cat_filter(trainset["labels"])
test_set_x_orig, test_set_y = vec2img(testset["data"]), cat_filter(testset["labels"])

In [16]:
# The first axis counts the examples; the second is the image height/width.
m_train, num_px = train_set_x_orig.shape[:2]
m_test = test_set_x_orig.shape[0]

print("Number of training examples: m_train = " + str(m_train))
print("Number of testing examples: m_test = " + str(m_test))
print("Height/Width of each image: num_px = " + str(num_px))
print("Each image is of size: (" + str(num_px) + ", " + str(num_px) + ", 3)")
print("train_set_x shape: " + str(train_set_x_orig.shape))
print("train_set_y shape: " + str(train_set_y.shape))
print("test_set_x shape: " + str(test_set_x_orig.shape))
print("test_set_y shape: " + str(test_set_y.shape))

Number of training examples: m_train = 50000
Number of testing examples: m_test = 10000
Height/Width of each image: num_px = 32
Each image is of size: (32, 32, 3)
train_set_x shape: (50000, 32, 32, 3)
train_set_y shape: (1, 50000)
test_set_x shape: (10000, 32, 32, 3)
test_set_y shape: (1, 10000)


In [17]:
# Flatten every image into one row, then transpose so that each column
# holds one example.
train_set_x_flatten = train_set_x_orig.reshape(len(train_set_x_orig), -1).transpose()
test_set_x_flatten = test_set_x_orig.reshape(len(test_set_x_orig), -1).transpose()

print("train_set_x_flatten shape: " + str(train_set_x_flatten.shape))
print("train_set_y shape: " + str(train_set_y.shape))
print("test_set_x_flatten shape: " + str(test_set_x_flatten.shape))
print("test_set_y shape: " + str(test_set_y.shape))

train_set_x_flatten shape: (3072, 50000)
train_set_y shape: (1, 50000)
test_set_x_flatten shape: (3072, 10000)
test_set_y shape: (1, 10000)


In [18]:
## Normalize

train_set_x = train_set_x_flatten / 255.
test_set_x = test_set_x_flatten / 255.

### 3-3. Build Computations & Training

For one example $x^{(i)}$:
$$z^{(i)} = w^T x^{(i)} + b \tag{1}$$
$$\hat{y}^{(i)} = a^{(i)} = sigmoid(z^{(i)})\tag{2}$$ 
$$ \mathcal{L}(a^{(i)}, y^{(i)}) =  - y^{(i)}  \log(a^{(i)}) - (1-y^{(i)} )  \log(1-a^{(i)})\tag{3}$$

The cost is then computed by averaging the loss over all $m$ training examples:
$$ J = \frac{1}{m} \sum_{i=1}^m \mathcal{L}(a^{(i)}, y^{(i)})\tag{6}$$

In [19]:
def sigmoid(z):
    """Apply the logistic sigmoid 1 / (1 + e^(-z)) element-wise.

    Note: this rebinds the name ``sigmoid`` that was already defined earlier
    in the notebook.

    Args:
        z: A scalar, a NumPy array, or any array-like of real numbers.

    Returns:
        An array with the same shape as ``z`` holding the sigmoid of each
        element (a NumPy scalar when ``z`` is a scalar).
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        # Integer/bool inputs are promoted so the arithmetic is done in floats.
        z = z.astype(float)

    # exp(-|z|) is always <= 1, so it never overflows. For z >= 0 this is
    # exactly exp(-z); for z < 0 it equals exp(z), and the sigmoid is
    # rewritten as e^z / (1 + e^z).
    decay = np.exp(-np.abs(z))
    s = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays untouched.
    return s[()]

def initialize_with_zeros(input_dim):
    """Create zero-initialized logistic-regression parameters.

    Args:
        input_dim: Number of features per input sample; the weight vector
            gets one row per feature.

    Returns:
        A tuple ``(w, b)``: ``w`` is a zero array of shape (input_dim, 1) and
        ``b`` is the float 0.0.
    """
    weights = np.zeros(shape=(input_dim, 1))
    bias = 0.0
    return weights, bias

Forward Propagation:
- You get X
- You compute $A = \sigma(w^T X + b) = (a^{(1)}, a^{(2)}, ..., a^{(m-1)}, a^{(m)})$
- You calculate the cost function: $J = -\frac{1}{m}\sum_{i=1}^{m}(y^{(i)}\log(a^{(i)})+(1-y^{(i)})\log(1-a^{(i)}))$

Here are the two formulas you will be using: 

$$ \frac{\partial J}{\partial w} = \frac{1}{m}X(A-Y)^T\tag{7}$$
$$ \frac{\partial J}{\partial b} = \frac{1}{m} \sum_{i=1}^m (a^{(i)}-y^{(i)})\tag{8}$$

In [20]:
def propagate(w, b, X, Y):
    """Run one forward/backward pass of logistic regression.

    Args:
        w: Weights, array of shape (num_features, 1).
        b: Bias, a scalar.
        X: Inputs, array of shape (num_features, m), one example per column.
        Y: Binary targets, array of shape (1, m).

    Returns:
        A tuple ``(grads, cost)``: ``grads`` is a dict with ``"dw"`` (same
        shape as ``w``) and ``"db"`` (scalar); ``cost`` is the mean binary
        cross-entropy over the ``m`` examples.
    """
    m = X.shape[1]

    # A = sigmoid(w^T X + b). For very negative pre-activations exp()
    # overflows to inf, which correctly gives A = 0; only the warning is
    # silenced.
    with np.errstate(over="ignore"):
        A = 1 / (1 + np.exp(-np.dot(w.T, X) - b))

    # Keep the activations strictly inside (0, 1) for the logarithms only.
    # Otherwise a saturated A of exactly 0 or 1 turns the cost into inf, or
    # into NaN through 0 * log(0).
    eps = 1e-15
    A_safe = np.clip(A, eps, 1 - eps)
    cost = -1/m * np.sum(Y * np.log(A_safe) + (1 - Y) * np.log(1 - A_safe))  # binary cross-entropy

    # The gradients use the unclipped activations.
    dw = 1/m * np.dot(X, np.transpose(A - Y))  # dJ/dw
    db = 1/m * np.sum(A - Y)  # dJ/db

    grads = {"dw": dw,
             "db": db}

    return grads, cost

In [21]:
w =  np.array([[1.], [2]])
b = 1.5
X = np.array([[1., -2., -1.], [3., 0.5, -3.2]])
Y = np.array([[1, 1, 0]])
grads, cost = propagate(w, b, X, Y)

print ("dw = " + str(grads["dw"]))
print ("db = " + str(grads["db"]))
print ("cost = " + str(cost))

dw = [[ 0.25071532]
 [-0.06604096]]
db = -0.1250040450043965
cost = 0.15900537707692405


In [22]:
def optimize(w, b, X, Y, num_iterations=100, learning_rate=0.009, print_cost=False):
    """Fit ``w`` and ``b`` with plain gradient descent.

    Args:
        w: Initial weights, array of shape (num_features, 1). Not modified.
        b: Initial bias, a scalar.
        X: Inputs, forwarded unchanged to ``propagate``.
        Y: Targets, forwarded unchanged to ``propagate``.
        num_iterations: Number of gradient-descent steps.
        learning_rate: Step size applied to both gradients.
        print_cost: If True, print the cost every 100 iterations.

    Returns:
        A tuple ``(params, grads, costs)``:
        ``params`` holds the final ``"w"`` and ``"b"``; ``grads`` holds the
        ``"dw"`` and ``"db"`` from the last ``propagate`` call; ``costs`` is
        the list of costs recorded at iterations 0, 100, 200, ...
    """
    # Work on copies so the caller's initial parameters stay untouched.
    w = copy.deepcopy(w)
    b = copy.deepcopy(b)

    costs = []
    grads = None
    for i in range(num_iterations):
        # Forward propagation and gradients at the current parameters.
        grads, cost = propagate(w, b, X, Y)

        # Update the trainable parameters.
        w = w - (learning_rate * grads["dw"])
        b = b - (learning_rate * grads["db"])

        # Record the cost every 100 training iterations.
        if i % 100 == 0:
            costs.append(cost)

            # Print the cost every 100 training iterations.
            if print_cost:
                print ("Cost after iteration %i: %f" %(i, cost))

    if grads is None:
        # The loop never ran (num_iterations <= 0): report the gradients at
        # the initial parameters instead of failing on undefined dw/db.
        grads, _ = propagate(w, b, X, Y)

    params = {"w": w,
              "b": b}

    grads = {"dw": grads["dw"],
             "db": grads["db"]}

    return params, grads, costs

In [23]:
params, grads, costs = optimize(w, b, X, Y, num_iterations=100, learning_rate=0.009, print_cost=False)

print ("w = " + str(params["w"]))
print ("b = " + str(params["b"]))
print ("dw = " + str(grads["dw"]))
print ("db = " + str(grads["db"]))
print("Costs = " + str(costs))

w = [[0.80956046]
 [2.0508202 ]]
b = 1.5948713189708588
dw = [[ 0.17860505]
 [-0.04840656]]
db = -0.08888460336847771
Costs = [0.15900537707692405]


### 3-4.Predictions

In [24]:
def predict(w, b, X):
    """Predict binary labels with learned logistic-regression parameters.

    Args:
        w: Weights, reshaped internally to (num_features, 1).
        b: Bias, a scalar.
        X: Inputs, array of shape (num_features, m), one example per column.

    Returns:
        A float array of shape (1, m) holding 1.0 where the predicted
        probability is above 0.5 and 0.0 elsewhere.
    """
    w = w.reshape(X.shape[0], 1)

    # For very negative pre-activations exp() overflows to inf, which
    # correctly gives a probability of 0; only the warning is silenced.
    with np.errstate(over="ignore"):
        A = 1 / (1 + np.exp(-(w.T @ X + b)))

    # Threshold every probability at once instead of looping over samples.
    return (A > 0.5).astype(float)

In [25]:
w = np.array([[0.1124579], [0.23106775]])
b = -0.3
X = np.array([[1., -1.1, -3.2],[1.2, 2., 0.1]])
print ("predictions = " + str(predict(w, b, X)))

predictions = [[1. 1. 0.]]


### 3-5. Merging to Black Box

In [26]:
def model(X_train, Y_train, X_test, Y_test, num_iterations=2000, learning_rate=0.5, print_cost=False):
    """Train a logistic-regression classifier and predict on both data splits.

    Chains ``initialize_with_zeros``, ``optimize`` and ``predict``.

    Args:
        X_train: Training inputs; ``X_train.shape[0]`` sets the weight size.
        Y_train: Training targets.
        X_test: Test inputs.
        Y_test: Test targets (only used for the accuracy printout).
        num_iterations: Number of optimization steps passed to ``optimize``.
        learning_rate: Step size passed to ``optimize``.
        print_cost: If True, ``optimize`` logs the cost and the train/test
            accuracies are printed afterwards.

    Returns:
        A dict with the recorded costs, the test and train predictions, the
        learned ``w`` and ``b``, and the two hyper-parameters used.
    """
    w_init, b_init = initialize_with_zeros(X_train.shape[0])
    learned, _, costs = optimize(w_init, b_init, X_train, Y_train, num_iterations, learning_rate, print_cost)
    w, b = learned['w'], learned['b']

    Y_prediction_test = predict(w, b, X_test)
    Y_prediction_train = predict(w, b, X_train)

    if print_cost:
        # Accuracy = 100 % minus the mean absolute prediction error in percent.
        train_accuracy = 100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100
        test_accuracy = 100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100
        print("train accuracy: {} %".format(train_accuracy))
        print("test accuracy: {} %".format(test_accuracy))

    return {"costs": costs,
            "Y_prediction_test": Y_prediction_test,
            "Y_prediction_train": Y_prediction_train,
            "w": w,
            "b": b,
            "learning_rate": learning_rate,
            "num_iterations": num_iterations}

In [27]:
logistic_regression_model = model(train_set_x, train_set_y, test_set_x, test_set_y, num_iterations=2000, learning_rate=0.005, print_cost=True)

Cost after iteration 0: 0.693147
Cost after iteration 100: 0.322767
Cost after iteration 200: 0.318619
Cost after iteration 300: 0.315850
Cost after iteration 400: 0.313723
Cost after iteration 500: 0.311992
Cost after iteration 600: 0.310541
Cost after iteration 700: 0.309304
Cost after iteration 800: 0.308237
Cost after iteration 900: 0.307307
Cost after iteration 1000: 0.306490
Cost after iteration 1100: 0.305766
Cost after iteration 1200: 0.305121
Cost after iteration 1300: 0.304543
Cost after iteration 1400: 0.304021
Cost after iteration 1500: 0.303549
Cost after iteration 1600: 0.303119
Cost after iteration 1700: 0.302725
Cost after iteration 1800: 0.302364
Cost after iteration 1900: 0.302031
train accuracy: 89.932 %
test accuracy: 89.96 %
