In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Sets the backend of matplotlib to the 'inline' backend.
#
# With this backend, the output of plotting commands is displayed inline within frontends like the Jupyter notebook,
# directly below the code cell that produced it.
# The resulting plots will then also be stored in the notebook document.
#
# More details: https://stackoverflow.com/questions/43027980/purpose-of-matplotlib-inline
%matplotlib inline

### Read the data in, and then show the first several rows

In [None]:
# Build the CSV path from the current working directory in a platform-independent way
# instead of concatenating a hard-coded '/' separator.
path = os.path.join(os.getcwd(), 'data.csv')
# The file is comma-separated and its first row (header=0) provides the column names.
data = pd.read_csv(path, sep=',', header=0)
data.head()

### Get a basic high-level statistical overview of the data

In [None]:
# Summary statistics (as produced by pandas `describe`) of the loaded data.
data.describe()

### Visualise the data in a Cartesian coordinate system

In [None]:
# Scatter plot of profit against population, drawn through the pandas plotting accessor.
data.plot.scatter(x='Population', y='Profit', figsize=(12, 8))

### Define the least squares cost function

Let's say we have a sample data set:

$$
(x_1, y_1),(x_2, y_2), (x_3, y_3)
$$

And we want to find out the coefficient matrix $\theta = \begin{pmatrix} \theta_0 & \theta_1\end{pmatrix}$ for the fitting equation:

$$
\begin{equation*}
y \> = \> \theta_0 \> + \> \theta_1x
\end{equation*}
$$

to minimize the sum of the squared fitting errors $\varepsilon_i$ of the samples, that is $\sum\limits_{i=1}^3\varepsilon_i^2$, where each $\varepsilon_i$ is defined by:

$$
\begin{cases}
y_1 = \theta_0 + \theta_1x_1 + \varepsilon_1 \\
y_2 = \theta_0 + \theta_1x_2 + \varepsilon_2 \\
y_3 = \theta_0 + \theta_1x_3 + \varepsilon_3
\end{cases}
$$

$$
\begin{aligned}
\sum\limits_{i=1}^3\varepsilon_i^2
&= [(\theta_0 + \theta_1x_1) - y_1]^2 + [(\theta_0 + \theta_1x_2) - y_2]^2 + [(\theta_0 + \theta_1x_3) - y_3]^2 \\
&= \left[\begin{pmatrix} 1 & x_1\end{pmatrix}\begin{pmatrix} \theta_0 \\ \theta_1\end{pmatrix} - y_1\right]^2
  + \left[\begin{pmatrix} 1 & x_2\end{pmatrix}\begin{pmatrix} \theta_0 \\ \theta_1\end{pmatrix} - y_2\right]^2
  + \left[\begin{pmatrix} 1 & x_3\end{pmatrix}\begin{pmatrix} \theta_0 \\ \theta_1\end{pmatrix} - y_3\right]^2 \\
&= \sum\limits_{i=1}^3\left[\begin{pmatrix} 1 & x_i\end{pmatrix}\theta^T - y_i\right]^2
\end{aligned}
$$

#### So we get two takeaways from the above calculations
1. The cost function itself.
2. We need to insert a column of all **1**s into the sample data so that the matrix calculations work out correctly (the 1 multiplies the intercept $\theta_0$).

In [None]:
# Define the least squares cost function.
def computeCost(X, y, theta):
    """Return the halved mean squared error of the linear model ``X @ theta.T``.

    Parameters
    ----------
    X : np.matrix or 2-D ndarray, shape (m, n)
        One row per sample, one column per coefficient.
    y : np.matrix or 2-D ndarray, shape (m, 1)
        Target value of each sample.
    theta : np.matrix or 2-D ndarray, shape (1, n)
        Coefficients as a row vector.

    Returns
    -------
    float
        ``sum((X @ theta.T - y) ** 2) / (2 * m)``.
    """
    # `@` is a matrix product for np.matrix and for plain 2-D ndarrays alike,
    # whereas `*` is only a matrix product for np.matrix.
    residuals = (X @ theta.T) - y
    # The extra 1/2 is purely for convenience: differentiating the squared error
    # for gradient descent produces a factor of 2 that cancels it.
    return np.sum(np.power(residuals, 2)) / (2 * len(X))

In [None]:
# Prepend the all-ones column that multiplies the intercept term.
# The guard keeps this cell re-runnable: DataFrame.insert raises ValueError
# when a column with the same name already exists.
if 'Ones' not in data.columns:
    data.insert(0, 'Ones', 1)

In [None]:
# Set X (training data) and y (target variable):
# every column but the last is a feature, the last column is the target.
cols = data.shape[1]
X = data.iloc[:, :-1]
y = data.iloc[:, -1:]

In [None]:
# Convert the data frames to numpy matrices.
# np.asarray accepts a DataFrame as well as an already-converted matrix, so this cell can be
# re-run safely (calling `.values` on a matrix would raise AttributeError).
X = np.matrix(np.asarray(X))
y = np.matrix(np.asarray(y))
# Start from all-zero coefficients: a (1, n) row vector with one entry per column of X.
theta = np.matrix(np.zeros((1, X.shape[1])))

In [None]:
# Check the matrix shapes up front: mismatched shapes are a common source of avoidable errors.
X.shape, y.shape, theta.shape

In [None]:
# Compute the cost for the initial coefficient matrix θ.
computeCost(X, y, theta)

In [None]:
def gradientDescent(X, y, theta, alpha, iters):
    """Fit linear-model coefficients with batch gradient descent.

    Parameters
    ----------
    X : np.matrix or 2-D ndarray, shape (m, n)
        One row per sample, one column per coefficient.
    y : np.matrix or 2-D ndarray, shape (m, 1)
        Target value of each sample.
    theta : np.matrix or 2-D ndarray, shape (1, n)
        Starting coefficients; the caller's object is never modified.
    alpha : float
        Learning rate (step size).
    iters : int
        Number of update steps to perform.

    Returns
    -------
    (theta, cost)
        ``theta`` holds the coefficients after the last step, shape (1, n).
        ``cost`` is an ndarray of length ``iters`` where ``cost[i]`` is the value
        of ``computeCost`` right after step ``i``.
    """
    m = len(X)
    cost = np.zeros(iters)

    for i in range(iters):
        # Residuals of the current fit, shape (m, 1).
        error = (X @ theta.T) - y
        # error.T @ X sums error * X[:, j] over all samples for every column j at once,
        # so all coefficients are updated simultaneously from the same residuals.
        # Rebinding `theta` to a fresh matrix (instead of writing into a shared buffer)
        # avoids any aliasing between the old and the new coefficients.
        theta = theta - (alpha / m) * (error.T @ X)
        cost[i] = computeCost(X, y, theta)

    return theta, cost

In [None]:
# Hyper-parameters: learning rate and number of gradient-descent iterations.
alpha, iters = 0.01, 1000

# Run gradient descent to find the coefficient matrix θ of the fitting equation;
# `cost` holds the cost recorded after every iteration.
g, cost = gradientDescent(X, y, theta, alpha=alpha, iters=iters)
g

In [None]:
# Cost of the fitted coefficients `g`, for comparison with the cost of the initial θ.
computeCost(X, y, g)

In [None]:
# Evaluate the fitted line on an evenly spaced grid spanning the observed populations.
x = np.linspace(data.Population.min(), data.Population.max(), 100)
f = g[0, 0] + g[0, 1] * x

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data.Population, data.Profit, label='Training Data')
# 'upper left' is the named form of matplotlib's legend location code 2.
ax.legend(loc='upper left')
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
# The trailing semicolon stops the notebook from echoing the Text object's repr.
ax.set_title('Predicted Profit vs. Population Size');

In [None]:
# Plot the cost recorded at each iteration to see how it evolves during training.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(np.arange(iters), cost, color='r')
ax.set_ylabel('Cost')
ax.set_xlabel('Iterations')
ax.set_title('Error vs. Training Epoch')