# Machine Learning Exercise 1 - Linear Regression

## Linear regression with one variable

Task 0: Understanding how Gradient Descent works

In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

In [None]:
# data1: load the single-feature dataset. The file has no header row, so the
# two columns are named explicitly. This cell and the ex1data2 cell below both
# bind `data`; whichever ran last determines the dataset used afterwards.
data = pd.read_csv('ex1data1.txt', header=None, names=['Population', 'Profit'])
data.head()

In [None]:
# data2: load the housing dataset (Size, Bedrooms, Price). The file has no
# header row. Rebinds `data`, replacing the ex1data1 frame if that cell ran.
data = pd.read_csv('ex1data2.txt', header=None, names=['Size', 'Bedrooms', 'Price'])
data.head()

In [None]:
# (rows, columns) of the frame currently bound to `data`
data.shape

In [None]:
# data2: standardize every column of the frame (features and target alike) by
# subtracting its mean and dividing by its standard deviation.
data = (data - data.mean()) / data.std()
#data.head()

In [None]:
#data.describe()

In [None]:
# data1: scatter plot of the raw training data (Population vs. Profit).
ax = data.plot(kind='scatter', x='Population', y='Profit', title='Scatter plot of training data', figsize=(8,4), grid=True)
# Label the axes after the ex1data1 columns actually plotted, using the same
# wording as the prediction plot further down in this notebook.
ax.set_xlabel('Population of city in 10,000s')
ax.set_ylabel('Profit in $10,000s')

In [None]:
# data2: scatter plot of house size against price (Bedrooms is not shown).
ax = data.plot(kind='scatter', x='Size', y='Price', title='Scatter plot of training data', figsize=(8,4),grid=True);
ax.set_xlabel('House Size')
ax.set_ylabel('House Price')

## Gradient Descent

First, you create a function to compute the cost of a given solution (characterized by the parameters beta):

In [None]:
def compute_cost(X, y, beta):
    '''
    Squared-error cost of a linear model:

        J(beta) = 1 / (2 * n) * sum_i (x_i . beta - y_i) ** 2

    X:    (n, p) training examples, one per row (np.matrix or array-like);
          include a column of ones if the model has an intercept
    y:    n target values, as an (n, 1) matrix/array or a flat sequence
    beta: p coefficients, as a (1, p) matrix/array or a flat sequence
    OUTPUT:
    the cost J(beta) as a float
    '''
    # Work on plain float arrays so the result does not depend on whether the
    # caller passed np.matrix objects (where `*` is a matrix product) or
    # ordinary ndarrays (where `*` would be element-wise).
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    beta = np.asarray(beta, dtype=float).ravel()

    n = X.shape[0]
    residuals = X.dot(beta) - y  # prediction error for every example
    return np.sum(np.square(residuals)) / (2 * n)
    

In [None]:
def compute_cost(X, y, theta):
    '''
    Half the mean squared error of the linear model X * theta.T against y.

    Written for np.matrix inputs, where `*` is the matrix product:
    X is (n, p), theta is (1, p) and y is (n, 1).
    '''
    n_examples = len(X)
    errors = X * theta.T - y
    # Element-wise square of every prediction error, then the grand total.
    total_squared_error = np.sum(np.multiply(errors, errors))
    return total_squared_error / (2 * n_examples)

We store each example as a row in the X matrix. To take into account the intercept term (\beta0), we add an additional first column to X and set it to all ones. This allows us to treat \beta0 as simply another 'feature'.

In [None]:
# Prepend a constant column of ones at position 0 so the intercept is fitted
# like any other coefficient. This modifies `data` in place.
data.insert(0, 'beta zero', 1)

Now let's do some variable initialization

In [None]:
# data1: split the frame into features and target.
# Every column except the last is a feature (X); the last column is the
# target (y), kept two-dimensional so it converts to an (n, 1) matrix later.
cols = data.shape[1]
X = data.iloc[:, :-1]
y = data.iloc[:, -1:]

In [None]:
#data2
# NOTE(review): `0:cols-2` keeps only the ones column and Size and leaves out
# Bedrooms. Presumably intentional, so that the single-variable walkthrough
# (two betas) can be reused on the housing data — confirm.
cols = data.shape[1]
X = data.iloc[:,0:cols-2]
# The target is still the last column (Price).
y = data.iloc[:,cols-1:cols]

Now, you need to guarantee that X (training set) and y (target variable) are correct.

In [None]:
# First rows of the feature frame; the leading column should be the constant 1.
X.head()

In [None]:
# First rows of the target column.
y.head()

The cost function is expecting numpy matrices so we need to convert X and y before we can use them. We also need to initialize beta.

In [None]:
# compute_cost and gradient_descent rely on np.matrix semantics (`*` is the
# matrix product), so switch from DataFrames to matrices here.
X, y = np.matrix(X.values), np.matrix(y.values)
# Initial guess: a 1 x 2 row of zeros (intercept, slope).
beta = np.matrix([[0, 0]])

Here's what beta looks like.

In [None]:
# Display the initial parameter row.
beta

Let's take a quick look at the shape of our matrices.

In [None]:
# Sanity check: X's column count must equal beta's, and X and y must have the
# same number of rows for `X * beta.T - y` to be valid.
X.shape, beta.shape, y.shape

Now let's compute the cost for our initial solution (0 values for beta).

In [None]:
# Cost of the all-zero starting point, before any training.
compute_cost(X, y, beta)

Now, you are asked to define a function to perform gradient descent on the parameters beta

In [None]:
def gradient_descent(X, y, theta, alpha, iters):
    '''
    Fit linear-regression parameters with batch gradient descent.

    Every iteration applies the simultaneous update
        theta := theta - (alpha / n) * (X * theta.T - y).T * X
    and then records the cost of the updated parameters.

    X:     (n, p) np.matrix of training examples, one per row
    y:     (n, 1) np.matrix of target values
    theta: (1, p) initial parameters (left unmodified)
    alpha: learning rate
    iters: number of iterations
    OUTPUT:
    theta: learned parameters, a (1, p) float np.matrix
    cost:  a vector with the cost at each training iteration
    '''
    n    = X.shape[0]
    cost = np.zeros(iters)

    # Float copy: keeps the caller's matrix untouched and stops an
    # integer-typed initial guess from truncating the updates.
    theta = np.matrix(theta, dtype=float)

    for i in range(iters):
        residual = (X * theta.T) - y
        # (1, n) * (n, p) -> (1, p): the partial derivative for every
        # parameter at once, all computed from the same residual.
        gradient = (residual.T * X) / n
        theta = theta - alpha * gradient
        cost[i] = compute_cost(X, y, theta)

    return theta, cost

Initialize some additional variables - the learning rate alpha, and the number of iterations to perform

In [None]:
alpha = 0.01  # learning rate (step size of each update)
iters = 1500  # number of gradient-descent iterations

Now let's run the gradient descent algorithm to fit our parameters theta to the training set.

In [None]:
# Train starting from `beta`: g receives the learned parameter row and cost
# the cost recorded after each iteration.
g, cost = gradient_descent(X, y, beta, alpha, iters)
g

Finally we can compute the cost (error) of the trained model using our fitted parameters.

In [None]:
# Cost at the fitted parameters; compare with the cost of the all-zero start.
compute_cost(X, y, g)

Now let's plot the linear model along with the data to visually see how well it fits.

In [None]:
# data1 only: evaluate the fitted line across the observed Population range.
x = np.linspace(data.Population.min(), data.Population.max(), 100)
# g[0, 0] multiplies the constant ones column (intercept); g[0, 1] is the slope.
f = g[0, 0] + (g[0, 1] * x)

fig, ax = plt.subplots(figsize=(8,4))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data.Population, data.Profit, label='Training Data')
ax.legend(loc=2)  # loc=2 places the legend in the upper-left corner
ax.set_xlabel('Population of city in 10,000s')
ax.set_ylabel('Profit in $10,000s')
ax.set_title('Predicted Profit vs. Population Size')
ax.grid(True)

Looks pretty good! Remember that the gradient descent function also outputs a vector with the cost at each training iteration; we can plot it as well.

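Notice that the cost decreases at every iteration. Linear regression with a squared-error cost is a convex optimization problem, so with a sufficiently small learning rate gradient descent steadily approaches the global minimum.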

In [None]:
# Learning curve: the cost recorded after each gradient-descent iteration.
fig, ax = plt.subplots(figsize=(8,4))
ax.plot(np.arange(iters), cost, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
# A single positional argument sets only the lower y-limit.
# NOTE(review): 4.0 looks tuned for the ex1data1 cost range; with the
# standardized data2 frame the curve may lie entirely below it — confirm.
ax.set_ylim(4.0)
ax.set_title('Error vs. Training Epoch')
ax.grid(True)

Now, we will show a contour plot that presents beta0 against beta1 and the outcome of J. First, we set values for beta0 and beta1

In [None]:
# 100 evenly spaced candidate values per parameter; together they define the
# grid on which the cost surface is evaluated.
beta0_vals = np.linspace(-10, 10, 100)
beta1_vals = np.linspace(-1, 4, 100)

Now, initialize J values to a matrix of 0's

In [None]:
# j_vals[i, j] will hold the cost at (beta0_vals[i], beta1_vals[j]).
j_vals = np.zeros([len(beta0_vals), len(beta1_vals)])

In [None]:
# Evaluate the cost at every (beta0, beta1) grid point: rows of j_vals follow
# beta0_vals, columns follow beta1_vals.
for i, b0 in enumerate(beta0_vals):
    for j, b1 in enumerate(beta1_vals):
        t = np.matrix(np.array([b0, b1]))
        j_vals[i, j] = compute_cost(X, y, t)

In [None]:
# Contour lines of the cost over the grid. j_vals is transposed because
# contour expects Z indexed as [y, x], i.e. [beta1, beta0]. The 20 levels are
# log-spaced from 1e-2 to 1e3.
plt.contour(beta0_vals, beta1_vals, j_vals.T, np.logspace(-2, 3, 20));

In [None]:
# Same contours, with the parameters found by gradient descent marked as a point.
plt.scatter(g[0,0],g[0,1],)
plt.contour(beta0_vals, beta1_vals, j_vals.T, np.logspace(-2, 3, 20));

Now, in 3D

In [None]:
# Surface plot of the cost over the (beta0, beta1) grid.
beta0_mesh, beta1_mesh = np.meshgrid(beta0_vals, beta1_vals)
fig = plt.figure()
# Create the 3D axes with add_subplot: passing keyword arguments such as
# `projection` to Figure.gca() was deprecated in Matplotlib 3.4 and no longer
# works in current releases.
ax = fig.add_subplot(111, projection='3d')
# meshgrid returns arrays shaped (len(beta1_vals), len(beta0_vals)), hence the
# transpose of j_vals.
ax.plot_surface(beta0_mesh, beta1_mesh, j_vals.T);

## Linear regression with multiple variables

From now on, you will use the second dataset, i.e., ex1data2.txt. This is a housing price dataset with 2 variables (size of the house in square feet and number of bedrooms) and a target (price of the house). You are asked to use the techniques already applied to analyze that data set.

In [3]:
# Housing dataset for the multivariate part, kept in its own frame (`data2`).
# The file has no header row, so the column names are supplied here.
data2 = pd.read_csv('ex1data2.txt', header=None, names=['Size', 'Bedrooms', 'Price'])
data2.head()

Unnamed: 0,Size,Bedrooms,Price
0,2104,3,399900
1,1600,3,329900
2,2400,3,369000
3,1416,2,232000
4,3000,4,539900


For this task we add another pre-processing step - normalizing the features.

Notice that the features are on vastly different scales. A house will typically have 2-5 bedrooms, but may have anywhere from hundreds to thousands of square feet. If we use the features as they are in the dataset, the 'size' feature would be weighted too heavily and would end up dwarfing any contribution from the 'number of bedrooms' feature. To fix this, we need to do something called 'feature normalization'. That is, we need to adjust the scale of the features to level the playing field. One way to do this is by subtracting from each value in a feature the mean of that feature, and then dividing by the standard deviation.

In [4]:
# Standardize every column (the Price target included): subtract the column
# mean and divide by the column standard deviation.
data2 = (data2 - data2.mean()) / data2.std()
data2.head()

Unnamed: 0,Size,Bedrooms,Price
0,0.13001,-0.223675,0.475747
1,-0.50419,-0.223675,-0.084074
2,0.502476,-0.223675,0.228626
3,-0.735723,-1.537767,-0.867025
4,1.257476,1.090417,1.595389


Given that you were asked to implement both the cost function and gradient descent using matrix operations, your previous implementations will work just fine on the multivariate dataset. Hence, you now need to insert the 'ones' column as before and separate the X's and the y's.

Conduct the rest of this exercise by repeating the experiments conducted in the simple linear dataset...

In [None]:
# Prepend the constant ones column (position 0) for the intercept term; this
# modifies `data2` in place.
data2.insert(0, 'beta zero', 1)

In [None]:
# Variable initialization: split the frame into features and target.
# Every column except the last is a feature (X2); the last column is the
# target (y2), kept two-dimensional so it converts to an (n, 1) matrix.
cols2 = data2.shape[1]
X2 = data2.iloc[:, :-1]
y2 = data2.iloc[:, -1:]

In [None]:
# First rows of the features: ones column, Size, Bedrooms.
X2.head()

In [None]:
# First rows of the (standardized) Price target.
y2.head()

In [None]:
# compute_cost and gradient_descent rely on np.matrix semantics (`*` is the
# matrix product), so switch from DataFrames to matrices here.
X2, y2 = np.matrix(X2.values), np.matrix(y2.values)
# Initial guess: a 1 x 3 row of zeros (intercept plus one weight per feature).
beta = np.matrix([[0, 0, 0]])

In [None]:
# Before gradient descent: cost of the all-zero parameter row.
compute_cost(X2, y2, beta)

In [None]:
alpha = 0.01  # learning rate (step size of each update)
iters = 1500  # number of gradient-descent iterations

In [None]:
# Train on the housing data. This rebinds g and cost, replacing the results of
# the single-variable run.
g, cost = gradient_descent(X2, y2, beta, alpha, iters)
g

In [None]:
# After gradient descent: cost at the fitted parameters.
compute_cost(X2, y2, g)