## Linear Regression

---

### Import Relevant Libraries

In [14]:
import pandas as pd # for data handling
import numpy as np # for numerical operations

---

### Data

In [15]:
# Training observations, one (x, y) pair per row.
data_array = np.array([
    [-20.00, -69.00],
    [-10.00, -36.00],
    [-5.00, -20.00],
    [0.00, -8.00],
    [4.00, 6.00],
    [9.00, 21.00],
    [15.00, 35.00],
    [30.00, 81.00],
])
data = pd.DataFrame(data=data_array, columns=['X', 'Y']) # label the two array columns X and Y
print(data) # show the resulting table

      X     Y
0 -20.0 -69.0
1 -10.0 -36.0
2  -5.0 -20.0
3   0.0  -8.0
4   4.0   6.0
5   9.0  21.0
6  15.0  35.0
7  30.0  81.0


---

### Linear Regression (LR) Function
#### Prints:
- sorted training dataset
- optimal model parameters
- discrepancies between model predictions and actual values of training dataset
- mean squared error

#### Equations Implemented:
##### vector X containing all x values:
$$
X = \begin{bmatrix} x_0 & \dots & x_{n-1} \end{bmatrix}
$$
##### vector Y containing all y values:
$$
Y = \begin{bmatrix} y_0 & \dots & y_{n-1} \end{bmatrix}
$$
##### summation of all x values:
$$
\sum x_i = x_0 + x_1 + \dots + x_{n-1}
$$
##### summation of all y values:
$$
\sum y_i = y_0 + y_1 + \dots + y_{n-1}
$$
##### summation of all squared x values:
$$
\sum x_i^2 = x_0^2 + x_1^2 + \dots + x_{n-1}^2
$$
##### summation of all products of x and y values:
$$
\sum x_i y_i = x_0 y_0 + x_1 y_1 + \dots + x_{n-1} y_{n-1}
$$
##### optimal slope m:
$$
m = \frac{n \sum x_i y_i - \sum x_i \sum y_i}{n \sum x_i^2 - (\sum x_i)^2}
$$
##### optimal intercept b:
$$
b = \frac{\sum x_i^2 \sum y_i - \sum x_i \sum x_i y_i}{n \sum x_i^2 - (\sum x_i)^2}
$$
##### absolute error:
$$
|Err| = |y_i - \hat{y}_i|
$$
##### mean squared error:
$$
MSE = \frac{1}{n} \lVert \hat{y} - y \rVert_2^2 = \frac{1}{n} \sum_{i = 0}^{n - 1} (m \cdot x_i + b - y_i)^2
$$

---

### Function Implementation

In [16]:
def LR(data: pd.DataFrame) -> None:
    """Fit the line y = m*x + b to `data` by ordinary least squares and print a report.

    The report lists the training points sorted by x, the fitted slope and
    intercept (rounded to two decimals), the absolute discrepancy between each
    prediction and the actual y value of the same point, and the mean squared
    error (rounded to six decimals).

    Parameters
    ----------
    data : pandas.DataFrame
        Training set with numeric columns 'X' and 'Y'. Rows may be in any order.

    Returns
    -------
    None
        Every result is printed; nothing is returned.

    Raises
    ------
    ValueError
        If `data` has fewer than two distinct x values. The denominator
        n*sum(x^2) - (sum(x))^2 of the closed-form solution is then zero,
        so no unique line exists.
    """
    points = data[['X', 'Y']].to_numpy() # select by name so column order and extra columns cannot matter
    X = points[:, 0] # all x values, input order
    Y = points[:, 1] # all y values, input order
    n = len(points) # number of training points

    if n < 2 or np.all(X == X[0]): # the closed-form denominator below would be zero
        raise ValueError("LR needs at least two distinct x values to fit a line")

    # Sort the points by x once and use the sorted arrays everywhere below, so
    # that predictions, actual values and errors always refer to the same point.
    # The stable sort keeps tied x values in their input order.
    order = np.argsort(X, kind='stable')
    x_sorted = X[order]
    y_sorted = Y[order]

    # summations used by the closed-form least-squares solution
    sum_x = X.sum()
    sum_y = Y.sum()
    sum_x_squared = (X * X).sum()
    sum_x_times_y = (X * Y).sum()

    # optimal model parameters
    denominator = (n * sum_x_squared) - sum_x ** 2
    slope_model_parameters = ((n * sum_x_times_y) - (sum_x * sum_y)) / denominator
    intercept_model_parameters = ((sum_x_squared * sum_y) - (sum_x * sum_x_times_y)) / denominator

    slope_model_parameters_rounded = round(slope_model_parameters, 2)
    intercept_model_parameters_rounded = round(intercept_model_parameters, 2)

    # predictions, rounded to two decimals as they are displayed
    yhats = np.round(slope_model_parameters * x_sorted + intercept_model_parameters, 2)

    # discrepancies are taken against the displayed (rounded) predictions so
    # that every printed row is self-consistent: |y - yhat| == |Err|
    errs = np.round(np.abs(y_sorted - yhats), 2)

    # mean squared error uses the unrounded parameters
    residuals = slope_model_parameters * x_sorted + intercept_model_parameters - y_sorted
    MSE = round(np.mean(residuals ** 2), 6)

    print("\nThe training dataset:\n")
    for i, (x, y) in enumerate(zip(x_sorted, y_sorted)):
        print(f"x[{i}] = {x} y[{i}] = {y}")

    print("\nOptimal model parameters obtained by the program:\n")
    print(f"m = {slope_model_parameters_rounded}")
    print(f"b = {intercept_model_parameters_rounded}\n")

    print("The discrepancies between the model predictions and the actual values of the training dataset:\n")
    for i, (x, y, yhat, error) in enumerate(zip(x_sorted, y_sorted, yhats, errs)):
        print(f"x[{i}] = {x:.2f}, y[{i}] = {y:.2f}, yhat[{i}] = {yhat:.2f} |Err| = {error:.2f}")

    print(f"\nMean Squared Error = {MSE}")

---

In [17]:
# Fit the regression line to the sample data and print the report.
LR(data)


The training dataset:

x[0] = -20.0 y[0] = -69.0
x[1] = -10.0 y[1] = -36.0
x[2] = -5.0 y[2] = -20.0
x[3] = 0.0 y[3] = -8.0
x[4] = 4.0 y[4] = 6.0
x[5] = 9.0 y[5] = 21.0
x[6] = 15.0 y[6] = 35.0
x[7] = 30.0 y[7] = 81.0

Optimal model parameters obtained by the program:

m = 2.96
b = -7.27

The discrepancies between the model predictions and the actual values of the training dataset:

x[0] = -20.00, y[0] = -69.00, yhat[0] = -66.51 |Err| = 2.49
x[1] = -10.00, y[1] = -36.00, yhat[1] = -36.89 |Err| = 0.89
x[2] = -5.00, y[2] = -20.00, yhat[2] = -22.08 |Err| = 2.08
x[3] = 0.00, y[3] = -8.00, yhat[3] = -7.27 |Err| = 0.73
x[4] = 4.00, y[4] = 6.00, yhat[4] = 4.58 |Err| = 1.42
x[5] = 9.00, y[5] = 21.00, yhat[5] = 19.39 |Err| = 1.61
x[6] = 15.00, y[6] = 35.00, yhat[6] = 37.17 |Err| = 2.17
x[7] = 30.00, y[7] = 81.00, yhat[7] = 81.60 |Err| = 0.60

Mean Squared Error = 2.685692


---