In [6]:
import pandas as pd
import numpy as np
pd.set_option('max_rows', 500)
%matplotlib inline
%autosave 180

# Declare column names for all variables, dependent variable, and independant variables
cols = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration']
yCols = 'mpg'
xCols = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration']

# In order for the import statement to work correctly, the last three rows of the training set (the ones
# with "?" instead of mpg, must be removed from the text file, this has already been done with the text file
# provided in the zip
data = pd.DataFrame(pd.read_csv('linear_regression_data.txt', header=None, names=cols, delim_whitespace=True))

# Add testing samples
samples = pd.DataFrame({
    'cylinders': [4, 6, 4],
    'displacement': [95, 168, 98],
    'horsepower': [92, 96, 68],
    'weight': [2043, 2981, 2147],
    'acceleration': [19.1, 14.7, 18.3]
})
    
# Standardize training set and testing samples
sData = (data - data.mean()) / data.std()
sSam = (samples - data[xCols].mean())/ data[xCols].std()

# Assign dependent and independent variables
y = sData[yCols]
X = sData[xCols]

# Add column of ones to dependent variable to hold value for y intercept
ones = np.ones([X.shape[0],1])
X = np.concatenate((ones,X), axis=1)

# Create array of six values for theta and gradient (intercept, x1, x2, x3, x4, x5), initialize to zero
theta = np.zeros([1,6])
grad = np.zeros([6,1])

# Linear regression cost function
def costFunc(theta, X, y):
    cost = 0
    for m in range (len(X)):
        hTheta = X[m].dot(theta.T)
        cost += (1/2)*(np.square(hTheta-y[m]))
        
    return cost

# Derivitive of cost function, used in gradient descent
def costDerivitive(theta, grad, X, y):
    for m in range(len(X)):
        for n in range(len(grad)):
            grad[n] += (theta[0][n] - y[m])*X[m][n]
    return grad

# Value of alpha, number of iterations, and error        
alpha = 0.0002
iterations = 300
er = 0.1
print("alpha: ", alpha)
print("max iterations: ", iterations)
print("min error for convergence: ", er)

# Gradient descent function, iterates until delta(J) is less than error value, 
# max iterations are reached, or if J becomes too big
# Returns cost and theta values
def gradDescent(X, y, theta, alpha, iterations, er):
    cost = np.zeros(iterations)
    grad = np.zeros([6,1])
    m = len(X)
    converged = False
    iter = 0
    J = costFunc(theta, X, y)

    while not converged:
        grad = costDerivitive(theta, grad, X, y)
        
        for n in range(len(grad)):
            theta[0][n] = theta[0][n] - (1/2/m) * alpha * grad[n]
        cost[iter] = costFunc(theta, X, y)
        if(abs(J-cost[iter]) <= er):
            print("Reached convergence after ", iter, " iterations")
            converged = True
            
        J = cost[iter]
        iter += 1
        
        if(iter == iterations):
            print("max iterations reached")
            converged = True
        if(J > 1000):
            print("Cost is getting too big")
            converged = True
            
    return cost, theta

# Call gradient descent function
cost, theta = gradDescent(X, y, theta, alpha, iterations,er)

# Assign values of theta array to their linear regression values and print
theta0 = theta[0][0]
theta1 = theta[0][1]
theta2 = theta[0][2]
theta3 = theta[0][3]
theta4 = theta[0][4]
theta5 = theta[0][5]

print("Theta0: ", theta0)
print("Theta1: ", theta1)
print("Theta2: ", theta2)
print("Theta3: ", theta3)
print("Theta4: ", theta4)
print("Theta5: ", theta5)

Autosaving every 180 seconds
alpha:  0.0002
max iterations:  300
min error for convergence:  0.1
Reached convergence after  69  iterations
Theta0:  -2.4767312712172263e-16
Theta1:  -0.19220184130658569
Theta2:  -0.1993422803512905
Theta3:  -0.19287951780267984
Theta4:  -0.20616830627827387
Theta5:  0.1041793780877697


In [5]:
# Create array to hold predicted values
samplePredictions = []

# Assign test sample values to corresponding x variables and plug into regression equation
for i in range(len(samples)):
    x1 = sSam.iloc[i]['cylinders']
    x2 = sSam.iloc[i]['displacement']
    x3 = sSam.iloc[i]['horsepower']
    x4 = sSam.iloc[i]['weight']
    x5 = sSam.iloc[i]['acceleration']
    stdMpg = theta0 + (x1*theta1) + (x2*theta2) + (x3 * theta3) + (x4 * theta4) + (x5 * theta5)
    ans = pd.DataFrame({
        'mpg': stdMpg,
        'cylinders': x1,
        'displacement': x2,
        'horsepower': x3,
        'weight': [x4],
        'acceleration': [x5]
    })
    tmp = ans * data.std() + data.mean()
    samplePredictions.append(tmp)
# Print predicted mpg with accompyning independent variables
samplePredictions

[         mpg  cylinders  displacement  horsepower  weight  acceleration
 0  29.561168        4.0          95.0        92.0  2043.0          19.1,
          mpg  cylinders  displacement  horsepower  weight  acceleration
 0  23.462537        6.0         168.0        96.0  2981.0          14.7,
          mpg  cylinders  displacement  horsepower  weight  acceleration
 0  30.026994        4.0          98.0        68.0  2147.0          18.3]