In [2]:
import pandas as pd

In [3]:
def my_multivariate_linear_regression(df):
    """
    Fit a multivariate linear regression with the closed-form
    (normal equations) solution and return the fitted parameters.

    The input frame is NOT modified. A column of ones is prepended to
    the design matrix internally so the intercept is always the first
    returned value, wherever the target column sits in df.

    @params:
        df: a pandas dataframe with n columns. One column is named
            target; every other column is used as a feature. A column
            named ones, if present, is ignored (it is treated as a
            leftover intercept column rather than a feature).

    @returns:
        np.array of shape (1, k + 1): (intercept, slope_1, ..., slope_k)
        with the slopes in the same order as the feature columns of df.

    @raises:
        numpy.linalg.LinAlgError: if X^T X is singular (for example
            perfectly collinear features or an empty frame).
    """
    import numpy as np

    # Everything except the target (and a stale intercept column) is a feature.
    feature_columns = [c for c in df.columns if c not in ('target', 'ones')]

    features = df[feature_columns].to_numpy(dtype=float)
    y = df['target'].to_numpy(dtype=float)

    # Design matrix with the intercept column explicitly on the left.
    X = np.column_stack([np.ones(len(df)), features])

    # Normal equations: (X^T X) theta = X^T y. Solving the system is
    # numerically safer than forming the inverse explicitly.
    theta = np.linalg.solve(X.T @ X, X.T @ y)

    return theta.reshape(1, -1)
    


In [4]:
def popular_multivariate_linear_regression(df):
    """
    Fit a multivariate linear regression with scikit-learn's
    LinearRegression and return the fitted parameters.

    @params:
        df: a pandas dataframe with several columns with one column
            named target; every other column is used as a feature.
            A column named ones, if present, is ignored: the estimator
            fits its own intercept, so a constant column carries no
            information.

    @returns:
        np.array of shape (1, k + 1): (intercept, slope_1, ..., slope_k)
        with the slopes in the same order as the feature columns of df.
    """
    import numpy as np
    from sklearn.linear_model import LinearRegression

    # Select features explicitly instead of slicing off the last fitted
    # coefficient: that slice was only correct when a trailing 'ones'
    # column had been appended to df beforehand, and silently dropped a
    # real feature's slope otherwise.
    feature_columns = [c for c in df.columns if c not in ('target', 'ones')]

    reg = LinearRegression().fit(df[feature_columns], df['target'])

    return np.array([[reg.intercept_, *reg.coef_]])

In [31]:
def using_gradient_descent(df, alpha, max_epoch, tol=0.001, verbose=False):
    """
    Fit a multivariate linear regression with batch gradient descent
    and return the fitted parameters.

    The input frame is NOT modified. A column of ones is prepended to
    the design matrix internally so the intercept is always the first
    returned value, wherever the target column sits in df.

    WARNING: NOT VERY GOOD AS CHOOSING THE ALPHA CAN LEAD TO
    DIVERGENCE ISSUES AND IT ALWAYS CHANGES W.R.T THE DATA.

    @params:
        df: a pandas dataframe with one column named target; every
            other column is used as a feature. A column named ones,
            if present, is ignored (it is treated as a leftover
            intercept column rather than a feature).
        alpha: learning rate (step size) of each update.
        max_epoch: maximum number of updates. At least one update is
            always performed.
        tol: stop as soon as the half mean squared error drops below
            this value.
        verbose: when True, print the cost after every update.

    @returns:
        np.matrix of shape (k + 1, 1): intercept first, followed by one
        slope per feature column in the order they appear in df.
    """
    # Local import, matching the other regression helpers in this notebook,
    # so the function does not depend on a global `np` from another cell.
    import numpy as np

    feature_columns = [c for c in df.columns if c not in ('target', 'ones')]
    n_rows = df.shape[0]

    # Design matrix with the intercept column explicitly on the left.
    X = np.column_stack(
        [np.ones(n_rows), df[feature_columns].to_numpy(dtype=float)]
    )
    y = df['target'].to_numpy(dtype=float).reshape(-1, 1)

    theta = np.zeros((X.shape[1], 1))

    epoch = 0
    while True:
        epoch += 1

        # Gradient of the half mean squared error w.r.t. theta.
        residual = X @ theta - y
        theta = theta - (alpha / n_rows) * (X.T @ residual)

        # Cost evaluated at the freshly updated parameters.
        cost = 0.5 * np.sum(np.square(X @ theta - y)) / n_rows
        if verbose:
            print(cost)

        if cost < tol or epoch >= max_epoch:
            break

    # Callers historically received a column-vector np.matrix; keep that type.
    return np.asmatrix(theta)
        
