## Imports

In [1]:
from itertools import product

import pandas as pd
import numpy as np

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from scipy.optimize import minimize
from scipy.stats import norm

## Data Setup

In [2]:
# Load the 2018 world happiness data. `pd` comes from the Imports cell, so it
# is not imported a second time here.
df_happiness = (
    pd.read_csv('https://tinyurl.com/worldhappiness2018')
    .dropna()                                            # keep complete rows only
    .rename(columns = {'happiness_score': 'happiness'})  # shorter target name
    .filter(regex = '_sc|country|happ')                  # keep columns whose names match the pattern
)

## Prediction Error

MSE comparison between the two models

In [3]:
# Observed outcome that both candidate models are scored against
y = df_happiness['happiness']

# Model A: intercept at the smallest observed happiness, slope of 1
prediction = y.min() + 1 * df_happiness['life_exp_sc']
mse_model_A = ((y - prediction) ** 2).mean()

# Model B: intercept at the average happiness, slope of .5
prediction = y.mean() + .5 * df_happiness['life_exp_sc']
mse_model_B = ((y - prediction) ** 2).mean()

# Side-by-side table of the two mean squared errors
pd.DataFrame({
    'Model': ['A', 'B'],
    'MSE': [mse_model_A, mse_model_B]
})

Unnamed: 0,Model,MSE
0,A,5.086298
1,B,0.63756


## OLS

In [4]:
# reference fit from statsmodels, kept for later comparison
model_lr_happy = smf.ols(
    formula = 'happiness ~ life_exp_sc',
    data    = df_happiness
).fit()

def ols(par, X, y):
    """Mean squared error of a linear model, used as the OLS objective.

    par: intercept followed by one coefficient per column of X.
    X:   feature values without an intercept column (one is prepended here).
    y:   observed target values.

    Returns the mean of the squared differences between y and the predictions.
    """
    # prepend a column of 1s so par[0] acts as the intercept
    design = np.c_[np.ones(X.shape[0]), X]

    # residuals of the linear prediction (@ is matrix multiplication)
    residual = y - design @ par

    return np.mean(residual**2)

In [5]:
# `product` is imported in the Imports cell rather than mid-notebook.
# Grid of candidate parameters: every pairing of an intercept from
# np.arange(1, 7, 0.1) with a slope from np.arange(-1, 1, 0.1).
guesses = pd.DataFrame(
    product(
        np.arange(1, 7, 0.1),
        np.arange(-1, 1, 0.1)
    ),
    columns = ['b0', 'b1']
)

# Example for one guess
ols(
    par = guesses.iloc[0,:],
    X = df_happiness['life_exp_sc'],
    y = df_happiness['happiness']
)

23.77700449624871

In [6]:
# Evaluate the objective for every (b0, b1) pair. Only the two parameter
# columns are passed on, so the cell can be re-run after 'objective' has been
# added without handing a three-element parameter vector to ols().
guesses['objective'] = guesses[['b0', 'b1']].apply(
    lambda row: ols(
        par = row,
        X = df_happiness['life_exp_sc'],
        y = df_happiness['happiness']
    ),
    axis = 1
)

# Row(s) of the grid with the smallest objective value
min_loss = guesses[guesses['objective'] == guesses['objective'].min()]

min_loss

Unnamed: 0,b0,b1,objective
899,5.4,0.9,0.490675


In [7]:
# Same regression through the array interface; add_constant supplies the intercept column
model_lr_happy_life = sm.OLS(
    endog = df_happiness['happiness'],
    exog  = sm.add_constant(df_happiness['life_exp_sc'])
).fit()

# coefficient estimates together with the model's `scale` attribute
model_lr_happy_life.params, model_lr_happy_life.scale

(const          5.444832
 life_exp_sc    0.887796
 dtype: float64,
 0.4973994106686574)

## Optimization

In [8]:
# Let a general-purpose optimizer search for the parameters that minimize ols()
our_ols_optim = minimize(
    ols,
    np.array([1., 0.]),           # starting values: intercept, slope
    args = (
        df_happiness['life_exp_sc'].to_numpy(),
        df_happiness['happiness'].to_numpy()
    ),
    method  = 'BFGS',             # optimization algorithm
    tol     = 1e-6,               # tolerance
    options = dict(maxiter = 500) # max iterations
)

our_ols_optim

  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 0.48851727833528863
        x: [ 5.445e+00  8.878e-01]
      nit: 5
      jac: [-7.451e-09  0.000e+00]
 hess_inv: [[ 5.000e-01  3.539e-06]
            [ 3.539e-06  5.045e-01]]
     nfev: 21
     njev: 7

## Maximum Likelihood

Initial exploration

In [9]:
# two illustrative life expectancy scores: the mean (0) and 1 sd above it
life_expectancy = np.array([0, 1])

# happiness scores observed for those two cases
happiness = np.array([4, 5.2])

# expected happiness using rounded coefficients (intercept 5, slope 1)
mu = 5 + 1 * life_expectancy

# sigma is only a guess at this point
sigma = .5

# density of each observation under a normal centered at its prediction
L = norm(loc = mu, scale = sigma).pdf(happiness)
L

array([0.10798193, 0.22184167])

Main function

In [10]:
def max_likelihood(par, X, y):
    """Negative normal log-likelihood of a linear model, for use with a minimizer.

    par: par[0] is the log of the error sd; par[1:] are the intercept and
         the coefficients for the columns of X.
    X:   feature values without an intercept column (one is prepended here).
    y:   observed target values.

    Returns the negated sum of the per-observation log densities.
    """
    # prepend a column of 1s for the intercept
    design = np.c_[np.ones(X.shape[0]), X]

    # optimizing on the log scale keeps the sd positive after exp()
    sigma = np.exp(par[0])
    coefs = par[1:]

    # identity link: the linear predictor is the mean
    mu = design @ coefs

    # log density of every observation; negate the total for minimization
    log_density = norm.logpdf(y, loc = mu, scale = sigma)

    return -np.sum(log_density)

# Minimize the negative log-likelihood; starting values are ordered as
# (log sd, intercept, slope) to match max_likelihood's use of `par`
our_max_like = minimize(
    max_likelihood,
    np.array([1, 0, 0]),
    args = (
        df_happiness['life_exp_sc'].to_numpy(),
        df_happiness['happiness'].to_numpy()
    )
)

# estimated parameters
our_max_like.x

array([-0.35819022,  5.44483214,  0.88779604])

## Penalized Objectives

In [11]:
def ridge(par, X, y, lambda_ = 0):
    """Sum of squared errors plus an L2 penalty on every element of par.

    The penalty weight is called lambda_ because lambda is a reserved word
    in Python.

    par:     intercept followed by one coefficient per column of X.
    X:       feature values without an intercept column (one is prepended here).
    y:       observed target values.
    lambda_: weight on the sum of squared parameters (intercept included).
    """
    # prepend a column of 1s for the intercept
    design = np.c_[np.ones(X.shape[0]), X]

    # squared-error part of the objective
    residual = y - design @ par
    sse = np.sum(residual**2)

    # penalty part of the objective
    penalty = lambda_ * np.sum(par**2)

    return sse + penalty

# Ridge fit on every column except the target and the country label,
# with a penalty weight of 0.1
our_ridge = minimize(
    ridge,
    np.array([0, 0, 0, 0]),
    args = (
        df_happiness.drop(columns=['happiness', 'country']).to_numpy(),
        df_happiness['happiness'].to_numpy(),
        0.1
    )
)

In [12]:
# penalized estimates (the optimizer result exposes its keys as attributes)
our_ridge.x

array([ 5.439975  ,  0.52422716, -0.1053189 ,  0.43749604])

## Classification

### Misclassification Error

In [13]:
def misclassification_rate(par, X, y, class_threshold = .5):
    """Proportion of observations whose predicted class differs from y.

    par:             intercept followed by one coefficient per column of X.
    X:               feature values without an intercept column (one is prepended here).
    y:               observed classes coded as 0 / 1.
    class_threshold: probability above which an observation is predicted as class 1.

    Returns a value between 0 (every prediction correct) and 1 (every
    prediction wrong).
    """
    # add a column of 1s for the intercept
    X = np.c_[np.ones(X.shape[0]), X]

    # Calculate the 'linear predictor'
    mu = X @ par

    # Convert to a probability ('sigmoid' function)
    p = 1 / (1 + np.exp(-mu))

    # Convert to a class
    predicted_class = np.where(p > class_threshold, 1, 0)

    # Count mismatches instead of averaging y - predicted_class: with the
    # signed difference a false positive (-1) and a false negative (+1)
    # cancel each other, so the mean is not an error rate
    value = np.mean(np.asarray(y) != predicted_class)

    return value

### Log Loss

In [14]:
def log_loss(par, X, y):
    """Negative Bernoulli log-likelihood (log loss) of a logistic model.

    par: intercept followed by one coefficient per column of X.
    X:   feature values without an intercept column (one is prepended here).
    y:   observed classes coded as 0 / 1.

    Returns the summed log loss over all observations.
    """
    # add a column of 1s for the intercept
    X = np.c_[np.ones(X.shape[0]), X]
    y = np.asarray(y)

    # Calculate the linear predictor (the log odds)
    mu = X @ par

    # log likelihood. With p = 1 / (1 + exp(-mu)),
    #   y * log(p) + (1 - y) * log(1 - p) == y * mu - log(1 + exp(mu)),
    # and logaddexp(0, mu) evaluates log(1 + exp(mu)) without overflow, so a
    # saturated probability no longer turns the loss into nan
    ll = y * mu - np.logaddexp(0, mu)

    value = -np.sum(ll)

    return value

In [15]:
# Binary version of the target: 1 when happiness exceeds 5.5, otherwise 0
df_happiness_bin = df_happiness.assign(
    happiness = np.where(df_happiness['happiness'] > 5.5, 1, 0)
)

# Our own logistic regression: minimize the log loss over four parameters
model_logloss = minimize(
    fun  = log_loss,
    x0   = np.array([0, 0, 0, 0]),
    args = (
        df_happiness_bin[['life_exp_sc', 'corrupt_sc', 'gdp_pc_sc']],
        df_happiness_bin['happiness']
    )
)

# statsmodels binomial GLM on the same target and features
model_glm = smf.glm(
    formula = 'happiness ~ life_exp_sc + corrupt_sc + gdp_pc_sc',
    data    = df_happiness_bin,
    family  = sm.families.Binomial()
).fit()

# estimates from our own optimization
model_logloss.x

array([-0.16365245,  1.81715104, -0.46478325,  1.13108169])

## Gradient Descent

In [16]:
def gradient_descent(
    par, 
    X, 
    y, 
    tolerance = 1e-3, 
    maxit = 1000, 
    learning_rate = 1e-3
):
    """Fit a linear model by batch gradient descent on the sum of squared errors.

    par:           starting values, intercept first, then one per column of X.
    X:             feature values without an intercept column (one is prepended here).
    y:             observed target values.
    tolerance:     stop once the largest absolute coefficient change in an
                   update is no greater than this.
    maxit:         upper bound for the iteration counter, which starts at 1.
    learning_rate: step size applied to the gradient.

    Returns a dict with
      'par'    final coefficients,
      'loss'   sum of squared errors at the start and after every update,
      'MSE'    mean squared error of the final coefficients,
      'iter'   value of the iteration counter on exit (1 + number of updates),
      'fitted' fitted values of the final coefficients.
    """
    # add a column of 1s for the intercept
    X = np.c_[np.ones(X.shape[0]), X]

    # initialize; LP is defined up front so the outputs exist even when the
    # loop body never runs (e.g. maxit <= 1)
    beta = np.asarray(par, dtype = float)
    LP = X @ beta
    loss = [np.sum((LP - y)**2)]
    tol = 1
    n_iter = 1

    while (tol > tolerance and n_iter < maxit):
        grad = X.T @ (LP - y)
        beta_new = beta - learning_rate * grad
        tol = np.max(np.abs(beta_new - beta))
        beta = beta_new

        # refresh the fit right after the update so the loss trace, 'MSE' and
        # 'fitted' all describe the coefficients that are returned
        LP = X @ beta
        loss.append(np.sum((LP - y)**2))
        n_iter = n_iter + 1

    output = {
        'par': beta,
        'loss': np.array(loss),
        'MSE': np.mean((LP - y)**2),
        'iter': n_iter,
        'fitted': LP
    }

    return output

# Run gradient descent from all-zero starting values on the three features
our_gd = gradient_descent(
    np.zeros(4, dtype = int),
    df_happiness[['life_exp_sc', 'corrupt_sc', 'gdp_pc_sc']].to_numpy(),
    df_happiness['happiness'].to_numpy(),
    learning_rate = 1e-3
)

In [17]:
# coefficients found by gradient descent (intercept first, then one per feature column)
our_gd['par']

array([ 5.43691264,  0.52121949, -0.10746734,  0.43896778])

## SGD

In [18]:
def stochastic_gradient_descent(
    par, # parameter estimates
    X,   # model matrix
    y,   # target variable
    learning_rate = 1, # the learning rate
    stepsize_tau = 0,  # if > 0, a check on the LR at early iterations
    average = False    # a variation of the approach
):
    """Single-pass stochastic gradient descent for a linear model (adagrad steps).

    The rows are visited once in a fixed pseudo-random order and the
    coefficients are updated after every row.

    par:           starting values, intercept first, then one per column of X.
    X:             2-d feature array without an intercept column (one is added here).
    y:             observed target values, one per row of X.
    learning_rate: numerator of the per-parameter step size.
    stepsize_tau:  constant added to the step-size denominator.
    average:       accepted for interface compatibility; it currently has no
                   effect on the result.

    Returns a dict with
      'par'         final coefficients,
      'par_chain'   coefficients after each update (one row per visited observation),
      'MSE'         mean squared error of the final coefficients over all rows,
      'predictions' predictions of the final coefficients, in the row order of
                    the X that was passed in,
      'fits'        prediction for each observation just before its update, in visit order,
      'loss'        squared error of each of those predictions, in visit order.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_obs = X.shape[0]

    # shuffle the data; a private generator seeded with 1234 yields the same
    # permutation as np.random.seed(1234) followed by np.random.choice, but
    # leaves the global random state alone. The size comes from X itself
    # rather than from a data frame defined outside the function.
    rng = np.random.RandomState(1234)
    idx = rng.choice(n_obs, n_obs, replace = False)
    X = X[idx, :]
    y = y[idx]

    X = np.c_[np.ones(n_obs), X]
    beta = np.asarray(par, dtype = float)

    # Collect all estimates
    betamat = np.zeros((n_obs, beta.shape[0]))

    # Collect fitted values at each point
    fits = np.zeros(n_obs)

    # Collect loss at each point
    loss = np.zeros(n_obs)

    # adagrad per parameter learning rate adjustment
    s = 0

    # a smoothing term to avoid division by zero
    eps = 1e-8

    for i in range(n_obs):
        Xi = X[None, i, :]
        yi = y[i]

        # matrix operations not necessary here,
        # but makes consistent with previous gd func
        LP = Xi @ beta
        grad = Xi.T @ (LP - yi)
        s = s + grad**2 # adagrad approach

        # update
        beta = beta - learning_rate / \
            (stepsize_tau + np.sqrt(s + eps)) * grad

        betamat[i, :] = beta

        # LP is a one-element array; store its scalar explicitly because
        # implicit array-to-scalar assignment is deprecated in NumPy
        fits[i] = LP[0]
        loss[i] = np.sum((LP - yi)**2)

    LP = X @ beta
    lastloss = np.sum((LP - y)**2)

    # undo the shuffle so predictions line up with the caller's rows
    predictions = np.empty_like(LP)
    predictions[idx] = LP

    output = {
        'par': beta,          # final estimates
        'par_chain': betamat, # estimates at each iteration
        'MSE': lastloss / n_obs,
        'predictions': predictions,
        'fits': fits,         # per-step fitted values (visit order)
        'loss': loss          # per-step squared errors (visit order)
    }

    return output

# Feature matrix and target as plain numpy arrays
X = df_happiness[['life_exp_sc', 'corrupt_sc', 'gdp_pc_sc']].to_numpy()
y = df_happiness['happiness'].to_numpy()

# Start the intercept at the average happiness and every slope at zero
our_sgd = stochastic_gradient_descent(
    np.array([df_happiness['happiness'].mean(), 0, 0, 0]),
    X,
    y,
    learning_rate = .15,
    stepsize_tau  = .1
)

# final estimates and their mean squared error
our_sgd['par'], our_sgd['MSE']

  fits[i] = LP


(array([ 5.42765734,  0.53391505, -0.15014284,  0.39964098]),
 0.36857925293717886)