In [1]:

import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.linear_model import LinearRegression

from scipy.optimize import minimize
from scipy import stats


df_reviews = pd.read_csv('../data/movie_reviews.csv').dropna()
df_reviews_pr = pd.read_csv('../data/movie_reviews_processed.csv').dropna()
model_reviews = sm.load('../linear_models/data/model_reviews.pickle') # pkl later

In [8]:
df_happiness = (
    pd.read_csv('../data/world_happiness_2018.csv')
    .dropna()
    .rename(
        columns = {
            'happiness_score': 'happiness',
            'healthy_life_expectancy_at_birth': 'life_exp',
            'log_gdp_per_capita': 'log_gdp_pc',
            'perceptions_of_corruption': 'corrupt'
        }
    )
    .assign(
        gdp_pc = lambda x: np.exp(x['log_gdp_pc']),
    )    
    [['country', 'happiness','life_exp', 'gdp_pc', 'corrupt']]
)


from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df_happiness[['life_exp_sc', 'gdp_pc_sc', 'corrupt_sc']] = scaler.fit_transform(
    df_happiness[['life_exp', 'gdp_pc', 'corrupt']]
)
df_happiness = df_happiness.drop(columns = ['life_exp', 'gdp_pc', 'corrupt'])

## OLS

In [19]:
def ols(par, X, y, sum = False):
    # add a column of 1s for the intercept
    X = np.c_[np.ones(X.shape[0]), X]

    # Calculate the predicted values
    y_hat = X @ par
    
    # Calculate the error
    value = np.sum((y - y_hat)**2)
    
    # Calculate the value as sum or average
    if not sum:
        value = value / X.shape[0]
    
    # Return the value
    return(value)

# create a grid of guesses
from itertools import product

guesses = pd.DataFrame(
    product(
        np.arange(1, 7, 0.1),
        np.arange(-1, 1, 0.1)
    ),
    columns = ['b0', 'b1']
)

# Example for one guess
ols(
    par = guesses.iloc[0,:],
    X = df_happiness['life_exp_sc'],
    y = df_happiness['happiness']
)

# Calculate the function value for each guess
guesses['objective'] = guesses.apply(
    lambda x: ols(par = x, X = df_happiness['life_exp_sc'], y = df_happiness['happiness']),
    axis = 1
)

min_loss = guesses[guesses['objective'] == guesses['objective'].min()]

min_loss

Unnamed: 0,b0,b1,objective
899,5.4,0.9,0.490789


In [11]:
from scipy.optimize import minimize

our_result = minimize(
    fun    = ols,
    x0     = np.array([1., 0.]),
    args   = (np.array(df_happiness['life_exp_sc']), np.array(df_happiness['happiness'])),
    method = 'BFGS' # optimization algorithm
)


our_result

  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 0.48851727833540676
        x: [ 5.445e+00  8.838e-01]
      nit: 3
      jac: [-9.313e-08  7.004e-07]
 hess_inv: [[ 5.190e-01 -9.564e-02]
            [-9.564e-02  9.810e-01]]
     nfev: 12
     njev: 4

In [12]:
from scipy.stats import norm

# two example life expectancy scores, mean and 1 sd above
life_expectancy = np.array([0, 1])

# observed happiness scores
happiness = np.array([4, 5.2])

# predicted happiness with rounded coefs
mu = 5 + 1 * life_expectancy

# just a guess for sigma
sigma = .5

# likelihood for each observation
L = norm.pdf(happiness, loc = mu, scale = sigma)
L

array([0.10798193, 0.22184167])

## MaxLike

In [20]:
def likelihood(par, X, y):
    # add a column of 1s for the intercept
    X = np.c_[np.ones(X.shape[0]), X]

    # setup
    beta   = par[1:]       # coefficients
    sigma  = np.exp(par[0])        # error sd

    N = X.shape[0]

    LP = X @ beta          # linear predictor
    mu = LP                # identity link in the glm sense

    # calculate (log) likelihood
    ll = norm.logpdf(y, loc = mu, scale = sigma) 
    return(-np.sum(ll))

our_result = minimize(
    fun    = likelihood,
    x0     = np.array([1, 0, 0]),
    args   = (np.array(df_happiness['life_exp_sc']), np.array(df_happiness['happiness'])),
    # method = "Nelder-Mead"
)

In [21]:
our_result

  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 118.80381119428435
        x: [-3.582e-01  5.445e+00  8.838e-01]
      nit: 14
      jac: [ 1.907e-06  9.537e-07  9.537e-07]
 hess_inv: [[ 4.555e-03 -1.485e-05 -8.744e-05]
            [-1.485e-05  4.411e-03  1.340e-04]
            [-8.744e-05  1.340e-04  4.760e-03]]
     nfev: 88
     njev: 22

## Penalized

In [22]:
def ridge(par, X, y, lambda_ = 0):
    # add a column of 1s for the intercept
    X = np.c_[np.ones(X.shape[0]), X]

    # Calculate the predicted values
    mu = X @ par
    
    # Calculate the error
    value = np.sum((y - mu)**2)
    
    # Add the penalty
    value = value + lambda_ * np.sum(par**2)
    
    return(value)

ridge(
    X = df_happiness['life_exp_sc'],
    y = df_happiness['happiness'],
    par = np.array([0, 0]),
    lambda_ = 0.1
)

our_result = minimize(
    fun  = ridge,
    x0   = np.array([0, 0, 0, 0]),
    args = (
        np.array(df_happiness.drop(columns=['happiness', 'country'])),
        np.array(df_happiness['happiness']), 
        0.1
    ),
    # method = 'BFGS',
    tol=1e-10,
    options={'maxiter': 10000}
)

In [23]:
our_result['x']

array([ 5.43997501,  0.52188498,  0.43554025, -0.10484642])

In [24]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha = 0.1)

ridge.fit(
    X = df_happiness.drop(columns=['happiness', 'country']),
    y = df_happiness['happiness']
)

ridge.intercept_, ridge.coef_

(5.4448321376528055, array([ 0.52188497,  0.43554027, -0.10484641]))

## Classification

In [115]:
def objective(par, X, y):
    # add a column of 1s for the intercept
    X = np.c_[np.ones(X.shape[0]), X]

    # Calculate the predicted values
    y_hat = X @ par
    
    # Convert to a probability ('sigmoid' function)
    y_hat = 1 / (1 + np.exp(-y_hat))
    
    # likelihood
    ll = y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)
    
    return(-np.sum(ll))

In [122]:
from scipy.optimize import minimize

df_happiness_bin = df_happiness.copy()
df_happiness_bin['happiness'] = np.where(df_happiness['happiness'] > 5.5, 1, 0)

mod_logloss = minimize(
    objective,
    x0 = np.array([0, 0, 0, 0]),
    args = (
        df_happiness_bin[['life_exp_sc', 'gdp_pc_sc', 'corrupt_sc']],
        df_happiness_bin['happiness']
    )
)

mod_glm = smf.glm(
    'happiness ~ life_exp_sc + corrupt_sc + gdp_pc_sc',
    data   = df_happiness_bin,
    family = sm.families.Binomial()
).fit()

In [118]:
mod_logloss, mod_glm.summary()

(  message: Optimization terminated successfully.
   success: True
    status: 0
       fun: 40.663473928254746
         x: [-1.637e-01  1.809e+00  1.126e+00 -4.627e-01]
       nit: 14
       jac: [ 9.060e-06  1.907e-06  3.338e-06  9.060e-06]
  hess_inv: [[ 1.315e-01 -1.202e-01  1.242e-01 -5.169e-02]
             [-1.202e-01  4.133e-01 -2.214e-01 -3.464e-02]
             [ 1.242e-01 -2.214e-01  3.781e-01 -2.298e-02]
             [-5.169e-02 -3.464e-02 -2.298e-02  1.463e-01]]
      nfev: 85
      njev: 17,
 <class 'statsmodels.iolib.summary.Summary'>
 """
                  Generalized Linear Model Regression Results                  
 Dep. Variable:              happiness   No. Observations:                  112
 Model:                            GLM   Df Residuals:                      108
 Model Family:                Binomial   Df Model:                            3
 Link Function:                  Logit   Scale:                          1.0000
 Method:                         lbfgs 

## Gradient Descent

In [15]:
def gradient_descent(
    par, 
    X, 
    y, 
    tolerance = 1e-3, 
    maxit = 1000, 
    learning_rate = 1e-3, 
    adapt = False, 
    verbose = True, 
    plotLoss = True
):
    # add a column of 1s for the intercept
    X = np.c_[np.ones(X.shape[0]), X]
    
    # initialize
    beta = par
    loss = np.sum((X @ beta - y)**2)
    tol = 1
    iter = 1

    while (tol > tolerance and iter < maxit):
        LP = X @ beta
        grad = X.T @ (LP - y)
        betaCurrent = beta - learning_rate * grad
        tol = np.max(np.abs(betaCurrent - beta))
        beta = betaCurrent
        loss = np.append(loss, np.sum((LP - y)**2))
        iter = iter + 1

        if (adapt):
            stepsize = np.where(loss[iter] < loss[iter - 1], stepsize * 1.2, stepsize * .8)

        if (verbose and iter % 10 == 0):
            print("Iteration:", iter)

    if (plotLoss):
        plt.plot(loss)
        plt.show()

    return({
        "par": beta,
        "loss": loss,
        "RSE": np.sqrt(np.sum((LP - y)**2) / (X.shape[0] - X.shape[1])),
        "iter": iter,
        "fitted": LP
    })

In [16]:
our_result = gradient_descent(
    par = np.array([0, 0, 0, 0]),
    X = df_happiness[['life_exp', 'gdp_pc', 'corrupt']].to_numpy(),
    y = df_happiness['happiness'].to_numpy(),
    learning_rate = 1e-3,
    verbose = False,
    plotLoss = False # will show below
)

In [17]:
our_result

{'par': array([ 5.43691264,  0.51898243,  0.4370022 , -0.10687814]),
 'loss': array([3462.57618111, 3462.57618111, 2719.61584058, 2141.99503995,
        1691.29344167, 1338.66327157, 1062.19911351,  845.11737579,
         674.46819375,  540.20549918,  434.50363582,  351.24751107,
         285.64744415,  233.94527786,  193.18834601,  161.05455731,
         135.71639644,  115.7348042 ,   99.97614844,   87.54712969,
          77.74367018,   70.01073704,   63.91073402,   59.09861886,
          55.30230601,   52.30722677,   49.94416173,   48.07965019,
          46.60842954,   45.44747447,   44.53129693,   43.80824002,
          43.23755541,   42.78709868,   42.43151162,   42.15078886,
          41.92914723,   41.75413416,   41.61592436,   41.50676513,
          41.42053882,   41.35241773,   41.29859186,   41.25605417,
          41.22243119,   41.19584938,   41.17482968,   41.15820437,
          41.14505143,   41.13464277,   41.12640334,   41.11987896,
          41.11471081,   41.11061542,  

## SGD

In [110]:
def stochastic_gradient_descent(
    par, # parameter estimates
    X,   # model matrix
    y,   # target variable
    learning_rate = 1, # the learning rate
    stepsize_tau = 0,  # if > 0, a check on the LR at early iterations
    average = False    # a variation of the approach
):
    # initialize
    np.random.seed(1234)

    # shuffle the data
    idx = np.random.choice(
        df_happiness.shape[0], 
        df_happiness.shape[0], 
        replace = False
    )
    X = X[idx, :]
    y = y[idx]
    
    X = np.c_[np.ones(X.shape[0]), X]
    beta = par

    # Collect all estimates
    betamat = np.zeros((X.shape[0], beta.shape[0]))

    # Collect fitted values at each point))
    fits = np.zeros(X.shape[0])

    # Collect loss at each point
    loss = np.zeros(X.shape[0])

    # adagrad per parameter learning rate adjustment
    s = 0

    # a smoothing term to avoid division by zero
    eps = 1e-8

    for i in range(X.shape[0]):
        Xi = X[None, i, :]
        yi = y[i]

        # matrix operations not necessary here,
        # but makes consistent with previous gd func
        LP = Xi @ beta
        grad = Xi.T @ (LP - yi)
        s = s + grad**2 # adagrad approach

        # update
        beta = beta - learning_rate / \
            (stepsize_tau + np.sqrt(s + eps)) * grad

        betamat[i, :] = beta

        fits[i] = LP
        loss[i] = np.sum((LP - yi)**2)

    LP = X @ beta
    lastloss = np.sum((LP - y)**2)

    return({
        "par": beta, # final estimates
        "par_chain": betamat, # estimates at each iteration
        "MSE": lastloss / X.shape[0],
        "fitted": LP
    })
X_train = df_happiness[['life_exp_sc', 'gdp_pc_sc', 'corrupt_sc']]
y_train = df_happiness['happiness']

our_result = stochastic_gradient_descent(
    par = np.array([np.mean(df_happiness['happiness']), 0, 0, 0]),
    X = X_train.to_numpy(),
    y = y_train.to_numpy(),
    learning_rate = .15,
    stepsize_tau = .1
)

our_result['par']

  fits[i] = LP


array([ 5.42766433,  0.53158177,  0.39770852, -0.14934094])

## bootstrap

In [112]:
def bootstrap(X, y, nboot=100, seed=123):
    # add a column of 1s for the intercept
    X = np.c_[np.ones(X.shape[0]), X]
    N = X.shape[0]

    # initialize
    beta = np.empty((nboot, X.shape[1]))
    
    # beta = pd.DataFrame(beta, columns=['Intercept'] + list(cn))
    mse = np.empty(nboot)    

    # set seed
    np.random.seed(seed)

    for i in range(nboot):
        # sample with replacement
        idx = np.random.randint(0, N, N)
        Xi = X[idx, :]
        yi = y[idx]

        # estimate model
        model = LinearRegression(fit_intercept=False)
        mod = model.fit(Xi, yi)

        # save results
        beta[i, :] = mod.coef_
        mse[i] = np.sum((mod.predict(Xi) - yi)**2) / N

    # given mean estimates, calculate MSE
    y_hat = X @ beta.mean(axis=0)
    final_mse = np.sum((y - y_hat)**2) / N

    return dict(beta = beta, mse = mse, final_mse = final_mse)

our_result = bootstrap(
    X = df_happiness[['life_exp_sc', 'gdp_pc_sc', 'corrupt_sc']],
    y = df_happiness['happiness'],
    nboot = 250
)

np.mean(our_result['beta'], axis=0)

array([ 5.45437658,  0.51165618,  0.45536819, -0.10404313])

In [21]:
our_result['beta'].mean(axis=0)

array([ 5.46243972,  0.51282588,  0.45531028, -0.09958016])