In [10]:

from itertools import product

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from scipy.optimize import minimize
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler


# Load the two movie-review tables (plain and "processed", per the file names);
# .dropna() discards every row that has a missing value.
df_reviews = pd.read_csv('../data/movie_reviews.csv').dropna()
df_reviews_pr = pd.read_csv('../data/movie_reviews_processed.csv').dropna()
# sm.load reads a pickled object from disk -- presumably a previously fitted
# statsmodels model; confirm against the notebook that saved it.
# NOTE(review): unpickling can run arbitrary code, so only load trusted files.
model_reviews = sm.load('../linear_models/data/model_reviews.pickle') # pkl later

In [13]:
# Build the happiness data used by the cells below: read the CSV, drop rows
# with any missing value, and shorten the column names that are used later.
df_happiness = (
    pd.read_csv('../data/world_happiness_2018.csv')
    .dropna()
    .rename(
        columns = {
            'happiness_score': 'happiness',
            'healthy_life_expectancy_at_birth': 'life_exp',
            'log_gdp_per_capita': 'log_gdp_pc',
            'perceptions_of_corruption': 'corrupt'
        }
    )
    # exponentiate the logged column to get GDP per capita on its raw scale
    .assign(
        gdp_pc = lambda x: np.exp(x['log_gdp_pc']),
    )    
    # the logged version is dropped once gdp_pc exists
    .drop(columns = ['log_gdp_pc'])    
)


# Standardize the three predictors in place so each is centered and scaled
# (StandardScaler is imported with the other imports at the top of the notebook).
# The fitted scaler is kept in `scaler` in case the transform must be reused.
scaler = StandardScaler()

df_happiness[['life_exp', 'gdp_pc', 'corrupt']] = scaler.fit_transform(
    df_happiness[['life_exp', 'gdp_pc', 'corrupt']]
)

## OLS

In [14]:
def ols(par, X, y, sum = False):
    """
    Least-squares objective for a linear model with an intercept.

    par : coefficient vector; intercept first, then one entry per column of X
    X   : predictor values, observations along the first axis (a column of
          1s is prepended inside the function)
    y   : observed target values
    sum : if True return the sum of squared errors, otherwise (default) the
          mean squared error. The name shadows the builtin `sum` inside this
          function, so the builtin is deliberately not used here.
    """
    n_obs = X.shape[0]

    # design matrix: intercept column followed by the predictors
    design = np.c_[np.ones(n_obs), X]

    # squared distance between observed and fitted values
    residual = y - design @ par
    sse = np.sum(residual**2)

    if sum:
        return sse
    return sse / n_obs

# create a grid of guesses: every combination of an intercept (b0) and a
# slope (b1); `product` is imported with the other imports at the top
guesses = pd.DataFrame(
    product(
        np.arange(1, 7, 0.1),
        np.arange(-1, 1, 0.1)
    ),
    columns = ['b0', 'b1']
)

# Example for one guess: mean squared error at the first (b0, b1) pair
ols(
    par = guesses.iloc[0,:],
    X = df_happiness['life_exp'],
    y = df_happiness['happiness']
)

23.793842044979073

In [21]:
# Let the optimizer search for the (intercept, slope) pair that minimizes the
# mean squared error returned by `ols`, starting from (1, 0).
# `minimize` is already imported at the top of the notebook.
our_result = minimize(
    fun    = ols,
    x0     = np.array([1., 0.]),
    args   = (np.array(df_happiness['life_exp']), np.array(df_happiness['happiness'])),
    method = 'BFGS' # optimization algorithm
)

our_result

  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 0.48851727833540676
        x: [ 5.445e+00  8.838e-01]
      nit: 3
      jac: [-9.313e-08  7.004e-07]
 hess_inv: [[ 5.190e-01 -9.564e-02]
            [-9.564e-02  9.810e-01]]
     nfev: 12
     njev: 4

In [23]:
# Worked example of a normal likelihood for two observations
# (`norm` is imported with the other imports at the top of the notebook).

# two example life expectancy scores, mean and 1 sd above
life_expectancy = np.array([0, 1])

# observed happiness scores
happiness = np.array([4, 5.2])

# predicted happiness with rounded coefs
mu = 5 + 1 * life_expectancy

# just a guess for sigma
sigma = .5

# likelihood for each observation: normal density of the observed score
# around its predicted mean
L = norm.pdf(happiness, loc = mu, scale = sigma)
L

array([0.10798193, 0.22184167])

## Maximum Likelihood

In [37]:
def likelihood(par, X, y):
    """
    Negative Gaussian log-likelihood of a linear model with an intercept.

    Despite the name, the returned value is the *negative* summed
    log-likelihood, so it can be handed directly to a minimizer.

    par : parameter vector; par[0] is log(sigma) (exponentiated below so the
          error sd stays positive), par[1:] are the coefficients with the
          intercept first
    X   : predictor values, observations along the first axis (a column of
          1s is prepended inside the function)
    y   : observed target values
    """
    # add a column of 1s for the intercept
    X = np.c_[np.ones(X.shape[0]), X]

    # unpack the parameter vector
    sigma = np.exp(par[0])   # error sd
    beta  = par[1:]          # coefficients

    # linear predictor; with an identity link it is also the mean
    mu = X @ beta

    # log density of each observation around its mean, summed and negated
    ll = norm.logpdf(y, loc = mu, scale = sigma)
    return -np.sum(ll)

# Minimize the negative log-likelihood. Start values follow the layout of
# `par` in `likelihood`: log(sigma) first, then intercept and slope.
our_result = minimize(
    fun    = likelihood,
    x0     = np.array([1., 0., 0.]),
    args   = (np.array(df_happiness['life_exp']), np.array(df_happiness['happiness'])),
    method = 'BFGS' # the default for an unconstrained problem, stated explicitly; 'Nelder-Mead' is a derivative-free alternative
)

In [38]:
# Display the optimizer result from the previous cell (a bare last expression
# is rendered as the cell's output).
our_result

  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 118.80381119428438
        x: [-3.582e-01  5.445e+00  8.838e-01]
      nit: 14
      jac: [-1.907e-06  0.000e+00  9.537e-07]
 hess_inv: [[ 4.600e-03 -1.529e-04  9.976e-05]
            [-1.529e-04  4.642e-03 -8.236e-05]
            [ 9.976e-05 -8.236e-05  4.459e-03]]
     nfev: 88
     njev: 22

## Penalized

In [None]:
def ridge(par, X, y, lambda_ = 0):
    """
    Ridge (L2-penalized) least-squares objective with an intercept.

    par     : coefficient vector; intercept first, then one entry per column
              of X
    X       : predictor values, observations along the first axis (a column
              of 1s is prepended inside the function)
    y       : observed target values
    lambda_ : penalty weight; 0 (default) gives the plain sum of squared
              errors

    Returns the sum (not the mean) of squared errors plus the penalty. The
    penalty is scaled by the number of observations and sums the squares of
    every entry of `par`, the intercept included.
    """
    n_obs = X.shape[0]

    # design matrix: intercept column followed by the predictors
    design = np.c_[np.ones(n_obs), X]

    # lack of fit
    residual = y - design @ par
    sse = np.sum(residual**2)

    # shrinkage term
    penalty = lambda_ * n_obs * np.sum(par**2)

    return sse + penalty

# Evaluate the ridge objective once: intercept 3, slope 0.1 on word count,
# with a penalty weight of 0.1
ridge(
    par = np.array([3, .1]),
    X = df_reviews['word_count'],
    y = df_reviews['rating'],
    lambda_ = 0.1,
)

2732.58