In [75]:
import os
import functools
import itertools

import pandas as pd
import numpy as np
from patsy import dmatrices
from scipy import stats
from estimagic import maximize
from estimagic.differentiation.derivatives import first_derivative
from respy.method_of_simulated_moments import _harmonize_input, get_flat_moments


# Ordered Logit Example


## Functions

In [40]:
def _build_data_df(x,y):
    # Basic utility
    data = np.concatenate([x,y.reshape(len(y),1)],axis=1)
    return pd.DataFrame(data=data.copy(),columns=["pared","public","gpa","apply"])

In [41]:
def ordered_logit_processing(formula, data):
    """Process user input for an ordered logit model."""
    # extract data arrays
    y, x = dmatrices(formula + " - 1", data, return_type="dataframe")
    y = y[y.columns[0]]

    # extract dimensions
    num_choices = len(y.unique())
    beta_names = list(x.columns)
    num_betas = len(beta_names)
    num_cutoffs = num_choices - 1

    # set-up index for params_df
    names = beta_names + list(range(num_cutoffs))
    categories = ["beta"] * num_betas + ["cutoff"] * num_cutoffs
    index = pd.MultiIndex.from_tuples(zip(categories, names), names=["type", "name"])

    # make params_df
    np.random.seed(5471)
    start_params = pd.DataFrame(index=index)
    start_params["value"] = np.hstack(
        [
            np.random.uniform(low=-0.5, high=0.5, size=len(x.columns)),
            np.arange(num_cutoffs) * 2,
        ]
    )
    start_params["group"] = start_params.index.get_level_values("type")

    # make constraints
    constr = [{"loc": "cutoff", "type": "increasing"}]

    # turn pandas objects into numpy arrays
    y_arr = y.to_numpy().astype(int)
    x_arr = x.to_numpy()
    
    return start_params, y_arr, x_arr, constr

In [42]:
def _build_moments(data):
    im = data.copy()
    im["gpa"] = pd.cut(im.gpa,bins=5,labels=False)
    ix = pd.MultiIndex.from_tuples(itertools.product(range(2),range(2),range(5),range(3)))
    ix.names = ["pared", "public","gpa", "apply"]
    out = pd.Series(index=ix,data=0)
    rslt =  im.groupby(["pared","public","gpa"])["apply"].value_counts(normalize=True)
    out[rslt.index] = rslt.values
    return out
    

In [None]:
def get_weighting_matrix(
    data,
    empirical_moments,
    calc_moments,
    n_bootstrap_samples,
    n_draws_individuals,
    replace_missing_weights=None,
    return_covariance_matrix=False,
):
    """Compute a diagonal weighting matrix for estimation with MSM.
    Weights are the inverse bootstrap variances of the observed sample moments.
    Args:
    ------
    data (pandas.DataFrame)
        Dataframe containing individual observations. Must contain index named
        "Identifier" by which observations are sampled.
    empirical_moments (dict)
        Dictionary containing empirical moments in the form of pandas.DataFrame
        or pandas.Series.
    calc_moments (dict)
        Dictionary containing moment functions.
    n_bootstrap_samples (int)
        Number of samples that should be boostrapped.
    n_draws_individuals (int)
        Observations per bootstrap sample (individual ids).
    replace_missing_weights (None or float)
        Can be used to replace missing weights with a float value. If none, in
        cases where where weights are computed to be missing/infinite (i.e. if
        variances are 0), weights are set to zero.
    return_covariance_matrix : bool, default False
        Return full covariance matrix of bootstrapped moments.
    Returns:
    --------
    weighting_matrix (numpy.array)
        Diagonal weighting matrix with dimensions RxR where R denotes the
        number of moments.
    covariance_matrix (numpy.array)
        Covariance matrix of moments.
    """
    data = data.copy()
    np.random.seed(123)
    flat_empirical_moments = get_flat_moments(empirical_moments)
    index_base = data.index.get_level_values(0).unique()
    calc_moments = _harmonize_input(calc_moments)
    # Create bootstrapped moments.
    moments_sample = []
    for _ in range(n_bootstrap_samples):
        ids_boot = np.random.choice(index_base, n_draws_individuals, replace=False)
        moments_boot = {k: func(data.loc[ids_boot]) for k, func in calc_moments.items()}
        flat_moments_boot = get_flat_moments(moments_boot)
        flat_moments_boot = flat_moments_boot.reindex_like(flat_empirical_moments)
        # flat_moments_boot = flat_moments_boot.fillna(0)
        moments_sample.append(flat_moments_boot)

    # Compute variance for each moment and construct diagonal weighting matrix.
    moments_var = np.array(moments_sample).var(axis=0)

    # The variance of missing moments is nan. Unless a replacement variance is
    # specified, their inverse variance will be set to 0.
    diagonal = moments_var ** (-1)
    if replace_missing_weights is None:
        diagonal = np.nan_to_num(diagonal, nan=0, posinf=0, neginf=0)
    else:
        diagonal = np.nan_to_num(
            moments_var,
            nan=replace_missing_weights,
            posinf=replace_missing_weights,
            neginf=replace_missing_weights,
        )

    weighting_matrix = np.diag(diagonal)

    # Checks weighting matrix.
    if np.isnan(weighting_matrix).any() or np.isinf(weighting_matrix).any():
        raise ValueError("Weighting matrix contains NaNs or infinite values.")

    if return_covariance_matrix:
        covariance_matrix = np.cov(np.array(moments_sample).T, ddof=0)
        out = weighting_matrix, covariance_matrix
        assert np.allclose(
            moments_var, np.diag(covariance_matrix)
        ), "Variances in two outputs are not equal."
    else:
        out = weighting_matrix
    return out

In [101]:
def ordered_logit_msm(
    params,
    x,
    moment_func,
    moments_obs,
    weighting=[],
    return_scalar=True
    
):
    """MSM criterion for ordered logit"""
    # parse the parameter vector into its quantities
    beta = params.loc["beta", "value"].to_numpy()
    cutoffs = params.loc["cutoff", "value"].to_numpy()

    # calculate deterministic part of utilities
    xb = x.dot(beta).reshape(len(x),1)

    # Simulate Result:
    upper_cutoffs = np.hstack([cutoffs, np.inf])
    lower_cutoffs = np.hstack([-np.inf, cutoffs])
    upper_cdf = stats.logistic.cdf(upper_cutoffs - xb)
    lower_cdf = stats.logistic.cdf(lower_cutoffs - xb)

    prob_cumulative = (upper_cdf - lower_cdf).cumsum(axis=1)
    draws = np.random.rand(len(xb), 1)
    labels = (draws < prob_cumulative).argmax(axis=1)
    
    moments_sim = moment_func(_build_data_df(x,labels))
    
    dev = (moments_sim - moments_obs).values
    
    if len(weighting)==0:
        weighting = np.identity(len(moments_obs))
    
    if return_scalar:
        return dev @ weighting @ dev
    else:
        return dev @ np.sqrt(weighting)

## Prepare data

In [102]:
# Data Set
data = pd.read_pickle("~/OpenSourceEconomics/estimagic/docs/source/getting_started/ologit.pickle")
formula = "apply ~ pared + public + gpa"
start_params, y, x, constraints = ordered_logit_processing(formula, data)
n = x.shape[0]

Assume we are not allowed to keep the dependent information due to privacy concerns.
We are only allowed to extract a moments at a certain level of granularity.

In [103]:
data = _build_data_df(x, y)
moments_obs = _build_moments(data)

In [104]:
weighting, S = get_weighting_matrix(
    data,
    moments_obs,
    _build_moments,
    n_bootstrap_samples=50,
    n_draws_individuals=100,
    replace_missing_weights=None,
    return_covariance_matrix=True,
)



In [105]:
# Now we pretend to leave our secure work space. Thus we have to delete y.
del y
del data

In [106]:
# Now we build the objective function
objective = functools.partial(
    ordered_logit_msm,
    x=x,
    moment_func=_build_moments,
    moments_obs=moments_obs,
    weighting=weighting
)

In [107]:
# We perform one evaluation to make sure our setup works
objective(start_params)

83.37781021712918

In [108]:
# Optmize
rslt = maximize(
    criterion=objective,
    params=start_params,
    algorithm="scipy_lbfgsb",
    constraints=constraints,
    logging="ordered_logit.db",
)


In [109]:
params = rslt["solution_params"] 

# Inference

In [110]:
def sandwich_cov(G, W, S, n):
    bread = np.linalg.inv(
        G.T @ W @ G
    )
    butter = G.T @ W @ S @ W @ G
    return bread @ butter @ bread / n

In [123]:
def get_msm_standart_errors(objective, theta_hat, S, W, n):
    # Get Hessian Matrix
    G = first_derivative(
    objective, 
    theta_hat, 
    method="central", 
    #key="moment_errors", 
    base_steps=0.3,
    return_func_value=True,
    n_cores=1,
    )[0].to_numpy()
    
    
    return sandwich_cov(G, W, S, n)

In [124]:
objective = functools.partial(
    ordered_logit_msm,
    x=x,
    moment_func=_build_moments,
    moments_obs=moments_obs,
    weighting=weighting,
    return_scalar=False
)

In [125]:
check = objective(start_params)

In [126]:
start_params

Unnamed: 0_level_0,Unnamed: 1_level_0,value,group
type,name,Unnamed: 2_level_1,Unnamed: 3_level_1
beta,pared,0.477382,beta
beta,public,0.226502,beta
beta,gpa,-0.467458,beta
cutoff,0,0.0,cutoff
cutoff,1,2.0,cutoff


In [127]:
check.shape

(60,)

In [128]:
moments_obs.shape

(60,)

In [129]:
get_msm_standart_errors(objective, params, S, weighting , n)

[[ 1.00191625  1.50287437 -1.00191625 -0.50095812 -1.00191625]
 [-0.57297133 -1.14594266  0.57297133  1.14594266  1.71891399]
 [-1.76345334 -1.76345334  1.76345334 -1.76345334 -1.76345334]
 [-1.10161307  2.86419398 -3.96580706  0.88129046 -1.76258091]
 [ 1.56231362 -2.2318766   3.79419021  0.          2.45506426]
 [-0.80467649 -1.20701474  0.40233825 -1.60935298 -1.20701474]
 [ 1.00930564  0.7209326  -5.19071472  2.01861128 -1.15349216]
 [-0.57667337 -0.57667337  4.32505028 -1.58585177  2.16252514]
 [-0.84883563 -0.28294521  1.69767125 -0.84883563 -1.98061646]
 [-1.11498438  0.8919875  -3.34495313  1.11498438  2.45296563]
 [ 0.59542219 -0.39694813  3.17558503 -1.19084439 -1.38931845]
 [ 0.93257392 -0.93257392 -0.46628696  0.46628696 -1.86514784]
 [ 0.5509043   1.65271289 -1.65271289  0.         -1.10180859]
 [-0.51803204 -1.03606408  0.51803204  0.51803204  1.03606408]
 [ 0.         -1.45079396  2.90158793 -1.45079396  0.        ]
 [ 0.          0.          0.          0.          0.  

array([[ 1.66734255e-06,  6.31592669e-08, -2.59086364e-07,
        -1.04380053e-06, -1.07444883e-07],
       [ 6.31592669e-08,  1.09612376e-06,  8.56686223e-08,
        -2.47556391e-07, -3.44459448e-07],
       [-2.59086364e-07,  8.56686223e-08,  6.48396486e-07,
         9.14845439e-07, -1.78117043e-07],
       [-1.04380053e-06, -2.47556391e-07,  9.14845439e-07,
         2.40364517e-06, -1.75965067e-07],
       [-1.07444883e-07, -3.44459448e-07, -1.78117043e-07,
        -1.75965067e-07,  4.49406596e-07]])