In [1]:
import os
import functools
import itertools
import string

import pandas as pd
import numpy as np
from patsy import dmatrices
from scipy import stats
from estimagic import minimize, maximize
from estimagic.differentiation.derivatives import first_derivative
from respy.method_of_simulated_moments import _harmonize_input, get_flat_moments
import estimagic

In [2]:
estimagic.__version__

'0.1.2'

# Tutorial Standart Errors MSM 
## Ordered Logit Example
This notebook contains a full tutorial about standart errors with method of simuated moments.
We continue the maximum likelihood example and still consider an ordered logit case.



## Functions

In [3]:
def simulate_dataset(n_agents, params):
    beta = params.loc["beta", "value"].to_numpy()
    cutoffs = params.loc["cutoff", "value"].to_numpy()
    range_vars = np.random.choice(range(2,7),size=len(beta))
    X = np.concatenate([np.random.choice(range(x),size=n_agents).reshape(n_agents,1) for x in range_vars],axis=1)

    # calculate deterministic part of utilities
    xb = X.dot(beta).reshape(n_agents,1)

    # Simulate Result:
    upper_cutoffs = np.hstack([cutoffs, np.inf])
    lower_cutoffs = np.hstack([-np.inf, cutoffs])
    upper_cdf = stats.logistic.cdf(upper_cutoffs - xb)
    lower_cdf = stats.logistic.cdf(lower_cutoffs - xb)

    prob_cumulative = (upper_cdf - lower_cdf).cumsum(axis=1)
    draws = np.random.rand(len(xb), 1)
    labels = (draws < prob_cumulative).argmax(axis=1)
    out = pd.DataFrame(X)
    out.columns = params.loc["beta"].index.values
    out["y"] = labels
    return out

    
    

In [4]:
def _build_data_df(x,y,cols):
    # Basic utility
    data = np.concatenate([x,y.reshape(len(y),1)],axis=1)
    return pd.DataFrame(data=data.copy(),columns=cols)

In [5]:
def ordered_logit_processing(formula, data):
    """Process user input for an ordered logit model."""
    # extract data arrays
    y, x = dmatrices(formula + " - 1", data, return_type="dataframe")
    y = y[y.columns[0]]

    # extract dimensions
    num_choices = len(y.unique())
    beta_names = list(x.columns)
    num_betas = len(beta_names)
    num_cutoffs = num_choices - 1

    # set-up index for params_df
    names = beta_names + list(range(num_cutoffs))
    categories = ["beta"] * num_betas + ["cutoff"] * num_cutoffs
    index = pd.MultiIndex.from_tuples(zip(categories, names), names=["type", "name"])

    # make params_df
    np.random.seed(5471)
    start_params = pd.DataFrame(index=index)
    start_params["value"] = np.hstack(
        [
            np.random.uniform(low=-0.5, high=0.5, size=len(x.columns)),
            np.arange(num_cutoffs) * 2,
        ]
    )
    start_params["group"] = start_params.index.get_level_values("type")

    # make constraints
    constr = [{"loc": "cutoff", "type": "increasing"}]

    # turn pandas objects into numpy arrays
    y_arr = y.to_numpy().astype(int)
    x_arr = x.to_numpy()
    
    return start_params, y_arr, x_arr, constr

In [6]:
def _build_moments(data, ind):
    im = data.copy()
    #im["gpa"] = pd.qcut(im.gpa,q=3,labels=False)
    ranges = data.max(axis=0)
    ix = pd.MultiIndex.from_tuples(itertools.product(*(range(int(x + 1)) for x in ranges)))
    ix.names = ind + ["y"]
    out = pd.Series(index=ix,data=0)
    rslt =  im.groupby(ind)["y"].value_counts(normalize=True)
    out[rslt.index] = rslt.values
    return out
    

In [7]:
def get_weighting_matrix(
    data,
    empirical_moments,
    calc_moments,
    n_bootstrap_samples,
    n_draws_individuals,
    replace_missing_weights=None,
    return_covariance_matrix=False,
):
    """Compute a diagonal weighting matrix for estimation with MSM.
    Weights are the inverse bootstrap variances of the observed sample moments.
    Args:
    ------
    data (pandas.DataFrame)
        Dataframe containing individual observations. Must contain index named
        "Identifier" by which observations are sampled.
    empirical_moments (dict)
        Dictionary containing empirical moments in the form of pandas.DataFrame
        or pandas.Series.
    calc_moments (dict)
        Dictionary containing moment functions.
    n_bootstrap_samples (int)
        Number of samples that should be boostrapped.
    n_draws_individuals (int)
        Observations per bootstrap sample (individual ids).
    replace_missing_weights (None or float)
        Can be used to replace missing weights with a float value. If none, in
        cases where where weights are computed to be missing/infinite (i.e. if
        variances are 0), weights are set to zero.
    return_covariance_matrix : bool, default False
        Return full covariance matrix of bootstrapped moments.
    Returns:
    --------
    weighting_matrix (numpy.array)
        Diagonal weighting matrix with dimensions RxR where R denotes the
        number of moments.
    covariance_matrix (numpy.array)
        Covariance matrix of moments.
    """
    data = data.copy()
    np.random.seed(123)
    flat_empirical_moments = get_flat_moments(empirical_moments)
    index_base = data.index.get_level_values(0).unique()
    calc_moments = _harmonize_input(calc_moments)
    # Create bootstrapped moments.
    moments_sample = []
    for _ in range(n_bootstrap_samples):
        ids_boot = np.random.choice(index_base, n_draws_individuals, replace=False)
        moments_boot = {k: func(data.loc[ids_boot]) for k, func in calc_moments.items()}
        flat_moments_boot = get_flat_moments(moments_boot)
        flat_moments_boot = flat_moments_boot.reindex_like(flat_empirical_moments)
        # flat_moments_boot = flat_moments_boot.fillna(0)
        moments_sample.append(flat_moments_boot)

    # Compute variance for each moment and construct diagonal weighting matrix.
    moments_var = np.array(moments_sample).var(axis=0)

    # The variance of missing moments is nan. Unless a replacement variance is
    # specified, their inverse variance will be set to 0.
    diagonal = moments_var ** (-1)
    if replace_missing_weights is None:
        diagonal = np.nan_to_num(diagonal, nan=0, posinf=0, neginf=0)
    else:
        diagonal = np.nan_to_num(
            moments_var,
            nan=replace_missing_weights,
            posinf=replace_missing_weights,
            neginf=replace_missing_weights,
        )

    weighting_matrix = np.diag(diagonal)

    # Checks weighting matrix.
    if np.isnan(weighting_matrix).any() or np.isinf(weighting_matrix).any():
        raise ValueError("Weighting matrix contains NaNs or infinite values.")

    if return_covariance_matrix:
        covariance_matrix = np.cov(np.array(moments_sample).T, ddof=0)
        out = weighting_matrix, covariance_matrix
        assert np.allclose(
            moments_var, np.diag(covariance_matrix)
        ), "Variances in two outputs are not equal."
    else:
        out = weighting_matrix
    return out

In [8]:
def ordered_logit_msm(
    params,
    x,
    moment_func,
    moments_obs,
    cols,
    weighting=[],
    return_scalar=True
    
):
    """MSM criterion for ordered logit"""
    # parse the parameter vector into its quantities
    beta = params.loc["beta", "value"].to_numpy()
    cutoffs = params.loc["cutoff", "value"].to_numpy()

    # calculate deterministic part of utilities
    xb = x.dot(beta).reshape(len(x),1)

    # Simulate Result:
    upper_cutoffs = np.hstack([cutoffs, np.inf])
    lower_cutoffs = np.hstack([-np.inf, cutoffs])
    upper_cdf = stats.logistic.cdf(upper_cutoffs - xb)
    lower_cdf = stats.logistic.cdf(lower_cutoffs - xb)

    prob_cumulative = (upper_cdf - lower_cdf).cumsum(axis=1)
    draws = np.random.rand(len(xb), 1)
    labels = (draws < prob_cumulative).argmax(axis=1)
    
    moments_sim = moment_func(_build_data_df(x,labels,cols))
    
    dev = (moments_sim - moments_obs).values
    
    if len(weighting)==0:
        weighting = np.identity(len(moments_obs))
    
    if return_scalar:
        return dev @ weighting @ dev
    else:
        return dev @ np.sqrt(weighting)

## Build Dataset


In [9]:
params = pd.DataFrame(pd.Series({
    ("beta","a"):-2,
    ("beta","b"):1,
    ("beta","c"):3,
    ("cutoff",0):2,
    ("cutoff",1):4,
}))
params.columns = ["value"]
params["lower_bound"] = - np.inf
params["upper_bound"] = np.inf

params.index = pd.MultiIndex.from_tuples(params.index)


In [10]:
params 

Unnamed: 0,Unnamed: 1,value,lower_bound,upper_bound
beta,a,-2,-inf,inf
beta,b,1,-inf,inf
beta,c,3,-inf,inf
cutoff,0,2,-inf,inf
cutoff,1,4,-inf,inf


In [35]:
data = simulate_dataset(10000, params)

In [36]:
data

Unnamed: 0,a,b,c,y
0,1,1,0,0
1,0,1,1,2
2,0,1,1,0
3,1,1,0,0
4,0,0,0,0
...,...,...,...,...
9995,1,0,1,0
9996,0,1,0,0
9997,2,1,1,0
9998,2,1,0,0


## Prepare data

In [37]:
# Data Set
#data = pd.read_pickle("~/OpenSourceEconomics/estimagic/docs/source/getting_started/ologit.pickle")
formula = "y ~ a + b + c"
start_params, y, x, constraints = ordered_logit_processing(formula, data)
n = x.shape[0]
n_agents_sim = 10000
cols = ["a","b","c","y"]

In [38]:
start_params.loc["beta"].index.values

array(['a', 'b', 'c'], dtype=object)

Assume we are not allowed to keep the dependent information due to privacy concerns.
We are only allowed to extract a moments at a certain level of granularity.

In [39]:
#data = _build_data_df(x, y)
ind = list(start_params.loc["beta"].index.values)
moments_obs = _build_moments(data,ind)
moment_func = functools.partial(_build_moments,ind=ind)

In [41]:
weighting, S = get_weighting_matrix(
    data,
    moments_obs,
    moment_func,
    n_bootstrap_samples=500,
    n_draws_individuals=100,
    replace_missing_weights=None,
    return_covariance_matrix=True,
)



In [42]:
# Now we pretend to leave our secure work space. Thus we have to delete y.
del y
del data

In [43]:
# Now we build the objective function
objective = functools.partial(
    ordered_logit_msm,
    x=x,
    moment_func=moment_func,
    moments_obs=moments_obs,
    weighting=weighting,
    cols=cols
)

In [44]:
x

array([[1., 1., 0.],
       [0., 1., 1.],
       [0., 1., 1.],
       ...,
       [2., 1., 1.],
       [2., 1., 0.],
       [2., 0., 0.]])

In [None]:
# We perform one evaluation to make sure our setup works
objective(start_params)

In [None]:
# Optmize
rslt = minimize(
    criterion=objective,
    params=start_params,
    algorithm="scipy_powell",
    constraints=constraints,
    logging="ordered_logit.db",
)


In [None]:
rslt

In [None]:
params = rslt["solution_params"] 

# Inference

In [None]:
def sandwich_cov(G, W, S, n):
    bread = np.linalg.inv(
        G.T @ W @ G
    )
    butter = G.T @ W @ S @ W @ G
    return bread @ butter @ bread / n

In [None]:
def get_msm_standart_errors(objective, theta_hat, S, W, n):
    # Get Hessian Matrix
    G = first_derivative(
    objective, 
    theta_hat, 
    method="central", 
    #key="moment_errors", 
    base_steps=0.3,
    return_func_value=True,
    n_cores=1,
    )[0].to_numpy()
    
    
    return sandwich_cov(G, W, S, n)

In [None]:
objective = functools.partial(
    ordered_logit_msm,
    x=x,
    moment_func=moment_func,
    moments_obs=moments_obs,
    weighting=weighting,
    return_scalar=False,
    cols=cols
)

In [None]:
check = objective(start_params)

In [None]:
params

In [None]:
cov = get_msm_standart_errors(objective, params, S, weighting , n)

In [31]:
np.sqrt(np.diag(cov))

array([1.34740956e-04, 6.37699663e-05, 7.90890875e-05, 1.11444044e-04,
       1.13477077e-04])