# Parameter Distribution Plot

In [1]:
import pandas as pd
import numpy as np
from numpy.random import uniform
# Seed NumPy's global random state so that the `uniform` draws used to build
# the fake results below are reproducible on a fresh kernel.
np.random.seed(89345)

In [2]:
from bokeh.plotting import output_notebook
# Configure Bokeh to display its output inline in this notebook.
output_notebook()

In [3]:
from estimagic.optimization.utilities import index_element_to_string
from estimagic.visualization.distribution_plot.parameter_distribution_plot import (
    parameter_distribution_plot,
    _tidy_df_from_results
)

## Generate some artificial results

We begin by creating some fake results. 
Let's start with a rather minimal version of the results 
that only contains the columns that are guaranteed 
to be in the final params DataFrame.

In [4]:
# Number of coefficients per choice; three cut-off rows are appended after them.
n_work_coeffs = 2
n_home_coeffs = 4
n_educ_coeffs = 4
n_beta = n_work_coeffs + n_home_coeffs + n_educ_coeffs

# The three columns that become the (vartype, choice, varname) MultiIndex.
vartypes = ["coefficient"] * n_beta + ["cutoff"] * 3
choices = (
    ["work"] * n_work_coeffs
    + ["home"] * n_home_coeffs
    + ["educ"] * n_educ_coeffs
    + ["home", "educ", "work"]
)
varnames = (
    list(map("x{}".format, range(n_work_coeffs)))
    + list(map("x{}".format, range(n_home_coeffs - 2)))
    + ["z1", "z2"]
    + ["x5", "x6", "z3", "z4"]
    + [None] * 3  # the cut-offs have no variable name
)

# Random coefficient values (drawn in the order work, home, educ) followed by
# the three fixed cut-off values.
values = np.concatenate(
    [
        uniform(-3, 3, n_work_coeffs),
        uniform(-1, 3, n_home_coeffs),
        uniform(-3, 1, n_educ_coeffs),
        [0, 1.153, 4.037],
    ]
)

index_cols = ["vartype", "choice", "varname"]
base_params = pd.DataFrame(
    {
        "vartype": vartypes,
        "choice": choices,
        "varname": varnames,
        "group": "All Parameters",
        "value": values,
    },
    columns=index_cols + ["group", "value"],
).set_index(index_cols)

# One string name per index tuple, plus unbounded lower and upper bounds.
base_params["name"] = list(map(index_element_to_string, base_params.index))
base_params["lower"] = -np.inf
base_params["upper"] = np.inf

In [5]:
# Display the base parameters: indexed by (vartype, choice, varname), with the
# columns group, value, name, lower and upper.
base_params

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,group,value,name,lower,upper
vartype,choice,varname,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
coefficient,work,x0,All Parameters,2.16589,coefficient_work_x0,-inf,inf
coefficient,work,x1,All Parameters,1.573793,coefficient_work_x1,-inf,inf
coefficient,home,x0,All Parameters,2.809119,coefficient_home_x0,-inf,inf
coefficient,home,x1,All Parameters,0.742688,coefficient_home_x1,-inf,inf
coefficient,home,z1,All Parameters,1.688022,coefficient_home_z1,-inf,inf
coefficient,home,z2,All Parameters,2.331855,coefficient_home_z2,-inf,inf
coefficient,educ,x5,All Parameters,-1.523941,coefficient_educ_x5,-inf,inf
coefficient,educ,x6,All Parameters,0.88351,coefficient_educ_x6,-inf,inf
coefficient,educ,z3,All Parameters,-2.359987,coefficient_educ_z3,-inf,inf
coefficient,educ,z4,All Parameters,0.616226,coefficient_educ_z4,-inf,inf


In [6]:
def create_results(params, n_res=50, model_classes=None, add_model_name=False):
    """Create a list of slightly perturbed versions of a base params DataFrame.

    Every result is a copy of ``params`` whose "value" column is shifted by
    independent uniform noise drawn from [-0.75, 0.75). Results are generated
    class by class, in the order given by ``model_classes``.

    Args:
        params (pd.DataFrame): base version of the params DataFrame. It must
            have a numeric "value" column and is not modified.
        n_res (int): total number of results to generate. If it is not
            divisible by the number of model classes, the leading classes get
            one additional result each so that exactly ``n_res`` results are
            returned. Non-positive values yield an empty list.
        model_classes (list): list of model classes. Each result gets a
            "model_class" column with its class. If None or empty, the results
            are generated without a "model_class" column.
        add_model_name (bool): whether to add a "model_name" column of the
            form "<model class>_<running number within the class>" to each
            DataFrame.

    Returns:
        list: the perturbed copies of ``params``, ordered by model class.

    """
    # An empty list of classes is treated like "no classes" instead of
    # failing with a division by zero.
    if model_classes is None or len(model_classes) == 0:
        model_classes = [None]

    # Split n_res over the classes without losing the remainder; a plain
    # int(n_res / n_classes) per class would return fewer than n_res results
    # whenever n_res is not a multiple of the number of classes.
    per_class, leftover = divmod(max(int(n_res), 0), len(model_classes))

    results = []
    for position, model in enumerate(model_classes):
        n_for_model = per_class + (1 if position < leftover else 0)
        for i in range(n_for_model):
            res = params.copy()
            res["value"] += uniform(-0.75, 0.75, len(res))
            if model is not None:
                res["model_class"] = model
            if add_model_name:
                res["model_name"] = "{}_{}".format(model, i)
            results.append(res)
    return results

In [7]:
# 50 perturbed copies of base_params (the default n_res); no model_class or
# model_name column is added.
minimal_results = create_results(base_params)

## Generate the comparison plot

In [8]:
# Build the parameter distribution plot from the minimal results. The call
# returns two objects, kept here as `source` and `plots`.
# NOTE(review): height is presumably in pixels — confirm against the function's docs.
source, plots = parameter_distribution_plot(
    results=minimal_results,
    height=1000,
)

## Adding model_class and group_col

This is very large and not well sorted. We can use the group_col to kick out groups of parameters we're not interested in. 
Assume for the moment that we are only interested in the wage and education coefficients.

In [9]:
# (label, number of rows) in row order: work coefficients, home coefficients,
# education coefficients, cut-offs. The home coefficients and the three
# cut-offs get no group; per the surrounding text these are the parameters
# we are not interested in.
group_sizes = [
    ("Wage Coefficients", n_work_coeffs),
    (None, n_home_coeffs),
    ("Education Coefficients", n_educ_coeffs),
    (None, 3),
]
base_params["group"] = [label for label, size in group_sizes for _ in range(size)]

Furthermore, we might have different model classes we want to compare.

We might try different specifications, different optimization algorithms or different estimators.

Estimagic allows you to color code parameters from models of the same class in the same color. 

Assume we estimated our model with GMM and maximum likelihood with different starting values. The results will show us how sensitive our results are to the additional assumptions we usually need to make when using maximum likelihood.

In [10]:
# 100 results split evenly over the two model classes (50 "GMM", then 50 "ML").
# Each result carries a model_class column and a model_name column of the form
# "<class>_<i>".
mixed_results = create_results(
    params=base_params, 
    n_res=100, 
    model_classes=["GMM", "ML"], 
    add_model_name=True
)

In [11]:
# Plot the GMM/ML results. No group_cols argument is passed, so the function's
# default grouping applies. `source` and `plots` from the earlier plot are overwritten.
source, plots = parameter_distribution_plot(
    results=mixed_results,
    height=700,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df.sort_values(drop_and_sort_cols, inplace=True)


Alternatively, we can use the MultiIndex to group our plots, since any list of columns or index levels is supported. Note that in this case parameters that don't have an entry in every level are not plotted.

In [12]:
# Fill the missing varname of the cut-offs so that they will be plotted.
# The index levels are first turned into regular columns. DataFrame.fillna with
# a per-column dict then returns a new frame in which only "varname" is filled.
# This avoids an in-place fill on a selected column
# (new_df["varname"].fillna(..., inplace=True)), which does not update the
# frame under pandas Copy-on-Write.
index_filled_results = [
    df.reset_index().fillna({"varname": "Cut Off"}) for df in mixed_results
]


In [13]:
# Group the plots by the three former index levels, which reset_index turned
# into regular columns of the frames in index_filled_results.
source, plots = parameter_distribution_plot(
    results=index_filled_results,
    group_cols=["vartype", "choice", "varname"],
    height=1400,
)