In [132]:
import os
import sys

import numpy as np
import pandas as pd

import soepy

from dev_library import plot_basics_choices
from dev_library import plot_basics_wages
from dev_library import df_alignment

# Allow long frequency tables and grids to be displayed without truncation.
pd.set_option('display.max_rows', 500)
# NOTE(review): the search path is extended only after the dev_library imports above. If
# dev_library lives in that directory, a fresh kernel fails on the import -- confirm and,
# if so, move this line above the project imports.
sys.path.insert(0, os.environ["PROJECT_DIR"] + "/estimations/peisenha_exploration")

In [98]:
# Read the Stata file without converting value labels to categoricals, then run the result
# through df_alignment with is_obs=True.
fname = f'{os.environ["PROJECT_DIR"]}/resources/soepcore_struct_prep.dta'
df_obs = df_alignment(pd.read_stata(fname, convert_categoricals=False), is_obs=True)

In [99]:
# Frequency of each recorded age of the youngest child, ordered by age.
df_obs["Age_Youngest_Child"].value_counts().sort_index()

-1.0     41773
 0.0      2600
 1.0      3218
 2.0      2982
 3.0      2942
 4.0      2634
 5.0      2413
 6.0      2285
 7.0      2264
 8.0      2202
 9.0      1989
 10.0     1938
Name: Age_Youngest_Child, dtype: int64

In [155]:
# Define the age brackets here so that this cell also runs on a fresh kernel; previously
# `bins` was only created in a later cell and this display raised a NameError on Run All.
bins = pd.IntervalIndex.from_tuples([(-0.1, 2.1), (2.9, 5.1), (5.9, 10.1)])
bins

IntervalIndex([(-0.1, 2.1], (2.9, 5.1], (5.9, 10.1]],
              closed='right',
              dtype='interval[float64]')

In [138]:
# Age brackets of the youngest child: right-closed intervals around the integer ages
# 0-2, 3-5 and 6-10. Values outside every interval are left unassigned (NaN).
labels = ['0-2', '3-5', '6-10']
bins = pd.IntervalIndex.from_arrays([-0.1, 2.9, 5.9], [2.1, 5.1, 10.1])
# pd.cut ignores `labels` when `bins` is an IntervalIndex, so the categories are the
# intervals themselves; `labels` is kept only as a readable description of the brackets.
df_obs["Age_Range"] = pd.cut(df_obs["Age_Youngest_Child"], bins)

In [139]:
# Number of observations per age bracket of the youngest child.
df_obs["Age_Range"].value_counts()

(5.9, 10.1]    10678
(-0.1, 2.1]     8800
(2.9, 5.1]      7989
Name: Age_Range, dtype: int64

In [140]:
# Cross-check for the (-0.1, 2.1] bracket: count the observations whose youngest child is
# aged 0 to 2 directly from the data instead of re-typing counts from an earlier output.
int(df_obs["Age_Youngest_Child"].between(0, 2).sum())

8800

In [141]:
# Cross-check for the (2.9, 5.1] bracket: count the observations whose youngest child is
# aged 3 to 5 directly from the data instead of re-typing counts from an earlier output.
int(df_obs["Age_Youngest_Child"].between(3, 5).sum())

7989

In [142]:
# Cross-check for the (5.9, 10.1] bracket: count the observations whose youngest child is
# aged 6 to 10 directly from the data instead of re-typing counts from an earlier output.
int(df_obs["Age_Youngest_Child"].between(6, 10).sum())

10678

In [168]:
LABELS_AGE = ['0-2', '3-5', '6-10']
LABELS_EDUCATION = ["High", "Medium", "Low"]
LABELS_CHOICE = ["Home", "Part", "Full"]
LABELS_WORK = ["Part", "Full"]

# Age brackets of the youngest child as right-closed intervals around the integer ages
# 0-2, 3-5 and 6-10. Values that fall outside every interval are not assigned to a bracket.
BINS_AGE = pd.IntervalIndex.from_tuples([(-0.1, 2.1), (2.9, 5.1), (5.9, 10.1)])


def get_moments(df, max_period=20):
    """Return choice probabilities by period and age bracket of the youngest child.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an index level named "Period" and the columns "Age_Youngest_Child" and
        "Choice". The frame passed in is not modified.
    max_period : int, optional
        Number of periods (0, ..., max_period - 1) on the grid. The default of 20 restricts
        attention to the first 20 periods, as afterwards the cells get rather thin.

    Returns
    -------
    pandas.DataFrame
        One float column "Value" indexed by ("Period", "Age_Range", "Choice") over the full
        product of periods, BINS_AGE and LABELS_CHOICE. Each entry is the share of that
        choice among the rows of the given period and age bracket; combinations that do
        not occur in ``df`` keep the default entry 0.0.

    Notes
    -----
    Only choice probabilities are computed. The earlier draft also built a subset of
    working individuals with ``np.float``, an alias that no longer exists in current NumPy;
    the subset was never used and has been removed together with other unused locals.
    """
    conditioning = ["Period", "Age_Range", "Choice"]

    # pd.cut ignores ``labels`` when the bins are an IntervalIndex, so the brackets are
    # identified by the intervals in BINS_AGE rather than by LABELS_AGE.
    df_int = df.assign(Age_Range=pd.cut(df["Age_Youngest_Child"], BINS_AGE))

    # Full grid with a float default so that filling in shares never changes the dtype.
    index = pd.MultiIndex.from_product(
        [range(max_period), BINS_AGE, LABELS_CHOICE], names=conditioning
    )
    df_probs_grid = pd.DataFrame(data=0.0, columns=["Value"], index=index)

    # Shares of each choice within every observed (period, age bracket) cell. Unobserved
    # cells are covered by the grid default, hence observed=True.
    df_probs = (
        df_int.groupby(conditioning[:2], observed=True)["Choice"]
        .value_counts(normalize=True)
        .rename("Value")
    )
    df_probs_grid.update(df_probs)

    return df_probs_grid

# Build the choice-probability grid for the observed data.
df_probs_grid = get_moments(df_obs)

In [169]:
# Show the grid of choice probabilities returned by get_moments.
df_probs_grid

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Value
Period,Age_Range,Choice,Unnamed: 3_level_1
0,"(-0.1, 2.1]",Home,0.892157
0,"(-0.1, 2.1]",Part,0.029412
0,"(-0.1, 2.1]",Full,0.078431
0,"(2.9, 5.1]",Home,0.705882
0,"(2.9, 5.1]",Part,0.117647
0,"(2.9, 5.1]",Full,0.176471
0,"(5.9, 10.1]",Home,1.0
0,"(5.9, 10.1]",Part,0.0
0,"(5.9, 10.1]",Full,0.0
1,"(-0.1, 2.1]",Home,0.918919
