In [44]:
include("../src/SyntheticPopulation.jl")
include("utils.jl")
using .SyntheticPopulation
using DataFrames
using StatsBase



# Summary:
Sample-free IPF algorithm is not efficient for creating joint distributions of the attributes that are highly dependent.

# 1. Population with dependent variables

### 1.1. Generate target population with dependent variables
First we generate target population that we'll try to synthesize using available algorithms. The variables marital status and income are highly dependent on sex.

In [45]:
SIZE = 300000
OLD_ADULTS = 0.6
YOUNG_ADULTS = 0.2
CHILDREN = 0.2

SEX = ['M', 'F']; SEX_WEIGHTS = [0.5, 0.5]
MARITAL_STATUS = ["Not_married", "Married", "Divorced", "Widowed"]; 
MARITAL_STATUS_WEIGHTS_M = [0.1, 0.2, 0.3, 0.4]; 
MARITAL_STATUS_WEIGHTS_F = [0.4, 0.3, 0.2, 0.1];

AGE_YOUNG_ADULT = [20, 25];
AGE_YOUNG_ADULT_WEIGHTS = repeat([1 / length(AGE_YOUNG_ADULT)], length(AGE_YOUNG_ADULT));
AGE_OLD_ADULT = [30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80];
AGE_OLD_ADULT_WEIGHTS = repeat([1 / length(AGE_OLD_ADULT)], length(AGE_OLD_ADULT));
AGE_CHILDREN = [5, 10, 15];
AGE_CHILDREN_WEIGHTS = repeat([1 / length(AGE_CHILDREN)], length(AGE_CHILDREN));

INCOME = [40000, 50000, 60000, 70000, 80000];
ZERO_INCOME = [60000, 70000, 80000]
INCOME_WEIGHTS_M_YOUNG = SIZE ./ 2 .* YOUNG_ADULTS .* [0.01, 0.1, 0, 0, 0];
INCOME_WEIGHTS_F_YOUNG = SIZE ./ 2 .* YOUNG_ADULTS .* [0.39, 0.3, 0, 0, 0];
INCOME_WEIGHTS_M_OLD = SIZE ./2 * (OLD_ADULTS+YOUNG_ADULTS) .* [0.01, 0.1, 0.2, 0.3, 0.39] .- INCOME_WEIGHTS_M_YOUNG;
INCOME_WEIGHTS_F_OLD = SIZE ./2 * (OLD_ADULTS+YOUNG_ADULTS) .*[0.39, 0.3, 0.2, 0.1, 0.01] .- INCOME_WEIGHTS_F_YOUNG;

population_m_young_adult = DataFrame(
    AGE = sample(AGE_YOUNG_ADULT, Weights(AGE_YOUNG_ADULT_WEIGHTS), Int(SIZE * YOUNG_ADULTS / 2)),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_STATUS_WEIGHTS_M), Int(SIZE * YOUNG_ADULTS / 2)),
    SEX = repeat(['M'], Int(SIZE * YOUNG_ADULTS / 2)),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_M_YOUNG), Int(SIZE * YOUNG_ADULTS / 2)),
)
population_f_young_adult = DataFrame(
    AGE = sample(AGE_YOUNG_ADULT, Weights(AGE_YOUNG_ADULT_WEIGHTS), Int(SIZE * YOUNG_ADULTS / 2)),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_STATUS_WEIGHTS_F), Int(SIZE * YOUNG_ADULTS / 2)),
    SEX = repeat(['F'], Int(SIZE * YOUNG_ADULTS / 2)),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_M_YOUNG), Int(SIZE * YOUNG_ADULTS / 2)),
)
population_m_old_adult = DataFrame(
    AGE = sample(AGE_OLD_ADULT, Weights(AGE_OLD_ADULT_WEIGHTS), Int(SIZE * OLD_ADULTS / 2)),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_STATUS_WEIGHTS_M), Int(SIZE * OLD_ADULTS / 2)),
    SEX = repeat(['M'], Int(SIZE * OLD_ADULTS / 2)),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_M_OLD), Int(SIZE * OLD_ADULTS / 2)),
)
population_f_old_adult = DataFrame(
    AGE = sample(AGE_OLD_ADULT, Weights(AGE_OLD_ADULT_WEIGHTS), Int(SIZE * OLD_ADULTS / 2)),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_STATUS_WEIGHTS_F), Int(SIZE * OLD_ADULTS / 2)),
    SEX = repeat(['F'], Int(SIZE * OLD_ADULTS / 2)),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_F_OLD), Int(SIZE * OLD_ADULTS / 2)),
)
population_children = DataFrame(
    AGE = sample(AGE_CHILDREN, Weights(AGE_CHILDREN_WEIGHTS), Int(SIZE * CHILDREN)),
    MARITAL_STATUS = repeat([missing], Int(SIZE * CHILDREN)),
    SEX = sample(SEX, Weights(SEX_WEIGHTS), Int(SIZE * CHILDREN)),
    INCOME = repeat([missing], Int(SIZE * CHILDREN))
)

disaggregated_dependent_population = reduce(vcat, [
    population_m_young_adult, 
    population_f_young_adult,
    population_m_old_adult,
    population_f_old_adult,
    population_children
    ]
)

dependent_population = combine(groupby(disaggregated_dependent_population, names(disaggregated_dependent_population), sort=true), nrow)
rename!(dependent_population, :nrow => :population)
zero_population = DataFrame(vec(collect(Iterators.product(AGE_YOUNG_ADULT, MARITAL_STATUS, SEX, ZERO_INCOME))))
zero_population.:population = repeat([0], nrow(zero_population))
rename!(zero_population, names(dependent_population))
dependent_population = reduce(vcat, [dependent_population, zero_population])

Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64
1,5,missing,F,missing,9993
2,5,missing,M,missing,9928
3,10,missing,F,missing,9975
4,10,missing,M,missing,9947
5,15,missing,F,missing,9994
6,15,missing,M,missing,10163
7,20,Divorced,F,40000,279
8,20,Divorced,F,50000,2686
9,20,Divorced,M,40000,392
10,20,Divorced,M,50000,4067


### 1.2. Compute marginal attribute distributions of the dependent population
Then we compute the marginal population attributes. This is a type of data that we can usually get from the census data and will be input for our algorithms for population generation.

In [46]:
#Population by age and sex
dependent_age_sex = combine(groupby(disaggregated_dependent_population, [:AGE, :SEX], sort=true), nrow); 
sort!(dependent_age_sex, [:SEX, :AGE])

#Population by sex and marital status
dependent_sex_marital = combine(groupby(disaggregated_dependent_population, [:MARITAL_STATUS, :SEX], sort=true), nrow); 
sort!(dependent_sex_marital, [:SEX, :MARITAL_STATUS])

#Population by income
dependent_income = combine(groupby(disaggregated_dependent_population, [:INCOME], sort=true), nrow)

#Correct column names
dependent_age_sex, dependent_sex_marital, dependent_income = map(x -> rename!(x, :nrow => :population), [dependent_age_sex, dependent_sex_marital, dependent_income])

#filter out missing values
dependent_sex_marital = filter(:MARITAL_STATUS => x -> typeof(x) != Missing, dependent_sex_marital)
dependent_income = filter(:INCOME => x -> typeof(x) != Missing, dependent_income)

Row,INCOME,population
Unnamed: 0_level_1,Int64?,Int64
1,40000,37971
2,50000,85988
3,60000,40203
4,70000,38506
5,80000,37332


### 1.3. Generate dependent population from marginals
Then, we use our algorithm to estimate joint distribution of the attributes. 

#### Guo, Bhat, 2007

In [47]:
guo_bhat = SyntheticPopulation.generate_joint_distribution(dependent_age_sex, dependent_sex_marital, dependent_income)
guo_bhat = guo_bhat[:, Not(:id)]

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149962, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [150038, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [300000, 240000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/Proportio

Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String,Char,Int64,Int64
1,5,Divorced,F,40000,317
2,10,Divorced,F,40000,316
3,15,Divorced,F,40000,317
4,20,Divorced,F,40000,477
5,25,Divorced,F,40000,474
6,30,Divorced,F,40000,264
7,35,Divorced,F,40000,261
8,40,Divorced,F,40000,260
9,45,Divorced,F,40000,260
10,50,Divorced,F,40000,258


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [48]:
ponge_et_al = SyntheticPopulation.generate_joint_distribution(dependent_age_sex, dependent_sex_marital, dependent_income, config_file = "ind_ponge2021.json")
ponge_et_al = ponge_et_al[:, Not(:id)]

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149962, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149962, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [150038, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Inconsistent target margins, c

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [300000, 240000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 18 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130


Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String,Char,Int64,Int64
1,5,Divorced,F,40000,243
2,10,Divorced,F,40000,242
3,15,Divorced,F,40000,243
4,20,Divorced,F,40000,924
5,25,Divorced,F,40000,919
6,30,Divorced,F,40000,202
7,35,Divorced,F,40000,200
8,40,Divorced,F,40000,199
9,45,Divorced,F,40000,199
10,50,Divorced,F,40000,198


#### Modified algorithm

In [49]:
modified = SyntheticPopulation.generate_joint_distribution(dependent_age_sex, dependent_sex_marital, dependent_income, config_file = "ind_modified.json")
modified = modified[:, Not(:id)]

└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Converged in 2 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Converged in 3 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [29962, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [30038, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting

Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64?,String?,Char?,Int64?,Int64
1,5,missing,F,missing,9991
2,10,missing,F,missing,9976
3,15,missing,F,missing,9995
4,20,Divorced,F,40000,924
5,25,Divorced,F,40000,919
6,30,Divorced,F,40000,182
7,35,Divorced,F,40000,180
8,40,Divorced,F,40000,179
9,45,Divorced,F,40000,179
10,50,Divorced,F,40000,177


### 1.4. Evaluation of fit of generated dependent population.
Finally, we evaluate if the generated population is correct. We use approach based on Z-score described by [Williamson, 2013] [1].


[1] Williamson, P. (2013). An evaluation of two synthetic small-area microdata simulation methodologies: Synthetic reconstruction and combinatorial optimisation. Spatial microsimulation: A reference guide for users, 19-47. https://ndl.ethernet.edu.et/bitstream/123456789/14722/1/205.pdf#page=38

#### Guo, Bhat, 2007

In [50]:
validate_table(guo_bhat, dependent_population)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.009505703422053232
Percentage of well fitting values at 0.90 confidence interval: 0.005703422053231939

=Table statistics=
Statistic value equals: Inf
Table is not well fitting.



Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Float64
1,20,Divorced,F,40000,279,477,-11.8595
2,20,Divorced,F,50000,2686,1081,31.1082
3,20,Divorced,F,60000,0,505,-Inf
4,20,Divorced,F,70000,0,484,-Inf
5,20,Divorced,F,80000,0,469,-Inf
6,20,Divorced,M,40000,392,707,-15.9203
7,20,Divorced,M,50000,4067,1600,38.949
8,20,Divorced,M,60000,0,748,-Inf
9,20,Divorced,M,70000,0,717,-Inf
10,20,Divorced,M,80000,0,695,-Inf


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [51]:
validate_table(ponge_et_al, dependent_population)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.2585551330798479
Percentage of well fitting values at 0.90 confidence interval: 0.24524714828897337

=Table statistics=
Statistic value equals: 599591.3448394606
Table is not well fitting.



Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Real
1,20,Divorced,F,40000,279,924,-38.6331
2,20,Divorced,F,50000,2686,2092,11.5129
3,20,Divorced,F,60000,0,0,0
4,20,Divorced,F,70000,0,0,0
5,20,Divorced,F,80000,0,0,0
6,20,Divorced,M,40000,392,1368,-49.3277
7,20,Divorced,M,50000,4067,3099,15.2828
8,20,Divorced,M,60000,0,0,0
9,20,Divorced,M,70000,0,0,0
10,20,Divorced,M,80000,0,0,0


#### Modified algorithm

In [71]:
validate_table(modified, dependent_population)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.2509505703422053


Percentage of well fitting values at 0.90 confidence interval: 0.21673003802281368

=Table statistics=
Statistic value equals: 530687.9161854419
Table is not well fitting.



Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Real
1,5,missing,F,missing,9993,9991,0.0203488
2,5,missing,M,missing,9928,9928,0.0
3,10,missing,F,missing,9975,9976,-0.0101833
4,10,missing,M,missing,9947,9947,0.0
5,15,missing,F,missing,9994,9995,-0.0101739
6,15,missing,M,missing,10163,10161,0.0201838
7,20,Divorced,F,40000,279,924,-38.6331
8,20,Divorced,F,50000,2686,2092,11.5129
9,20,Divorced,F,60000,0,0,0
10,20,Divorced,F,70000,0,0,0


### 1.5 Internal Validation

Once the tables have been generated, the internal validation can be performed. This is done to assess the distance between input contingency tables the the generated tables.

#### Guo, Bhat, 2007

In [72]:
guo_bhat_age_sex, guo_bhat_sex_marital, guo_bhat_income = compute_marginals(guo_bhat)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE   [0m[1m SEX  [0m[1m population [0m
     │[90m Int64 [0m[90m Char [0m[90m Int64      [0m
─────┼─────────────────────────
   1 │     5  F           9991
   2 │     5  M           9928
   3 │    10  F           9976
   4 │    10  M           9947
   5 │    15  F           9995
   6 │    15  M          10161
   7 │    20  F          15043
   8 │    20  M          14960
  ⋮  │   ⋮     ⋮        ⋮
  26 │    65  M           8003
  27 │    70  F           8204
  28 │    70  M           8139
  29 │    75  F           8225
  30 │    75  M           8336
  31 │    80  F           8140
  32 │    80  M           8148
[36m                17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX  [0m[1m population [0m
     │[90m String         [0m[90m Char [0m[90m Int64      [0m
─────┼──────────────────────────────────
   1 │ Divorced        F          30070
   2 │ Divorced        M          44806
   3

In [73]:
validate_table(guo_bhat_age_sex, dependent_age_sex)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.004708713777727473
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,9993,9991,0.0203488
2,5,M,9928,9928,0.0
3,10,F,9975,9976,-0.0101833
4,10,M,9947,9947,0.0
5,15,F,9994,9995,-0.0101739
6,15,M,10163,10161,0.0201838
7,20,F,15042,15043,-0.00836599
8,20,M,14959,14960,-0.00838795
9,25,F,14958,14959,-0.00838821
10,25,M,15041,15040,0.00836625


In [74]:
validate_table(guo_bhat_sex_marital, dependent_sex_marital)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0

=Table statistics=
Statistic value equals: 17702.748461128518
Table is not well fitting.



Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,24058,30070,-40.8626
2,Divorced,M,35836,44806,-51.3746
3,Married,F,36245,45298,-51.6083
4,Married,M,23977,29980,-40.8627
5,Not_married,F,47553,59425,-60.7974
6,Not_married,M,11954,14946,-28.0737
7,Widowed,F,12144,15172,-28.2001
8,Widowed,M,48233,60304,-61.4879


In [75]:
validate_table(guo_bhat_income, dependent_income)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0

=Table statistics=
Statistic value equals: 19842.866560258422
Table is not well fitting.



Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,37971,47459,-53.0698
2,50000,85988,107487,-91.5225
3,60000,40203,50255,-54.9458
4,70000,38506,48134,-53.5484
5,80000,37332,46666,-52.5703


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [76]:
ponge_et_al_age_sex, ponge_et_al_sex_marital, ponge_et_al_income = compute_marginals(ponge_et_al)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE   [0m[1m SEX  [0m[1m population [0m
     │[90m Int64 [0m[90m Char [0m[90m Int64      [0m
─────┼─────────────────────────
   1 │     5  F           9993
   2 │     5  M           9927
   3 │    10  F           9975
   4 │    10  M           9947
   5 │    15  F           9996
   6 │    15  M          10163
   7 │    20  F          15042
   8 │    20  M          14959
  ⋮  │   ⋮     ⋮        ⋮
  26 │    65  M           8004
  27 │    70  F           8203
  28 │    70  M           8137
  29 │    75  F           8225
  30 │    75  M           8339
  31 │    80  F           8140
  32 │    80  M           8150
[36m                17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX  [0m[1m population [0m
     │[90m String         [0m[90m Char [0m[90m Int64      [0m
─────┼──────────────────────────────────
   1 │ Divorced        F          30064
   2 │ Divorced        M          44810
   3

In [77]:
validate_table(ponge_et_al_age_sex, dependent_age_sex)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.004703957214813783
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,9993,9993,0.0
2,5,M,9928,9927,0.0102065
3,10,F,9975,9975,0.0
4,10,M,9947,9947,0.0
5,15,F,9994,9996,-0.0203478
6,15,M,10163,10163,0.0
7,20,F,15042,15042,0.0
8,20,M,14959,14959,0.0
9,25,F,14958,14958,0.0
10,25,M,15041,15041,0.0


In [78]:
validate_table(ponge_et_al_sex_marital, dependent_sex_marital)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0

=Table statistics=
Statistic value equals: 17705.475106408994
Table is not well fitting.



Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,24058,30064,-40.8218
2,Divorced,M,35836,44810,-51.3975
3,Married,F,36245,45293,-51.5798
4,Married,M,23977,29974,-40.8218
5,Not_married,F,47553,59424,-60.7923
6,Not_married,M,11954,14951,-28.1206
7,Widowed,F,12144,15180,-28.2746
8,Widowed,M,48233,60310,-61.5185


In [79]:
validate_table(ponge_et_al_income, dependent_income)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0

=Table statistics=
Statistic value equals: 19845.437208700387
Table is not well fitting.



Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,37971,47466,-53.109
2,50000,85988,107485,-91.5139
3,60000,40203,50252,-54.9294
4,70000,38506,48133,-53.5428
5,80000,37332,46670,-52.5928


#### Modified

In [80]:
modified_age_sex, modified_sex_marital, modified_income = compute_marginals(modified)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE    [0m[1m SEX   [0m[1m population [0m
     │[90m Int64? [0m[90m Char? [0m[90m Int64      [0m
─────┼───────────────────────────
   1 │      5  F            9991
   2 │      5  M            9928
   3 │     10  F            9976
   4 │     10  M            9947
   5 │     15  F            9995
   6 │     15  M           10161
   7 │     20  F           15042
   8 │     20  M           14959
  ⋮  │   ⋮       ⋮        ⋮
  26 │     65  M            8000
  27 │     70  F            8202
  28 │     70  M            8137
  29 │     75  F            8223
  30 │     75  M            8338
  31 │     80  F            8141
  32 │     80  M            8148
[36m                  17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX   [0m[1m population [0m
     │[90m String?        [0m[90m Char? [0m[90m Int64      [0m
─────┼───────────────────────────────────
   1 │ Divorced        F           24057


In [81]:
validate_table(modified_age_sex, dependent_age_sex)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.005159468202086297
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,9993,9991,0.0203488
2,5,M,9928,9928,0.0
3,10,F,9975,9976,-0.0101833
4,10,M,9947,9947,0.0
5,15,F,9994,9995,-0.0101739
6,15,M,10163,10161,0.0201838
7,20,F,15042,15042,0.0
8,20,M,14959,14959,0.0
9,25,F,14958,14958,0.0
10,25,M,15041,15041,0.0


In [82]:
validate_table(modified_sex_marital, dependent_sex_marital)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.0006965978691037966
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,24058,24057,0.00679684
2,Divorced,M,35836,35837,-0.00572738
3,Married,F,36245,36244,0.00570069
4,Married,M,23977,23978,-0.00680704
5,Not_married,F,47553,47553,0.0
6,Not_married,M,11954,11953,0.00938291
7,Widowed,F,12144,12146,-0.0186262
8,Widowed,M,48233,48231,0.0101877


In [83]:
validate_table(modified_income, dependent_income)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.0015920924348500958
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,37971,37974,-0.0167801
2,50000,85988,85987,0.00425706
3,60000,40203,40205,-0.0109323
4,70000,38506,38507,-0.00556174
5,80000,37332,37326,0.0337928
