In [22]:
include("../src/SyntheticPopulation.jl")
include("utils.jl")
using .SyntheticPopulation
using DataFrames
using StatsBase



# Summary:
Sample-free IPF algorithm is not efficient for creating joint distributions of the attributes that are highly dependent.

# 1. Population with dependent variables

### 1.1. Generate target population with dependent variables
First we generate target population that we'll try to synthesize using available algorithms. The variables marital status and income are highly dependent on sex.

In [23]:
SIZE = 300000
OLD_ADULTS = 0.6
YOUNG_ADULTS = 0.2
CHILDREN = 0.2

SEX = ['M', 'F']; SEX_WEIGHTS = [0.5, 0.5]
MARITAL_STATUS = ["Not_married", "Married", "Divorced", "Widowed"]; 
MARITAL_STATUS_WEIGHTS_M = [0.1, 0.2, 0.3, 0.4]; 
MARITAL_STATUS_WEIGHTS_F = [0.4, 0.3, 0.2, 0.1];

AGE_YOUNG_ADULT = [20, 25];
AGE_YOUNG_ADULT_WEIGHTS = repeat([1 / length(AGE_YOUNG_ADULT)], length(AGE_YOUNG_ADULT));
AGE_OLD_ADULT = [30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80];
AGE_OLD_ADULT_WEIGHTS = repeat([1 / length(AGE_OLD_ADULT)], length(AGE_OLD_ADULT));
AGE_CHILDREN = [5, 10, 15];
AGE_CHILDREN_WEIGHTS = repeat([1 / length(AGE_CHILDREN)], length(AGE_CHILDREN));

INCOME = [40000, 50000, 60000, 70000, 80000];
ZERO_INCOME = [60000, 70000, 80000]
INCOME_WEIGHTS_M_YOUNG = SIZE ./ 2 .* YOUNG_ADULTS .* [0.01, 0.1, 0, 0, 0];
INCOME_WEIGHTS_F_YOUNG = SIZE ./ 2 .* YOUNG_ADULTS .* [0.39, 0.3, 0, 0, 0];
INCOME_WEIGHTS_M_OLD = SIZE ./2 * (OLD_ADULTS+YOUNG_ADULTS) .* [0.01, 0.1, 0.2, 0.3, 0.39] .- INCOME_WEIGHTS_M_YOUNG;
INCOME_WEIGHTS_F_OLD = SIZE ./2 * (OLD_ADULTS+YOUNG_ADULTS) .*[0.39, 0.3, 0.2, 0.1, 0.01] .- INCOME_WEIGHTS_F_YOUNG;

population_m_young_adult = DataFrame(
    AGE = sample(AGE_YOUNG_ADULT, Weights(AGE_YOUNG_ADULT_WEIGHTS), Int(SIZE * YOUNG_ADULTS / 2)),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_STATUS_WEIGHTS_M), Int(SIZE * YOUNG_ADULTS / 2)),
    SEX = repeat(['M'], Int(SIZE * YOUNG_ADULTS / 2)),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_M_YOUNG), Int(SIZE * YOUNG_ADULTS / 2)),
)
population_f_young_adult = DataFrame(
    AGE = sample(AGE_YOUNG_ADULT, Weights(AGE_YOUNG_ADULT_WEIGHTS), Int(SIZE * YOUNG_ADULTS / 2)),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_STATUS_WEIGHTS_F), Int(SIZE * YOUNG_ADULTS / 2)),
    SEX = repeat(['F'], Int(SIZE * YOUNG_ADULTS / 2)),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_M_YOUNG), Int(SIZE * YOUNG_ADULTS / 2)),
)
population_m_old_adult = DataFrame(
    AGE = sample(AGE_OLD_ADULT, Weights(AGE_OLD_ADULT_WEIGHTS), Int(SIZE * OLD_ADULTS / 2)),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_STATUS_WEIGHTS_M), Int(SIZE * OLD_ADULTS / 2)),
    SEX = repeat(['M'], Int(SIZE * OLD_ADULTS / 2)),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_M_OLD), Int(SIZE * OLD_ADULTS / 2)),
)
population_f_old_adult = DataFrame(
    AGE = sample(AGE_OLD_ADULT, Weights(AGE_OLD_ADULT_WEIGHTS), Int(SIZE * OLD_ADULTS / 2)),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_STATUS_WEIGHTS_F), Int(SIZE * OLD_ADULTS / 2)),
    SEX = repeat(['F'], Int(SIZE * OLD_ADULTS / 2)),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_F_OLD), Int(SIZE * OLD_ADULTS / 2)),
)
population_children = DataFrame(
    AGE = sample(AGE_CHILDREN, Weights(AGE_CHILDREN_WEIGHTS), Int(SIZE * CHILDREN)),
    MARITAL_STATUS = repeat([missing], Int(SIZE * CHILDREN)),
    SEX = sample(SEX, Weights(SEX_WEIGHTS), Int(SIZE * CHILDREN)),
    INCOME = repeat([missing], Int(SIZE * CHILDREN))
)

disaggregated_dependent_population = reduce(vcat, [
    population_m_young_adult, 
    population_f_young_adult,
    population_m_old_adult,
    population_f_old_adult,
    population_children
    ]
)

dependent_population = combine(groupby(disaggregated_dependent_population, names(disaggregated_dependent_population), sort=true), nrow)
rename!(dependent_population, :nrow => :population)
zero_population = DataFrame(vec(collect(Iterators.product(AGE_YOUNG_ADULT, MARITAL_STATUS, SEX, ZERO_INCOME))))
zero_population.:population = repeat([0], nrow(zero_population))
rename!(zero_population, names(dependent_population))
dependent_population = reduce(vcat, [dependent_population, zero_population])

Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64
1,5,missing,F,missing,10086
2,5,missing,M,missing,10066
3,10,missing,F,missing,9996
4,10,missing,M,missing,9789
5,15,missing,F,missing,9926
6,15,missing,M,missing,10137
7,20,Divorced,F,40000,249
8,20,Divorced,F,50000,2821
9,20,Divorced,M,40000,431
10,20,Divorced,M,50000,4111


### 1.2. Compute marginal attribute distributions of the dependent population
Then we compute the marginal population attributes. This is a type of data that we can usually get from the census data and will be input for our algorithms for population generation.

In [24]:
#Population by age and sex
dependent_age_sex = combine(groupby(disaggregated_dependent_population, [:AGE, :SEX], sort=true), nrow); 
sort!(dependent_age_sex, [:SEX, :AGE])

#Population by sex and marital status
dependent_sex_marital = combine(groupby(disaggregated_dependent_population, [:MARITAL_STATUS, :SEX], sort=true), nrow); 
sort!(dependent_sex_marital, [:SEX, :MARITAL_STATUS])

#Population by income
dependent_income = combine(groupby(disaggregated_dependent_population, [:INCOME], sort=true), nrow)

#Correct column names
dependent_age_sex, dependent_sex_marital, dependent_income = map(x -> rename!(x, :nrow => :population), [dependent_age_sex, dependent_sex_marital, dependent_income])

#filter out missing values
dependent_sex_marital = filter(:MARITAL_STATUS => x -> typeof(x) != Missing, dependent_sex_marital)
dependent_income = filter(:INCOME => x -> typeof(x) != Missing, dependent_income)

Row,INCOME,population
Unnamed: 0_level_1,Int64?,Int64
1,40000,38003
2,50000,85892
3,60000,40390
4,70000,38496
5,80000,37219


### 1.3. Generate dependent population from marginals
Then, we use our algorithm to estimate joint distribution of the attributes. 

#### Guo, Bhat, 2007

In [25]:
guo_bhat = SyntheticPopulation.generate_joint_distribution(dependent_age_sex, dependent_sex_marital, dependent_income)
guo_bhat = guo_bhat[:, Not(:id)]

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [150008, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149992, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [300002, 240000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/Proportio

Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String,Char,Int64,Int64
1,5,Divorced,F,40000,322
2,10,Divorced,F,40000,319
3,15,Divorced,F,40000,317
4,20,Divorced,F,40000,481
5,25,Divorced,F,40000,476
6,30,Divorced,F,40000,259
7,35,Divorced,F,40000,260
8,40,Divorced,F,40000,262
9,45,Divorced,F,40000,262
10,50,Divorced,F,40000,258


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [26]:
ponge_et_al = SyntheticPopulation.generate_joint_distribution(dependent_age_sex, dependent_sex_marital, dependent_income, config_file = "ind_ponge2021.json")
ponge_et_al = ponge_et_al[:, Not(:id)]

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [150008, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [150008, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149992, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Inconsistent target margins, c

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [300002, 240000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 18 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130


Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String,Char,Int64,Int64
1,5,Divorced,F,40000,246
2,10,Divorced,F,40000,244
3,15,Divorced,F,40000,242
4,20,Divorced,F,40000,932
5,25,Divorced,F,40000,921
6,30,Divorced,F,40000,198
7,35,Divorced,F,40000,199
8,40,Divorced,F,40000,201
9,45,Divorced,F,40000,200
10,50,Divorced,F,40000,198


#### Modified algorithm

In [27]:
modified = SyntheticPopulation.generate_joint_distribution(dependent_age_sex, dependent_sex_marital, dependent_income, config_file = "ind_modified.json")
modified = modified[:, Not(:id)]

└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Converged in 2 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Converged in 2 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [30008, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [29992, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting

Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64?,String?,Char?,Int64?,Int64
1,5,missing,F,missing,10088
2,10,missing,F,missing,9997
3,15,missing,F,missing,9928
4,20,Divorced,F,40000,932
5,25,Divorced,F,40000,921
6,30,Divorced,F,40000,178
7,35,Divorced,F,40000,179
8,40,Divorced,F,40000,180
9,45,Divorced,F,40000,180
10,50,Divorced,F,40000,177


### 1.4. Evaluation of fit of generated dependent population.
Finally, we evaluate if the generated population is correct. We use approach based on Z-score described by [Williamson, 2013] [1].


[1] Williamson, P. (2013). An evaluation of two synthetic small-area microdata simulation methodologies: Synthetic reconstruction and combinatorial optimisation. Spatial microsimulation: A reference guide for users, 19-47. https://ndl.ethernet.edu.et/bitstream/123456789/14722/1/205.pdf#page=38

#### Guo, Bhat, 2007

In [28]:
validate_table(guo_bhat, dependent_population)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.011406844106463879
Percentage of well fitting values at 0.90 confidence interval: 0.0076045627376425855

=Table statistics=
Statistic value equals: Inf
Table is not well fitting.



Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Float64
1,20,Divorced,F,40000,249,481,-14.7085
2,20,Divorced,F,50000,2821,1087,32.8019
3,20,Divorced,F,60000,0,511,-Inf
4,20,Divorced,F,70000,0,487,-Inf
5,20,Divorced,F,80000,0,471,-Inf
6,20,Divorced,M,40000,431,714,-13.6414
7,20,Divorced,M,50000,4111,1614,39.214
8,20,Divorced,M,60000,0,759,-Inf
9,20,Divorced,M,70000,0,724,-Inf
10,20,Divorced,M,80000,0,700,-Inf


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [29]:
validate_table(ponge_et_al, dependent_population)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.2623574144486692
Percentage of well fitting values at 0.90 confidence interval: 0.24904942965779467

=Table statistics=
Statistic value equals: 594442.2578293269
Table is not well fitting.



Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Real
1,20,Divorced,F,40000,249,932,-43.3013
2,20,Divorced,F,50000,2821,2105,13.5445
3,20,Divorced,F,60000,0,0,0
4,20,Divorced,F,70000,0,0,0
5,20,Divorced,F,80000,0,0,0
6,20,Divorced,M,40000,431,1384,-45.9374
7,20,Divorced,M,50000,4111,3127,15.4532
8,20,Divorced,M,60000,0,0,0
9,20,Divorced,M,70000,0,0,0
10,20,Divorced,M,80000,0,0,0


#### Modified algorithm

In [30]:
validate_table(modified, dependent_population)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.2376425855513308
Percentage of well fitting values at 0.90 confidence interval: 0.20912547528517111

=Table statistics=
Statistic value equals: 526200.9116311877
Table is not well fitting.



Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Real
1,5,missing,F,missing,10086,10088,-0.020258
2,5,missing,M,missing,10066,10067,-0.0101387
3,10,missing,F,missing,9996,9997,-0.0101729
4,10,missing,M,missing,9789,9790,-0.0102762
5,15,missing,F,missing,9926,9928,-0.020415
6,15,missing,M,missing,10137,10137,0.0
7,20,Divorced,F,40000,249,932,-43.3013
8,20,Divorced,F,50000,2821,2105,13.5445
9,20,Divorced,F,60000,0,0,0
10,20,Divorced,F,70000,0,0,0


### 1.5 Internal Validation

Once the tables have been generated, the internal validation can be performed. This is done to assess the distance between input contingency tables the the generated tables.

#### Guo, Bhat, 2007

In [31]:
guo_bhat_age_sex, guo_bhat_sex_marital, guo_bhat_income = compute_marginals(guo_bhat)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE   [0m[1m SEX  [0m[1m population [0m
     │[90m Int64 [0m[90m Char [0m[90m Int64      [0m
─────┼─────────────────────────
   1 │     5  F          10088
   2 │     5  M          10067
   3 │    10  F           9997
   4 │    10  M           9790
   5 │    15  F           9928
   6 │    15  M          10137
   7 │    20  F          15083
   8 │    20  M          14873
  ⋮  │   ⋮     ⋮        ⋮
  26 │    65  M           8247
  27 │    70  F           8239
  28 │    70  M           8053
  29 │    75  F           8167
  30 │    75  M           8148
  31 │    80  F           8178
  32 │    80  M           8136
[36m                17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX  [0m[1m population [0m
     │[90m String         [0m[90m Char [0m[90m Int64      [0m
─────┼──────────────────────────────────
   1 │ Divorced        F          30207
   2 │ Divorced        M          45492
   3

In [32]:
validate_table(guo_bhat_age_sex, dependent_age_sex)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.008775099886151695
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,10086,10088,-0.020258
2,5,M,10066,10067,-0.0101387
3,10,F,9996,9997,-0.0101729
4,10,M,9789,9790,-0.0102762
5,15,F,9926,9928,-0.020415
6,15,M,10137,10137,0.0
7,20,F,15085,15083,0.0167094
8,20,M,14875,14873,0.0168207
9,25,F,14915,14913,0.0167993
10,25,M,15125,15125,0.0


In [33]:
validate_table(guo_bhat_sex_marital, dependent_sex_marital)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0

=Table statistics=
Statistic value equals: 17703.528812267272
Table is not well fitting.



Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,24163,30207,-41.0007
2,Divorced,M,36393,45492,-51.7838
3,Married,F,36069,45088,-51.5176
4,Married,M,24197,30247,-41.0158
5,Not_married,F,47753,59699,-61.0799
6,Not_married,M,11845,14803,-27.8754
7,Widowed,F,12015,15020,-28.1278
8,Widowed,M,47565,59453,-60.8736


In [34]:
validate_table(guo_bhat_income, dependent_income)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0

=Table statistics=
Statistic value equals: 19841.805899022544
Table is not well fitting.



Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,38003,47505,-53.1299
2,50000,85892,107366,-91.4386
3,60000,40390,50491,-55.1115
4,70000,38496,48122,-53.5429
5,80000,37219,46525,-52.4774


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [35]:
ponge_et_al_age_sex, ponge_et_al_sex_marital, ponge_et_al_income = compute_marginals(ponge_et_al)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE   [0m[1m SEX  [0m[1m population [0m
     │[90m Int64 [0m[90m Char [0m[90m Int64      [0m
─────┼─────────────────────────
   1 │     5  F          10088
   2 │     5  M          10066
   3 │    10  F           9997
   4 │    10  M           9789
   5 │    15  F           9927
   6 │    15  M          10137
   7 │    20  F          15084
   8 │    20  M          14874
  ⋮  │   ⋮     ⋮        ⋮
  26 │    65  M           8246
  27 │    70  F           8242
  28 │    70  M           8048
  29 │    75  F           8166
  30 │    75  M           8146
  31 │    80  F           8179
  32 │    80  M           8136
[36m                17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX  [0m[1m population [0m
     │[90m String         [0m[90m Char [0m[90m Int64      [0m
─────┼──────────────────────────────────
   1 │ Divorced        F          30210
   2 │ Divorced        M          45488
   3

In [36]:
validate_table(ponge_et_al_age_sex, dependent_age_sex)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.00711812581680025
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,10086,10088,-0.020258
2,5,M,10066,10066,0.0
3,10,F,9996,9997,-0.0101729
4,10,M,9789,9789,0.0
5,15,F,9926,9927,-0.0102075
6,15,M,10137,10137,0.0
7,20,F,15085,15084,0.00835469
8,20,M,14875,14874,0.00841036
9,25,F,14915,14914,0.00839966
10,25,M,15125,15125,0.0


In [37]:
validate_table(ponge_et_al_sex_marital, dependent_sex_marital)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0

=Table statistics=
Statistic value equals: 17692.974117952195
Table is not well fitting.



Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,24163,30210,-41.0211
2,Divorced,M,36393,45488,-51.761
3,Married,F,36069,45089,-51.5233
4,Married,M,24197,30240,-40.9684
5,Not_married,F,47753,59695,-61.0594
6,Not_married,M,11845,14799,-27.8377
7,Widowed,F,12015,15021,-28.1371
8,Widowed,M,47565,59449,-60.8531


In [38]:
validate_table(ponge_et_al_income, dependent_income)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0

=Table statistics=
Statistic value equals: 19830.482214515705
Table is not well fitting.



Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,38003,47501,-53.1076
2,50000,85892,107363,-91.4258
3,60000,40390,50482,-55.0624
4,70000,38496,48117,-53.5151
5,80000,37219,46528,-52.4944


#### Modified

In [39]:
modified_age_sex, modified_sex_marital, modified_income = compute_marginals(modified)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE    [0m[1m SEX   [0m[1m population [0m
     │[90m Int64? [0m[90m Char? [0m[90m Int64      [0m
─────┼───────────────────────────
   1 │      5  F           10088
   2 │      5  M           10067
   3 │     10  F            9997
   4 │     10  M            9790
   5 │     15  F            9928
   6 │     15  M           10137
   7 │     20  F           15084
   8 │     20  M           14874
  ⋮  │   ⋮       ⋮        ⋮
  26 │     65  M            8246
  27 │     70  F            8241
  28 │     70  M            8050
  29 │     75  F            8167
  30 │     75  M            8148
  31 │     80  F            8179
  32 │     80  M            8138
[36m                  17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX   [0m[1m population [0m
     │[90m String?        [0m[90m Char? [0m[90m Int64      [0m
─────┼───────────────────────────────────
   1 │ Divorced        F           24166


In [40]:
validate_table(modified_age_sex, dependent_age_sex)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.0071171169063627745
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,10086,10088,-0.020258
2,5,M,10066,10067,-0.0101387
3,10,F,9996,9997,-0.0101729
4,10,M,9789,9790,-0.0102762
5,15,F,9926,9928,-0.020415
6,15,M,10137,10137,0.0
7,20,F,15085,15084,0.00835469
8,20,M,14875,14874,0.00841036
9,25,F,14915,14914,0.00839966
10,25,M,15125,15125,0.0


In [41]:
validate_table(modified_sex_marital, dependent_sex_marital)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.0018694172078463035
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,24163,24166,-0.0203511
2,Divorced,M,36393,36391,0.0113823
3,Married,F,36069,36071,-0.0114242
4,Married,M,24197,24193,0.0271179
5,Not_married,F,47753,47755,-0.010226
6,Not_married,M,11845,11843,0.0188475
7,Widowed,F,12015,12015,0.0
8,Widowed,M,47565,47565,0.0


In [42]:
validate_table(modified_income, dependent_income)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.00028824047519773564
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,38003,38005,-0.0111829
2,50000,85892,85889,0.0127743
3,60000,40390,40390,0.0
4,70000,38496,38496,0.0
5,80000,37219,37219,0.0
