In [29]:
include("../src/SyntheticPopulation.jl")
include("utils.jl")
using .SyntheticPopulation
using DataFrames
using StatsBase



# Summary:
Sample-free IPF algorithm is not efficient for creating joint distributions of the attributes that are highly dependent.

# 1. Population with dependent variables

### 1.1. Generate target population with dependent variables
First we generate target population that we'll try to synthesize using available algorithms. The variables marital status and income are highly dependent on sex.

In [30]:
SIZE = 300000
OLD_ADULTS = 0.6
YOUNG_ADULTS = 0.2
CHILDREN = 0.2

SEX = ['M', 'F']; SEX_WEIGHTS = [0.5, 0.5]
MARITAL_STATUS = ["Not_married", "Married", "Divorced", "Widowed"]; 
MARITAL_STATUS_WEIGHTS_M = [0.1, 0.2, 0.3, 0.4]; 
MARITAL_STATUS_WEIGHTS_F = [0.4, 0.3, 0.2, 0.1];

AGE_YOUNG_ADULT = [20, 25];
AGE_YOUNG_ADULT_WEIGHTS = repeat([1 / length(AGE_YOUNG_ADULT)], length(AGE_YOUNG_ADULT));
AGE_OLD_ADULT = [30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80];
AGE_OLD_ADULT_WEIGHTS = repeat([1 / length(AGE_OLD_ADULT)], length(AGE_OLD_ADULT));
AGE_CHILDREN = [5, 10, 15];
AGE_CHILDREN_WEIGHTS = repeat([1 / length(AGE_CHILDREN)], length(AGE_CHILDREN));

INCOME = [40000, 50000, 60000, 70000, 80000];
ZERO_INCOME = [60000, 70000, 80000]
INCOME_WEIGHTS_M_YOUNG = SIZE ./ 2 .* YOUNG_ADULTS .* [0.01, 0.1, 0, 0, 0];
INCOME_WEIGHTS_F_YOUNG = SIZE ./ 2 .* YOUNG_ADULTS .* [0.39, 0.3, 0, 0, 0];
INCOME_WEIGHTS_M_OLD = SIZE ./2 * (OLD_ADULTS+YOUNG_ADULTS) .* [0.01, 0.1, 0.2, 0.3, 0.39] .- INCOME_WEIGHTS_M_YOUNG;
INCOME_WEIGHTS_F_OLD = SIZE ./2 * (OLD_ADULTS+YOUNG_ADULTS) .*[0.39, 0.3, 0.2, 0.1, 0.01] .- INCOME_WEIGHTS_F_YOUNG;

population_m_young_adult = DataFrame(
    AGE = sample(AGE_YOUNG_ADULT, Weights(AGE_YOUNG_ADULT_WEIGHTS), Int(SIZE * YOUNG_ADULTS / 2)),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_STATUS_WEIGHTS_M), Int(SIZE * YOUNG_ADULTS / 2)),
    SEX = repeat(['M'], Int(SIZE * YOUNG_ADULTS / 2)),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_M_YOUNG), Int(SIZE * YOUNG_ADULTS / 2)),
)
population_f_young_adult = DataFrame(
    AGE = sample(AGE_YOUNG_ADULT, Weights(AGE_YOUNG_ADULT_WEIGHTS), Int(SIZE * YOUNG_ADULTS / 2)),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_STATUS_WEIGHTS_F), Int(SIZE * YOUNG_ADULTS / 2)),
    SEX = repeat(['F'], Int(SIZE * YOUNG_ADULTS / 2)),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_M_YOUNG), Int(SIZE * YOUNG_ADULTS / 2)),
)
population_m_old_adult = DataFrame(
    AGE = sample(AGE_OLD_ADULT, Weights(AGE_OLD_ADULT_WEIGHTS), Int(SIZE * OLD_ADULTS / 2)),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_STATUS_WEIGHTS_M), Int(SIZE * OLD_ADULTS / 2)),
    SEX = repeat(['M'], Int(SIZE * OLD_ADULTS / 2)),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_M_OLD), Int(SIZE * OLD_ADULTS / 2)),
)
population_f_old_adult = DataFrame(
    AGE = sample(AGE_OLD_ADULT, Weights(AGE_OLD_ADULT_WEIGHTS), Int(SIZE * OLD_ADULTS / 2)),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_STATUS_WEIGHTS_F), Int(SIZE * OLD_ADULTS / 2)),
    SEX = repeat(['F'], Int(SIZE * OLD_ADULTS / 2)),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_F_OLD), Int(SIZE * OLD_ADULTS / 2)),
)
population_children = DataFrame(
    AGE = sample(AGE_CHILDREN, Weights(AGE_CHILDREN_WEIGHTS), Int(SIZE * CHILDREN)),
    MARITAL_STATUS = repeat([missing], Int(SIZE * CHILDREN)),
    SEX = sample(SEX, Weights(SEX_WEIGHTS), Int(SIZE * CHILDREN)),
    INCOME = repeat([missing], Int(SIZE * CHILDREN))
)

disaggregated_dependent_population = reduce(vcat, [
    population_m_young_adult, 
    population_f_young_adult,
    population_m_old_adult,
    population_f_old_adult,
    population_children
    ]
)

dependent_population = combine(groupby(disaggregated_dependent_population, names(disaggregated_dependent_population), sort=true), nrow)
rename!(dependent_population, :nrow => :population)
zero_population = DataFrame(vec(collect(Iterators.product(AGE_YOUNG_ADULT, MARITAL_STATUS, SEX, ZERO_INCOME))))
zero_population.:population = repeat([0], nrow(zero_population))
rename!(zero_population, names(dependent_population))
dependent_population = reduce(vcat, [dependent_population, zero_population])

Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64
1,5,missing,F,missing,10015
2,5,missing,M,missing,9996
3,10,missing,F,missing,9982
4,10,missing,M,missing,9990
5,15,missing,F,missing,9966
6,15,missing,M,missing,10051
7,20,Divorced,F,40000,280
8,20,Divorced,F,50000,2732
9,20,Divorced,M,40000,448
10,20,Divorced,M,50000,4020


### 1.2. Compute marginal attribute distributions of the dependent population
Then we compute the marginal population attributes. This is a type of data that we can usually get from the census data and will be input for our algorithms for population generation.

In [31]:
#Population by age and sex
dependent_age_sex = combine(groupby(disaggregated_dependent_population, [:AGE, :SEX], sort=true), nrow); 
sort!(dependent_age_sex, [:SEX, :AGE])

#Population by sex and marital status
dependent_sex_marital = combine(groupby(disaggregated_dependent_population, [:MARITAL_STATUS, :SEX], sort=true), nrow); 
sort!(dependent_sex_marital, [:SEX, :MARITAL_STATUS])

#Population by income
dependent_income = combine(groupby(disaggregated_dependent_population, [:INCOME], sort=true), nrow)

#Correct column names
dependent_age_sex, dependent_sex_marital, dependent_income = map(x -> rename!(x, :nrow => :population), [dependent_age_sex, dependent_sex_marital, dependent_income])

#filter out missing values
dependent_sex_marital = filter(:MARITAL_STATUS => x -> typeof(x) != Missing, dependent_sex_marital)
dependent_income = filter(:INCOME => x -> typeof(x) != Missing, dependent_income)

Row,INCOME,population
Unnamed: 0_level_1,Int64?,Int64
1,40000,37934
2,50000,85901
3,60000,40050
4,70000,38857
5,80000,37258


### 1.3. Generate dependent population from marginals
Then, we use our algorithm to estimate joint distribution of the attributes. 

#### Guo, Bhat, 2007

In [32]:
guo_bhat = SyntheticPopulation.generate_joint_distribution(dependent_age_sex, dependent_sex_marital, dependent_income)
guo_bhat = guo_bhat[:, Not(:id)]

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149963, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [150037, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [299991, 240000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/Proportio

Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String,Char,Int64,Int64
1,5,Divorced,F,40000,315
2,10,Divorced,F,40000,314
3,15,Divorced,F,40000,313
4,20,Divorced,F,40000,476
5,25,Divorced,F,40000,467
6,30,Divorced,F,40000,251
7,35,Divorced,F,40000,261
8,40,Divorced,F,40000,255
9,45,Divorced,F,40000,262
10,50,Divorced,F,40000,260


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [33]:
ponge_et_al = SyntheticPopulation.generate_joint_distribution(dependent_age_sex, dependent_sex_marital, dependent_income, config_file = "ind_ponge2021.json")
ponge_et_al = ponge_et_al[:, Not(:id)]

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149963, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149963, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [150037, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Inconsistent target margins, c

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [299991, 240000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 18 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130


Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String,Char,Int64,Int64
1,5,Divorced,F,40000,241
2,10,Divorced,F,40000,240
3,15,Divorced,F,40000,240
4,20,Divorced,F,40000,922
5,25,Divorced,F,40000,905
6,30,Divorced,F,40000,192
7,35,Divorced,F,40000,200
8,40,Divorced,F,40000,195
9,45,Divorced,F,40000,201
10,50,Divorced,F,40000,199


#### Modified algorithm

In [34]:
modified = SyntheticPopulation.generate_joint_distribution(dependent_age_sex, dependent_sex_marital, dependent_income, config_file = "ind_modified.json")
modified = modified[:, Not(:id)]

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149963, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149963, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [150037, 120000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Inconsistent target margins, c

Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64?,String?,Char?,Int64?,Int64
1,5,missing,F,missing,10013
2,10,missing,F,missing,9983
3,15,missing,F,missing,9966
4,20,Divorced,F,40000,922
5,25,Divorced,F,40000,905
6,30,Divorced,F,40000,192
7,35,Divorced,F,40000,200
8,40,Divorced,F,40000,195
9,45,Divorced,F,40000,201
10,50,Divorced,F,40000,199


### 1.4. Evaluation of fit of generated dependent population.
Finally, we evaluate if the generated population is correct. We use approach based on Z-score described by [Williamson, 2013] [1].


[1] Williamson, P. (2013). An evaluation of two synthetic small-area microdata simulation methodologies: Synthetic reconstruction and combinatorial optimisation. Spatial microsimulation: A reference guide for users, 19-47. https://ndl.ethernet.edu.et/bitstream/123456789/14722/1/205.pdf#page=38

#### Guo, Bhat, 2007

In [35]:
validate_table(guo_bhat, dependent_population)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.009505703422053232
Percentage of well fitting values at 0.90 confidence interval: 0.0038022813688212928


=Table statistics=

Statistic value equals: Inf
Table is not well fitting.


Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Float64
1,20,Divorced,F,40000,280,476,-11.7187
2,20,Divorced,F,50000,2732,1078,31.7894
3,20,Divorced,F,60000,0,502,-Inf
4,20,Divorced,F,70000,0,487,-Inf
5,20,Divorced,F,80000,0,467,-Inf
6,20,Divorced,M,40000,448,717,-12.7186
7,20,Divorced,M,50000,4020,1624,38.0455
8,20,Divorced,M,60000,0,757,-Inf
9,20,Divorced,M,70000,0,735,-Inf
10,20,Divorced,M,80000,0,704,-Inf


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [36]:
validate_table(ponge_et_al, dependent_population)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.2623574144486692
Percentage of well fitting values at 0.90 confidence interval: 0.2509505703422053


=Table statistics=

Statistic value equals: 602527.1362041465
Table is not well fitting.


Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Real
1,20,Divorced,F,40000,280,922,-38.3848
2,20,Divorced,F,50000,2732,2089,12.3583
3,20,Divorced,F,60000,0,0,0
4,20,Divorced,F,70000,0,0,0
5,20,Divorced,F,80000,0,0,0
6,20,Divorced,M,40000,448,1390,-44.5386
7,20,Divorced,M,50000,4020,3148,13.8463
8,20,Divorced,M,60000,0,0,0
9,20,Divorced,M,70000,0,0,0
10,20,Divorced,M,80000,0,0,0


#### Modified algorithm

In [37]:
validate_table(modified, dependent_population)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.2737642585551331
Percentage of well fitting values at 0.90 confidence interval: 0.2623574144486692


=Table statistics=

Statistic value equals: 540458.1550910624
Table is not well fitting.


Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Real
1,5,missing,F,missing,10015,10013,0.0203272
2,5,missing,M,missing,9996,9996,0.0
3,10,missing,F,missing,9982,9983,-0.0101798
4,10,missing,M,missing,9990,9989,0.0101759
5,15,missing,F,missing,9966,9966,0.0
6,15,missing,M,missing,10051,10050,0.010146
7,20,Divorced,F,40000,280,922,-38.3848
8,20,Divorced,F,50000,2732,2089,12.3583
9,20,Divorced,F,60000,0,0,0
10,20,Divorced,F,70000,0,0,0


### 1.5 Internal Validation

Once the tables have been generated, the internal validation can be performed. This is done to assess the distance between input contingency tables the the generated tables.

#### Guo, Bhat, 2007

In [38]:
guo_bhat_age_sex, guo_bhat_sex_marital, guo_bhat_income = compute_marginals(guo_bhat)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE   [0m[1m SEX  [0m[1m population [0m
     │[90m Int64 [0m[90m Char [0m[90m Int64      [0m
─────┼─────────────────────────
   1 │     5  F          10017
   2 │     5  M           9996
   3 │    10  F           9981
   4 │    10  M           9990
   5 │    15  F           9967
   6 │    15  M          10050
   7 │    20  F          15145
   8 │    20  M          15188
  ⋮  │   ⋮     ⋮        ⋮
  26 │    65  M           8249
  27 │    70  F           8264
  28 │    70  M           8269
  29 │    75  F           8068
  30 │    75  M           7975
  31 │    80  F           8191
  32 │    80  M           8147
[36m                17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX  [0m[1m population [0m
     │[90m String         [0m[90m Char [0m[90m Int64      [0m
─────┼──────────────────────────────────
   1 │ Divorced        F          29811
   2 │ Divorced        M          44836
   3

In [50]:
validate_table(guo_bhat_age_sex, dependent_age_sex)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0


=Table statistics=

Statistic value equals: 0.00600498821344193
Table is well fitting at 0.9 and 0.95 confidence interval.


Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,10015,10017,-0.0203272
2,5,M,9996,9996,0.0
3,10,F,9982,9981,0.0101798
4,10,M,9990,9990,0.0
5,15,F,9966,9967,-0.0101877
6,15,M,10051,10050,0.010146
7,20,F,15146,15145,0.00833874
8,20,M,15187,15188,-0.00832808
9,25,F,14854,14852,0.016832
10,25,M,14813,14814,-0.00842702


In [51]:
validate_table(guo_bhat_sex_marital, dependent_sex_marital)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0


=Table statistics=

Statistic value equals: 17713.023332168304
Table is not well fitting.


Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,23858,29811,-40.612
2,Divorced,M,35860,44836,-51.3948
3,Married,F,35944,44918,-51.3338
4,Married,M,24062,30079,-40.8936
5,Not_married,F,48160,60188,-61.3036
6,Not_married,M,11983,14990,-28.1821
7,Widowed,F,12038,15048,-28.149
8,Widowed,M,48095,60136,-61.4009


In [52]:
validate_table(guo_bhat_income, dependent_income)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0


=Table statistics=

Statistic value equals: 19839.17108994752
Table is not well fitting.


Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,37934,47423,-53.0964
2,50000,85901,107372,-91.4237
3,60000,40050,50071,-54.8599
4,70000,38857,48569,-53.818
5,80000,37258,46571,-52.4945


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [42]:
ponge_et_al_age_sex, ponge_et_al_sex_marital, ponge_et_al_income = compute_marginals(ponge_et_al)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE   [0m[1m SEX  [0m[1m population [0m
     │[90m Int64 [0m[90m Char [0m[90m Int64      [0m
─────┼─────────────────────────
   1 │     5  F          10013
   2 │     5  M           9996
   3 │    10  F           9983
   4 │    10  M           9989
   5 │    15  F           9966
   6 │    15  M          10050
   7 │    20  F          15146
   8 │    20  M          15187
  ⋮  │   ⋮     ⋮        ⋮
  26 │    65  M           8250
  27 │    70  F           8265
  28 │    70  M           8269
  29 │    75  F           8065
  30 │    75  M           7973
  31 │    80  F           8190
  32 │    80  M           8147
[36m                17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX  [0m[1m population [0m
     │[90m String         [0m[90m Char [0m[90m Int64      [0m
─────┼──────────────────────────────────
   1 │ Divorced        F          29811
   2 │ Divorced        M          44829
   3

In [53]:
validate_table(ponge_et_al_age_sex, dependent_age_sex)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0


=Table statistics=

Statistic value equals: 0.0069684839695416684
Table is well fitting at 0.9 and 0.95 confidence interval.


Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,10015,10013,0.0203272
2,5,M,9996,9996,0.0
3,10,F,9982,9983,-0.0101798
4,10,M,9990,9989,0.0101759
5,15,F,9966,9966,0.0
6,15,M,10051,10050,0.010146
7,20,F,15146,15146,0.0
8,20,M,15187,15187,0.0
9,25,F,14854,14853,0.00841599
10,25,M,14813,14813,0.0


In [54]:
validate_table(ponge_et_al_sex_marital, dependent_sex_marital)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0


=Table statistics=

Statistic value equals: 17701.68517164271
Table is not well fitting.


Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,23858,29811,-40.612
2,Divorced,M,35860,44829,-51.3547
3,Married,F,35944,44920,-51.3453
4,Married,M,24062,30080,-40.9004
5,Not_married,F,48160,60185,-61.2883
6,Not_married,M,11983,14980,-28.0883
7,Widowed,F,12038,15041,-28.0836
8,Widowed,M,48095,60139,-61.4162


In [55]:
validate_table(ponge_et_al_income, dependent_income)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0


=Table statistics=

Statistic value equals: 19826.273896930572
Table is not well fitting.


Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,37934,47416,-53.0572
2,50000,85901,107370,-91.4152
3,60000,40050,50061,-54.8051
4,70000,38857,48570,-53.8235
5,80000,37258,46568,-52.4776


#### Modified

In [46]:
modified_age_sex, modified_sex_marital, modified_income = compute_marginals(modified)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE    [0m[1m SEX   [0m[1m population [0m
     │[90m Int64? [0m[90m Char? [0m[90m Int64      [0m
─────┼───────────────────────────
   1 │      5  F           10013
   2 │      5  M            9996
   3 │     10  F            9983
   4 │     10  M            9989
   5 │     15  F            9966
   6 │     15  M           10050
   7 │     20  F           15146
   8 │     20  M           15187
  ⋮  │   ⋮       ⋮        ⋮
  26 │     65  M            8250
  27 │     70  F            8265
  28 │     70  M            8269
  29 │     75  F            8065
  30 │     75  M            7973
  31 │     80  F            8190
  32 │     80  M            8147
[36m                  17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX   [0m[1m population [0m
     │[90m String?        [0m[90m Char? [0m[90m Int64      [0m
─────┼───────────────────────────────────
   1 │ Divorced        F           23855


In [56]:
validate_table(modified_age_sex, dependent_age_sex)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0


=Table statistics=

Statistic value equals: 0.0069684839695416684
Table is well fitting at 0.9 and 0.95 confidence interval.


Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,10015,10013,0.0203272
2,5,M,9996,9996,0.0
3,10,F,9982,9983,-0.0101798
4,10,M,9990,9989,0.0101759
5,15,F,9966,9966,0.0
6,15,M,10051,10050,0.010146
7,20,F,15146,15146,0.0
8,20,M,15187,15187,0.0
9,25,F,14854,14853,0.00841599
10,25,M,14813,14813,0.0


In [57]:
validate_table(modified_sex_marital, dependent_sex_marital)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0


=Table statistics=

Statistic value equals: 0.004871666395114081
Table is well fitting at 0.9 and 0.95 confidence interval.


Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,23858,23855,0.0204663
2,Divorced,M,35860,35853,0.0400806
3,Married,F,35944,35945,-0.00572029
4,Married,M,24062,24058,0.0271854
5,Not_married,F,48160,48160,0.0
6,Not_married,M,11983,11981,0.0187443
7,Widowed,F,12038,12035,0.0280555
8,Widowed,M,48095,48101,-0.0305959


In [58]:
validate_table(modified_income, dependent_income)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0


=Table statistics=

Statistic value equals: 1155.939990205556
Table is not well fitting.


Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,37934,40158,-12.4446
2,50000,85901,90932,-21.422
3,60000,40050,37545,13.7136
4,70000,38857,36427,13.4656
5,80000,37258,34926,13.1448
