In [45]:
include("../src/SyntheticPopulation.jl")
include("utils.jl")
using .SyntheticPopulation
using DataFrames
using StatsBase



# Summary:
The sample-free IPF algorithm is well suited for creating joint distributions of attributes that are independent.

# 1. Population with independent variables

### 1.1. Generate target population with independent variables
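First, we generate the target population that we will try to synthesize with the available algorithms. Within each age group, all variables are sampled independently of one another (children have no marital status or income, and young adults only receive the two lowest income levels).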

In [46]:
# Build the synthetic target population used as ground truth for the comparison.
# Young adults, old adults and children are sampled separately and then stacked;
# within each group every attribute is drawn independently of the others.
SIZE = 300000
OLD_ADULTS = 0.6
YOUNG_ADULTS = 0.2
CHILDREN = 0.2

# Head count per group. `round(Int, x)` is used instead of `Int(x)` because the
# products are floating point: `Int(x)` throws an InexactError as soon as rounding
# noise makes a product non-integral (e.g. 100 * 0.07 == 7.000000000000001).
# For the constants above the counts are exactly 60000, 180000 and 60000.
N_YOUNG_ADULTS = round(Int, SIZE * YOUNG_ADULTS)
N_OLD_ADULTS = round(Int, SIZE * OLD_ADULTS)
N_CHILDREN = round(Int, SIZE * CHILDREN)

SEX = ['M', 'F']
SEX_WEIGHTS = [0.5, 0.5]
MARITAL_STATUS = ["Not_married", "Married", "Divorced", "Widowed"]
MARITAL_WEIGHTS = [0.3, 0.5, 0.1, 0.1]

# Age categories of each group; every category of a group is equally likely.
AGE_YOUNG_ADULT = [20, 25]
AGE_YOUNG_ADULT_WEIGHTS = fill(1 / length(AGE_YOUNG_ADULT), length(AGE_YOUNG_ADULT))
AGE_OLD_ADULT = [30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]
AGE_OLD_ADULT_WEIGHTS = fill(1 / length(AGE_OLD_ADULT), length(AGE_OLD_ADULT))
AGE_CHILDREN = [5, 10, 15]
AGE_CHILDREN_WEIGHTS = fill(1 / length(AGE_CHILDREN), length(AGE_CHILDREN))

INCOME = [40000, 50000, 60000, 70000, 80000]
# Income levels that get weight 0 for young adults in INCOME_WEIGHTS_YOUNG.
ZERO_INCOME = [60000, 70000, 80000]
# Young adults are split evenly between the two lowest income levels.
INCOME_WEIGHTS_YOUNG = SIZE .* YOUNG_ADULTS .* [0.5, 0.5, 0, 0, 0]
# Old adults receive whatever is left of an even split of all adults over the
# five income levels once the young adults have been subtracted.
INCOME_WEIGHTS_OLD =
    SIZE * (OLD_ADULTS + YOUNG_ADULTS) .* [0.2, 0.2, 0.2, 0.2, 0.2] .- INCOME_WEIGHTS_YOUNG

population_young_adult = DataFrame(
    AGE = sample(AGE_YOUNG_ADULT, Weights(AGE_YOUNG_ADULT_WEIGHTS), N_YOUNG_ADULTS),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_WEIGHTS), N_YOUNG_ADULTS),
    SEX = sample(SEX, Weights(SEX_WEIGHTS), N_YOUNG_ADULTS),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_YOUNG), N_YOUNG_ADULTS),
)
population_old_adult = DataFrame(
    AGE = sample(AGE_OLD_ADULT, Weights(AGE_OLD_ADULT_WEIGHTS), N_OLD_ADULTS),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_WEIGHTS), N_OLD_ADULTS),
    SEX = sample(SEX, Weights(SEX_WEIGHTS), N_OLD_ADULTS),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_OLD), N_OLD_ADULTS),
)
# Children carry neither a marital status nor an income: both columns are `missing`.
population_children = DataFrame(
    AGE = sample(AGE_CHILDREN, Weights(AGE_CHILDREN_WEIGHTS), N_CHILDREN),
    MARITAL_STATUS = fill(missing, N_CHILDREN),
    SEX = sample(SEX, Weights(SEX_WEIGHTS), N_CHILDREN),
    INCOME = fill(missing, N_CHILDREN),
)

# One row per individual.
disaggregated_independent_population = vcat(
    population_young_adult,
    population_old_adult,
    population_children,
)

# Aggregate to one row per attribute combination with its head count.
independent_population = combine(
    groupby(
        disaggregated_independent_population,
        names(disaggregated_independent_population),
        sort=true,
    ),
    nrow => :population,
)

# The aggregation only contains combinations that were actually sampled, so the
# young-adult x ZERO_INCOME cells are appended explicitly with a count of 0.
zero_population = DataFrame(
    vec(collect(Iterators.product(AGE_YOUNG_ADULT, MARITAL_STATUS, SEX, ZERO_INCOME))),
)
zero_population.population = zeros(Int, nrow(zero_population))
rename!(zero_population, names(independent_population))
independent_population = vcat(independent_population, zero_population)

Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64
1,5,missing,F,missing,10088
2,5,missing,M,missing,10029
3,10,missing,F,missing,9964
4,10,missing,M,missing,9928
5,15,missing,F,missing,9966
6,15,missing,M,missing,10025
7,20,Divorced,F,40000,769
8,20,Divorced,F,50000,700
9,20,Divorced,M,40000,776
10,20,Divorced,M,50000,734


### 1.2. Compute marginal attribute distributions of the independent population
Then we compute the marginal distributions of the population attributes. This is the kind of data that is usually available from a census, and it serves as the input to the population-generation algorithms.

In [47]:
# Marginal tables derived from the individual-level population. Each table has
# one row per category combination and a `population` column holding the count.

# Population by age and sex, ordered by sex and then by age
independent_age_sex = combine(
    groupby(disaggregated_independent_population, [:AGE, :SEX], sort=true),
    nrow => :population,
)
sort!(independent_age_sex, [:SEX, :AGE])

# Population by sex and marital status, ordered by sex and then by marital status
independent_sex_marital = combine(
    groupby(disaggregated_independent_population, [:MARITAL_STATUS, :SEX], sort=true),
    nrow => :population,
)
sort!(independent_sex_marital, [:SEX, :MARITAL_STATUS])

# Population by income
independent_income = combine(
    groupby(disaggregated_independent_population, [:INCOME], sort=true),
    nrow => :population,
)

# Drop the rows whose category is `missing` (children have no marital status or income)
independent_sex_marital = filter(:MARITAL_STATUS => !ismissing, independent_sex_marital)
independent_income = filter(:INCOME => !ismissing, independent_income)

Row,INCOME,population
Unnamed: 0_level_1,Int64?,Int64
1,40000,48219
2,50000,47529
3,60000,48165
4,70000,48003
5,80000,48084


### 1.3. Generate independent population from marginals
Then, we use each of the algorithms to estimate the joint distribution of the attributes.

#### Guo, Bhat, 2007

In [48]:
# Estimate the joint distribution from the three marginal tables (no `config_file`
# is passed in this cell) and keep every column except `id`.
guo_bhat = select(
    SyntheticPopulation.generate_joint_distribution(
        independent_age_sex, independent_sex_marital, independent_income
    ),
    Not(:id),
)

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149670, 119652]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [150330, 120348]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [300001, 240000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/Proportio

Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String,Char,Int64,Int64
1,5,Divorced,F,40000,203
2,10,Divorced,F,40000,200
3,15,Divorced,F,40000,200
4,20,Divorced,F,40000,300
5,25,Divorced,F,40000,302
6,30,Divorced,F,40000,164
7,35,Divorced,F,40000,162
8,40,Divorced,F,40000,159
9,45,Divorced,F,40000,166
10,50,Divorced,F,40000,165


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [49]:
# Same marginal tables as input, but configured through "ind_ponge2021.json";
# the `id` column of the result is dropped.
ponge_et_al = select(
    SyntheticPopulation.generate_joint_distribution(
        independent_age_sex,
        independent_sex_marital,
        independent_income,
        config_file = "ind_ponge2021.json",
    ),
    Not(:id),
)

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149670, 119652]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149670, 119652]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [150330, 120348]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Inconsistent target margins, c

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [300001, 240000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 27 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130


Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String,Char,Int64,Int64
1,5,Divorced,F,40000,126
2,10,Divorced,F,40000,125
3,15,Divorced,F,40000,125
4,20,Divorced,F,40000,751
5,25,Divorced,F,40000,756
6,30,Divorced,F,40000,102
7,35,Divorced,F,40000,101
8,40,Divorced,F,40000,99
9,45,Divorced,F,40000,103
10,50,Divorced,F,40000,103


#### Modified algorithm

In [50]:
# Configured through "ind_modified.json". Note that the marginals are passed in a
# different order here (income first); the `id` column of the result is dropped.
modified = select(
    SyntheticPopulation.generate_joint_distribution(
        independent_income,
        independent_age_sex,
        independent_sex_marital,
        config_file = "ind_modified.json",
    ),
    Not(:id),
)

┌ Info: Converged in 46 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [240000, 60000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128


┌ Info: Converged in 2 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [120349, 120348]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [120349, 120348]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [30019, 119652]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ In

Row,INCOME,AGE,MARITAL_STATUS,SEX,population
Unnamed: 0_level_1,Int64?,Int64?,String?,Char?,Int64
1,missing,5,missing,F,10088
2,missing,10,missing,F,9963
3,missing,15,missing,F,9965
4,40000,20,Divorced,F,751
5,50000,20,Divorced,F,741
6,60000,20,Divorced,F,0
7,70000,20,Divorced,F,0
8,80000,20,Divorced,F,0
9,40000,25,Divorced,F,756
10,50000,25,Divorced,F,745


### 1.4. External validation of fit of generated independent population.
Finally, we evaluate whether the generated population is correct. We use an approach based on the Z-score, as described by [Williamson, 2013] [1].


[1] Williamson, P. (2013). An evaluation of two synthetic small-area microdata simulation methodologies: Synthetic reconstruction and combinatorial optimisation. Spatial microsimulation: A reference guide for users, 19-47. https://ndl.ethernet.edu.et/bitstream/123456789/14722/1/205.pdf#page=38

#### Guo, Bhat, 2007

In [51]:
# External validation: compare the Guo & Bhat estimate (first argument) with the
# target joint distribution; prints cell/table statistics and returns the per-cell table.
validate_table(guo_bhat, independent_population)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.0076045627376425855
Percentage of well fitting values at 0.90 confidence interval: 0.0038022813688212928

=Table statistics=
Statistic value equals: Inf
Table is not well fitting.



Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Float64
1,20,Divorced,F,40000,769,300,16.9343
2,20,Divorced,F,50000,700,295,15.3255
3,20,Divorced,F,60000,0,299,-Inf
4,20,Divorced,F,70000,0,298,-Inf
5,20,Divorced,F,80000,0,299,-Inf
6,20,Divorced,M,40000,776,298,17.1814
7,20,Divorced,M,50000,734,294,16.2606
8,20,Divorced,M,60000,0,298,-Inf
9,20,Divorced,M,70000,0,297,-Inf
10,20,Divorced,M,80000,0,297,-Inf


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [52]:
# External validation: compare the Ponge et al. estimate (first argument) with the
# target joint distribution; prints cell/table statistics and returns the per-cell table.
validate_table(ponge_et_al, independent_population)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.5741444866920152
Percentage of well fitting values at 0.90 confidence interval: 0.5038022813688213

=Table statistics=
Statistic value equals: 65428.004664156935
Table is not well fitting.



Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Real
1,20,Divorced,F,40000,769,751,0.64993
2,20,Divorced,F,50000,700,741,-1.55147
3,20,Divorced,F,60000,0,0,0
4,20,Divorced,F,70000,0,0,0
5,20,Divorced,F,80000,0,0,0
6,20,Divorced,M,40000,776,747,1.04239
7,20,Divorced,M,50000,734,737,-0.110868
8,20,Divorced,M,60000,0,0,0
9,20,Divorced,M,70000,0,0,0
10,20,Divorced,M,80000,0,0,0


#### Modified algorithm

In [53]:
# External validation of the modified algorithm against the target joint distribution.
# Unlike the two cells above, the returned per-cell table is kept in `res`.
res = validate_table(modified, independent_population)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.9695817490494296
Percentage of well fitting values at 0.90 confidence interval: 0.9315589353612167

=Table statistics=
Statistic value equals: 453.64722572037283
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Real
1,20,Divorced,F,40000,769,751,0.64993
2,20,Divorced,M,40000,776,747,1.04239
3,20,Married,F,40000,3740,3773,-0.543003
4,20,Married,M,40000,3698,3744,-0.761146
5,20,Not_married,F,40000,2260,2245,0.316723
6,20,Not_married,M,40000,2237,2234,0.0636669
7,20,Widowed,F,40000,718,751,-1.23303
8,20,Widowed,M,40000,746,750,-0.146633
9,25,Divorced,F,40000,715,756,-1.53514
10,25,Divorced,M,40000,753,764,-0.401366


### 1.5 Internal Validation

Once the tables have been generated, internal validation can be performed. This is done to assess the distance between the input contingency tables and the generated tables.

#### Guo, Bhat, 2007

In [54]:
# Collapse the Guo & Bhat joint table back into its three marginal tables
# (age x sex, marital status x sex, income). `compute_marginals` is a helper that is
# not defined in this notebook (presumably it comes from utils.jl).
guo_bhat_age_sex, guo_bhat_sex_marital, guo_bhat_income = compute_marginals(guo_bhat)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE   [0m[1m SEX  [0m[1m population [0m
     │[90m Int64 [0m[90m Char [0m[90m Int64      [0m
─────┼─────────────────────────
   1 │     5  F          10089
   2 │     5  M          10026
   3 │    10  F           9965
   4 │    10  M           9925
   5 │    15  F           9966
   6 │    15  M          10023
   7 │    20  F          14930
   8 │    20  M          14845
  ⋮  │   ⋮     ⋮        ⋮
  26 │    65  M           8296
  27 │    70  F           8255
  28 │    70  M           8147
  29 │    75  F           8276
  30 │    75  M           8260
  31 │    80  F           8151
  32 │    80  M           8279
[36m                17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX  [0m[1m population [0m
     │[90m String         [0m[90m Char [0m[90m Int64      [0m
─────┼──────────────────────────────────
   1 │ Divorced        F          14954
   2 │ Divorced        M          15021
   3

In [55]:
# Internal validation: age x sex marginal of the Guo & Bhat table vs. the input marginal.
validate_table(guo_bhat_age_sex, independent_age_sex)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.006108342375349761
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,10088,10089,-0.010128
2,5,M,10029,10026,0.0304702
3,10,F,9964,9965,-0.0101887
4,10,M,9928,9925,0.0306195
5,15,F,9966,9966,0.0
6,15,M,10025,10023,0.0203174
7,20,F,14932,14930,0.0167903
8,20,M,14846,14845,0.00841814
9,25,F,15031,15027,0.0334756
10,25,M,15191,15193,-0.0166541


In [56]:
# Internal validation: marital status x sex marginal of the Guo & Bhat table vs. the input marginal.
validate_table(guo_bhat_sex_marital, independent_sex_marital)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0

=Table statistics=
Statistic value equals: 18452.229230875575
Table is not well fitting.



Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,11955,14954,-28.1382
2,Divorced,M,12027,15021,-28.0115
3,Married,F,60032,75091,-70.9762
4,Married,M,60273,75286,-70.6651
5,Not_married,F,35714,44671,-51.3724
6,Not_married,M,35969,44926,-51.2219
7,Widowed,F,11951,14951,-28.1521
8,Widowed,M,12079,15088,-28.0944


In [57]:
# Internal validation: income marginal of the Guo & Bhat table vs. the input marginal.
validate_table(guo_bhat_income, independent_income)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0

=Table statistics=
Statistic value equals: 18742.666304999133
Table is not well fitting.



Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,48219,60280,-61.4437
2,50000,47529,59410,-60.8551
3,60000,48165,60204,-61.3573
4,70000,48003,59989,-61.1644
5,80000,48084,60105,-61.3042


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [58]:
# Collapse the Ponge et al. joint table back into its three marginal tables
# (age x sex, marital status x sex, income). `compute_marginals` is a helper that is
# not defined in this notebook (presumably it comes from utils.jl).
ponge_et_al_age_sex, ponge_et_al_sex_marital, ponge_et_al_income = compute_marginals(ponge_et_al)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE   [0m[1m SEX  [0m[1m population [0m
     │[90m Int64 [0m[90m Char [0m[90m Int64      [0m
─────┼─────────────────────────
   1 │     5  F          10086
   2 │     5  M          10029
   3 │    10  F           9965
   4 │    10  M           9926
   5 │    15  F           9966
   6 │    15  M          10025
   7 │    20  F          14932
   8 │    20  M          14846
  ⋮  │   ⋮     ⋮        ⋮
  26 │    65  M           8294
  27 │    70  F           8256
  28 │    70  M           8149
  29 │    75  F           8278
  30 │    75  M           8260
  31 │    80  F           8150
  32 │    80  M           8279
[36m                17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX  [0m[1m population [0m
     │[90m String         [0m[90m Char [0m[90m Int64      [0m
─────┼──────────────────────────────────
   1 │ Divorced        F          14954
   2 │ Divorced        M          15022
   3

In [59]:
# Internal validation: age x sex marginal of the Ponge et al. table vs. the input marginal.
validate_table(ponge_et_al_age_sex, independent_age_sex)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.0056659673041173034
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,10088,10086,0.0202561
2,5,M,10029,10029,0.0
3,10,F,9964,9965,-0.0101887
4,10,M,9928,9926,0.020413
5,15,F,9966,9966,0.0
6,15,M,10025,10025,0.0
7,20,F,14932,14932,0.0
8,20,M,14846,14846,0.0
9,25,F,15031,15030,0.00836889
10,25,M,15191,15191,0.0


In [60]:
# Internal validation: marital status x sex marginal of the Ponge et al. table vs. the input marginal.
validate_table(ponge_et_al_sex_marital, independent_sex_marital)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0

=Table statistics=
Statistic value equals: 18464.76948572168
Table is not well fitting.



Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,11955,14954,-28.1382
2,Divorced,M,12027,15022,-28.0209
3,Married,F,60032,75092,-70.9809
4,Married,M,60273,75290,-70.6839
5,Not_married,F,35714,44676,-51.4011
6,Not_married,M,35969,44934,-51.2677
7,Widowed,F,11951,14951,-28.1521
8,Widowed,M,12079,15090,-28.1131


In [61]:
# Internal validation: income marginal of the Ponge et al. table vs. the input marginal.
validate_table(ponge_et_al_income, independent_income)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0

=Table statistics=
Statistic value equals: 18755.784492158546
Table is not well fitting.



Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,48219,60271,-61.3978
2,50000,47529,59410,-60.8551
3,60000,48165,60207,-61.3726
4,70000,48003,60017,-61.3073
5,80000,48084,60104,-61.2991


#### Modified

In [62]:
# Collapse the modified algorithm's joint table back into its three marginal tables
# (age x sex, marital status x sex, income). `compute_marginals` is a helper that is
# not defined in this notebook (presumably it comes from utils.jl).
modified_age_sex, modified_sex_marital, modified_income = compute_marginals(modified)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE    [0m[1m SEX   [0m[1m population [0m
     │[90m Int64? [0m[90m Char? [0m[90m Int64      [0m
─────┼───────────────────────────
   1 │      5  F           10088
   2 │      5  M           10028
   3 │     10  F            9963
   4 │     10  M            9927
   5 │     15  F            9965
   6 │     15  M           10023
   7 │     20  F           14932
   8 │     20  M           14846
  ⋮  │   ⋮       ⋮        ⋮
  26 │     65  M            8294
  27 │     70  F            8255
  28 │     70  M            8146
  29 │     75  F            8277
  30 │     75  M            8262
  31 │     80  F            8149
  32 │     80  M            8280
[36m                  17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX   [0m[1m population [0m
     │[90m String?        [0m[90m Char? [0m[90m Int64      [0m
─────┼───────────────────────────────────
   1 │ Divorced        F           11952


In [63]:
# Internal validation: age x sex marginal of the modified algorithm's table vs. the input marginal.
validate_table(modified_age_sex, independent_age_sex)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.004042217643011466
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,10088,10088,0.0
2,5,M,10029,10028,0.0101567
3,10,F,9964,9963,0.0101887
4,10,M,9928,9927,0.0102065
5,15,F,9966,9965,0.0101877
6,15,M,10025,10023,0.0203174
7,20,F,14932,14932,0.0
8,20,M,14846,14846,0.0
9,25,F,15031,15030,0.00836889
10,25,M,15191,15191,0.0


In [64]:
# Internal validation: marital status x sex marginal of the modified algorithm's table vs. the input marginal.
validate_table(modified_sex_marital, independent_sex_marital)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.002962067521548061
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,11955,11952,0.0281476
2,Divorced,M,12027,12027,0.0
3,Married,F,60032,60028,0.0188528
4,Married,M,60273,60274,-0.00470693
5,Not_married,F,35714,35717,-0.0172063
6,Not_married,M,35969,35969,0.0
7,Widowed,F,11951,11947,0.0375361
8,Widowed,M,12079,12080,-0.0093368


In [65]:
# Internal validation: income marginal of the modified algorithm's table vs. the input marginal.
validate_table(modified_income, independent_income)

=Cell statistics=
Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0

=Table statistics=
Statistic value equals: 0.0003648975849099926
Table is well fitting at 0.9 and 0.95 confidence interval.



Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,48219,48217,0.0101888
2,50000,47529,47527,0.0102441
3,60000,48165,48166,-0.00509655
4,70000,48003,48001,0.010206
5,80000,48084,48083,0.00509976
