In [231]:
include("../src/SyntheticPopulation.jl")
include("utils.jl")
using .SyntheticPopulation
using DataFrames
using StatsBase



# Summary:
The sample-free IPF algorithm is well suited to creating joint distributions of attributes that are independent.

# 1. Population with independent variables

### 1.1. Generate target population with independent variables
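First, we generate the target population that we will try to synthesize using the available algorithms. All variables are sampled independently of one another within each age group (children, young adults, old adults); only the income weights and the missing `MARITAL_STATUS`/`INCOME` values for children differ between groups.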

In [232]:
# Total head-count of the target population and the share of each age group.
SIZE = 300000
OLD_ADULTS = 0.6
YOUNG_ADULTS = 0.2
CHILDREN = 0.2

# Categorical attributes together with their sampling weights.
SEX = ['M', 'F']
SEX_WEIGHTS = [0.5, 0.5]
MARITAL_STATUS = ["Not_married", "Married", "Divorced", "Widowed"]
MARITAL_WEIGHTS = [0.3, 0.5, 0.1, 0.1]

# Age values per group; every age inside a group gets the same weight.
AGE_YOUNG_ADULT = [20, 25]
AGE_YOUNG_ADULT_WEIGHTS = fill(1 / length(AGE_YOUNG_ADULT), length(AGE_YOUNG_ADULT))
AGE_OLD_ADULT = [30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]
AGE_OLD_ADULT_WEIGHTS = fill(1 / length(AGE_OLD_ADULT), length(AGE_OLD_ADULT))
AGE_CHILDREN = [5, 10, 15]
AGE_CHILDREN_WEIGHTS = fill(1 / length(AGE_CHILDREN), length(AGE_CHILDREN))

# Income levels. ZERO_INCOME lists the levels that young adults never receive
# (their weight in INCOME_WEIGHTS_YOUNG is 0).
INCOME = [40000, 50000, 60000, 70000, 80000]
ZERO_INCOME = [60000, 70000, 80000]

# Young adults are split evenly between the two lowest income levels.
INCOME_WEIGHTS_YOUNG = SIZE .* YOUNG_ADULTS .* [0.5, 0.5, 0, 0, 0]
# Old adults receive whatever is left of a uniform adult income distribution
# once the young adults' counts have been subtracted.
INCOME_WEIGHTS_OLD = SIZE * (OLD_ADULTS + YOUNG_ADULTS) .* fill(0.2, 5) .- INCOME_WEIGHTS_YOUNG

# Head-count of each age group. `round(Int, x)` is used instead of `Int(x)`:
# `Int(x)` throws an `InexactError` whenever the Float64 product is not exactly
# integral (e.g. 100 * 0.29), whereas rounding yields the intended count.
# When the product is already integral the result is the same as `Int(x)`.
n_young_adults = round(Int, SIZE * YOUNG_ADULTS)
n_old_adults = round(Int, SIZE * OLD_ADULTS)
n_children = round(Int, SIZE * CHILDREN)

# Young adults: every attribute is drawn independently with its own weights.
population_young_adult = DataFrame(
    AGE = sample(AGE_YOUNG_ADULT, Weights(AGE_YOUNG_ADULT_WEIGHTS), n_young_adults),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_WEIGHTS), n_young_adults),
    SEX = sample(SEX, Weights(SEX_WEIGHTS), n_young_adults),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_YOUNG), n_young_adults),
)

# Old adults: same scheme, with their own age values and income weights.
population_old_adult = DataFrame(
    AGE = sample(AGE_OLD_ADULT, Weights(AGE_OLD_ADULT_WEIGHTS), n_old_adults),
    MARITAL_STATUS = sample(MARITAL_STATUS, Weights(MARITAL_WEIGHTS), n_old_adults),
    SEX = sample(SEX, Weights(SEX_WEIGHTS), n_old_adults),
    INCOME = sample(INCOME, Weights(INCOME_WEIGHTS_OLD), n_old_adults),
)

# Children: only age and sex are sampled; marital status and income are
# recorded as `missing`.
population_children = DataFrame(
    AGE = sample(AGE_CHILDREN, Weights(AGE_CHILDREN_WEIGHTS), n_children),
    MARITAL_STATUS = fill(missing, n_children),
    SEX = sample(SEX, Weights(SEX_WEIGHTS), n_children),
    INCOME = fill(missing, n_children),
)

# One row per individual: stack the three age groups on top of each other.
disaggregated_independent_population = vcat(
    population_young_adult,
    population_old_adult,
    population_children,
)

# True joint distribution: number of individuals for every observed
# combination of attribute values.
independent_population = combine(
    groupby(
        disaggregated_independent_population,
        names(disaggregated_independent_population),
        sort=true,
    ),
    nrow => :population,
)

# Combinations that can never be sampled (young adults with an income level
# listed in ZERO_INCOME) are appended explicitly with a count of 0.
zero_population = DataFrame(
    vec(collect(Iterators.product(AGE_YOUNG_ADULT, MARITAL_STATUS, SEX, ZERO_INCOME))),
)
zero_population.population = zeros(Int, nrow(zero_population))
rename!(zero_population, names(independent_population))
independent_population = vcat(independent_population, zero_population)

Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64
1,5,missing,F,missing,9992
2,5,missing,M,missing,9944
3,10,missing,F,missing,9970
4,10,missing,M,missing,10021
5,15,missing,F,missing,9974
6,15,missing,M,missing,10099
7,20,Divorced,F,40000,734
8,20,Divorced,F,50000,737
9,20,Divorced,M,40000,762
10,20,Divorced,M,50000,761


### 1.2. Compute marginal attribute distributions of the independent population
Next, we compute the marginal distributions of the population attributes. This is the kind of data that is usually available from a census, and it serves as the input to our population-generation algorithms.

In [233]:
# Population by age and sex, ordered by sex and then age.
independent_age_sex = combine(
    groupby(disaggregated_independent_population, [:AGE, :SEX], sort=true),
    nrow => :population,
)
sort!(independent_age_sex, [:SEX, :AGE])

# Population by sex and marital status, ordered by sex and then marital status.
independent_sex_marital = combine(
    groupby(disaggregated_independent_population, [:MARITAL_STATUS, :SEX], sort=true),
    nrow => :population,
)
sort!(independent_sex_marital, [:SEX, :MARITAL_STATUS])

# Population by income.
independent_income = combine(
    groupby(disaggregated_independent_population, [:INCOME], sort=true),
    nrow => :population,
)

# Children carry no marital status or income, so the rows keyed by `missing`
# are removed from the corresponding marginals.
independent_sex_marital = filter(:MARITAL_STATUS => !ismissing, independent_sex_marital)
independent_income = filter(:INCOME => !ismissing, independent_income)

Row,INCOME,population
Unnamed: 0_level_1,Int64?,Int64
1,40000,48034
2,50000,47861
3,60000,47760
4,70000,48187
5,80000,48158


### 1.3. Generate independent population from marginals
Then, we use our algorithm to estimate the joint distribution of the attributes.

#### Guo, Bhat, 2007

In [234]:
# Estimate the joint distribution from the three marginals with the default
# configuration, then discard the `id` column of the result.
guo_bhat = select(
    SyntheticPopulation.generate_joint_distribution(independent_age_sex, independent_sex_marital, independent_income),
    Not(:id),
)

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149969, 120033]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [150031, 119967]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [300004, 240000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/Proportio

Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String,Char,Int64,Int64
1,5,Divorced,F,40000,200
2,10,Divorced,F,40000,200
3,15,Divorced,F,40000,200
4,20,Divorced,F,40000,301
5,25,Divorced,F,40000,302
6,30,Divorced,F,40000,164
7,35,Divorced,F,40000,164
8,40,Divorced,F,40000,163
9,45,Divorced,F,40000,164
10,50,Divorced,F,40000,165


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [235]:
# Estimate the joint distribution from the three marginals using the settings
# in "ind_ponge2021.json", then discard the `id` column of the result.
ponge_et_al = select(
    SyntheticPopulation.generate_joint_distribution(independent_age_sex, independent_sex_marital, independent_income, config_file = "ind_ponge2021.json"),
    Not(:id),
)

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149969, 120033]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149969, 120033]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [150031, 119967]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Inconsistent target margins, c

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [300004, 240000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 27 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130


Row,AGE,MARITAL_STATUS,SEX,INCOME,population
Unnamed: 0_level_1,Int64,String,Char,Int64,Int64
1,5,Divorced,F,40000,125
2,10,Divorced,F,40000,125
3,15,Divorced,F,40000,125
4,20,Divorced,F,40000,753
5,25,Divorced,F,40000,756
6,30,Divorced,F,40000,102
7,35,Divorced,F,40000,102
8,40,Divorced,F,40000,101
9,45,Divorced,F,40000,102
10,50,Divorced,F,40000,103


#### Modified algorithm

In [236]:
# Estimate the joint distribution using the settings in "ind_modified.json".
# Unlike the two calls above, the income marginal is passed first.
# The `id` column of the result is discarded.
modified = select(
    SyntheticPopulation.generate_joint_distribution(independent_income, independent_age_sex, independent_sex_marital, config_file = "ind_modified.json"),
    Not(:id),
)

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [240000, 300000]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 25 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149970, 120033]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:128
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [149970, 120033]
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting /Users/marcinzurek/.julia/packages/ProportionalFitting/gNJEu/src/ipf.jl:130
┌ 

Row,INCOME,AGE,MARITAL_STATUS,SEX,population
Unnamed: 0_level_1,Int64?,Int64?,String?,Char?,Int64
1,missing,5,missing,F,9995
2,missing,10,missing,F,9968
3,missing,15,missing,F,9973
4,40000,20,Divorced,F,753
5,50000,20,Divorced,F,751
6,60000,20,Divorced,F,0
7,70000,20,Divorced,F,0
8,80000,20,Divorced,F,0
9,40000,25,Divorced,F,756
10,50000,25,Divorced,F,753


### 1.4. External validation of fit of generated independent population.
Finally, we evaluate whether the generated population is correct. We use the Z-score-based approach described by [Williamson, 2013] [1].


[1] Williamson, P. (2013). An evaluation of two synthetic small-area microdata simulation methodologies: Synthetic reconstruction and combinatorial optimisation. Spatial microsimulation: A reference guide for users, 19-47. https://ndl.ethernet.edu.et/bitstream/123456789/14722/1/205.pdf#page=38

#### Guo, Bhat, 2007

In [237]:
# External validation: compare the Guo & Bhat estimate cell by cell with the
# true joint distribution. `validate_table` is defined outside this cell
# (presumably in utils.jl).
validate_table(guo_bhat, independent_population)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0


=Table statistics=

Statistic value equals: Inf
Table is not well fitting.


Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Float64
1,20,Divorced,F,40000,734,301,16.0019
2,20,Divorced,F,50000,737,300,16.1169
3,20,Divorced,F,60000,0,299,-Inf
4,20,Divorced,F,70000,0,302,-Inf
5,20,Divorced,F,80000,0,302,-Inf
6,20,Divorced,M,40000,762,302,16.6852
7,20,Divorced,M,50000,761,301,16.6962
8,20,Divorced,M,60000,0,300,-Inf
9,20,Divorced,M,70000,0,303,-Inf
10,20,Divorced,M,80000,0,303,-Inf


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [238]:
# External validation: compare the Ponge et al. estimate cell by cell with the
# true joint distribution.
validate_table(ponge_et_al, independent_population)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.5532319391634981
Percentage of well fitting values at 0.90 confidence interval: 0.49619771863117873


=Table statistics=

Statistic value equals: 65458.752241456736
Table is not well fitting.


Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Real
1,20,Divorced,F,40000,734,753,-0.702162
2,20,Divorced,F,50000,737,751,-0.516331
3,20,Divorced,F,60000,0,0,0
4,20,Divorced,F,70000,0,0,0
5,20,Divorced,F,80000,0,0,0
6,20,Divorced,M,40000,762,756,0.217634
7,20,Divorced,M,50000,761,754,0.254072
8,20,Divorced,M,60000,0,0,0
9,20,Divorced,M,70000,0,0,0
10,20,Divorced,M,80000,0,0,0


#### Modified algorithm

In [239]:
# External validation: compare the modified algorithm's estimate cell by cell
# with the true joint distribution; the result is kept in `res`.
res = validate_table(modified, independent_population)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.564638783269962
Percentage of well fitting values at 0.90 confidence interval: 0.5057034220532319


=Table statistics=

Statistic value equals: 3388.672973177534
Table is not well fitting.


Row,AGE,MARITAL_STATUS,SEX,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,String?,Char,Int64?,Int64,Int64?,Real
1,20,Divorced,F,40000,734,753,-0.702162
2,20,Divorced,M,40000,762,756,0.217634
3,20,Married,F,40000,3668,3756,-1.46197
4,20,Married,M,40000,3797,3744,0.865608
5,20,Not_married,F,40000,2337,2257,1.66134
6,20,Not_married,M,40000,2223,2234,-0.234174
7,20,Widowed,F,40000,844,759,2.92995
8,20,Widowed,M,40000,768,747,0.758744
9,25,Divorced,F,40000,765,756,0.325811
10,25,Divorced,M,40000,785,757,1.00067


### 1.5 Internal Validation

Once the tables have been generated, internal validation can be performed. This is done to assess the distance between the input contingency tables and the generated tables.

#### Guo, Bhat, 2007

In [240]:
# Collapse the Guo & Bhat joint distribution back into the three marginal
# tables (age x sex, marital status x sex, income). `compute_marginals` is
# defined outside this cell (presumably in utils.jl).
guo_bhat_age_sex, guo_bhat_sex_marital, guo_bhat_income = compute_marginals(guo_bhat)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE   [0m[1m SEX  [0m[1m population [0m
     │[90m Int64 [0m[90m Char [0m[90m Int64      [0m
─────┼─────────────────────────
   1 │     5  F           9992
   2 │     5  M           9942
   3 │    10  F           9969
   4 │    10  M          10023
   5 │    15  F           9974
   6 │    15  M          10100
   7 │    20  F          15024
   8 │    20  M          14940
  ⋮  │   ⋮     ⋮        ⋮
  26 │    65  M           7995
  27 │    70  F           8328
  28 │    70  M           8316
  29 │    75  F           8199
  30 │    75  M           8210
  31 │    80  F           8157
  32 │    80  M           8115
[36m                17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX  [0m[1m population [0m
     │[90m String         [0m[90m Char [0m[90m Int64      [0m
─────┼──────────────────────────────────
   1 │ Divorced        F          15014
   2 │ Divorced        M          15169
   3

In [241]:
# Internal validation: Guo & Bhat age x sex marginal vs. the input marginal.
validate_table(guo_bhat_age_sex, independent_age_sex)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0


=Table statistics=

Statistic value equals: 0.009744432276884478
Table is well fitting at 0.9 and 0.95 confidence interval.


Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,9992,9992,0.0
2,5,M,9944,9942,0.0203971
3,10,F,9970,9969,0.0101857
4,10,M,10021,10023,-0.0203213
5,15,F,9974,9974,0.0
6,15,M,10099,10100,-0.0101227
7,20,F,15025,15024,0.00837047
8,20,M,14938,14940,-0.0167871
9,25,F,15078,15078,0.0
10,25,M,14959,14960,-0.00838795


In [242]:
# Internal validation: Guo & Bhat marital status x sex marginal vs. the input marginal.
validate_table(guo_bhat_sex_marital, independent_sex_marital)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0


=Table statistics=

Statistic value equals: 18458.000085225383
Table is not well fitting.


Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,12013,15014,-28.0925
2,Divorced,M,12127,15169,-28.3493
3,Married,F,59910,74850,-70.4631
4,Married,M,60031,75081,-70.9342
5,Not_married,F,36005,44982,-51.3152
6,Not_married,M,35827,44808,-51.443
7,Widowed,F,12105,15128,-28.1964
8,Widowed,M,11982,14988,-28.1738


In [243]:
# Internal validation: Guo & Bhat income marginal vs. the input marginal.
validate_table(guo_bhat_income, independent_income)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0


=Table statistics=

Statistic value equals: 18762.571246014555
Table is not well fitting.


Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,48034,60052,-61.3128
2,50000,47861,59833,-61.1609
3,60000,47760,59702,-61.056
4,70000,48187,60234,-61.3876
5,80000,48158,60199,-61.3708


#### Ponge, Enbergs, Schungel, Hellingrath, Karch, Ludwig, 2021

In [244]:
# Collapse the Ponge et al. joint distribution back into the three marginal
# tables (age x sex, marital status x sex, income).
ponge_et_al_age_sex, ponge_et_al_sex_marital, ponge_et_al_income = compute_marginals(ponge_et_al)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE   [0m[1m SEX  [0m[1m population [0m
     │[90m Int64 [0m[90m Char [0m[90m Int64      [0m
─────┼─────────────────────────
   1 │     5  F           9994
   2 │     5  M           9945
   3 │    10  F           9970
   4 │    10  M          10022
   5 │    15  F           9972
   6 │    15  M          10098
   7 │    20  F          15025
   8 │    20  M          14938
  ⋮  │   ⋮     ⋮        ⋮
  26 │    65  M           7996
  27 │    70  F           8327
  28 │    70  M           8316
  29 │    75  F           8198
  30 │    75  M           8205
  31 │    80  F           8159
  32 │    80  M           8112
[36m                17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX  [0m[1m population [0m
     │[90m String         [0m[90m Char [0m[90m Int64      [0m
─────┼──────────────────────────────────
   1 │ Divorced        F          15005
   2 │ Divorced        M          15168
   3

In [245]:
# Internal validation: Ponge et al. age x sex marginal vs. the input marginal.
validate_table(ponge_et_al_age_sex, independent_age_sex)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0


=Table statistics=

Statistic value equals: 0.004938867916401493
Table is well fitting at 0.9 and 0.95 confidence interval.


Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,9992,9994,-0.0203498
2,5,M,9944,9945,-0.0101986
3,10,F,9970,9970,0.0
4,10,M,10021,10022,-0.0101607
5,15,F,9974,9972,0.0203675
6,15,M,10099,10098,0.0101227
7,20,F,15025,15025,0.0
8,20,M,14938,14938,0.0
9,25,F,15078,15079,-0.00835653
10,25,M,14959,14958,0.00838795


In [246]:
# Internal validation: Ponge et al. marital status x sex marginal vs. the input marginal.
validate_table(ponge_et_al_sex_marital, independent_sex_marital)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0


=Table statistics=

Statistic value equals: 18445.816157143287
Table is not well fitting.


Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,12013,15005,-28.0083
2,Divorced,M,12127,15168,-28.3399
3,Married,F,59910,74853,-70.4773
4,Married,M,60031,75073,-70.8965
5,Not_married,F,36005,44986,-51.338
6,Not_married,M,35827,44806,-51.4315
7,Widowed,F,12105,15126,-28.1778
8,Widowed,M,11982,14981,-28.1082


In [247]:
# Internal validation: Ponge et al. income marginal vs. the input marginal.
validate_table(ponge_et_al_income, independent_income)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0


=Table statistics=

Statistic value equals: 18748.8210537047
Table is not well fitting.


Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,48034,60044,-61.272
2,50000,47861,59828,-61.1353
3,60000,47760,59697,-61.0305
4,70000,48187,60231,-61.3723
5,80000,48158,60198,-61.3657


#### Modified algorithm

In [248]:
# Collapse the modified algorithm's joint distribution back into the three
# marginal tables (age x sex, marital status x sex, income).
modified_age_sex, modified_sex_marital, modified_income = compute_marginals(modified)

([1m32×3 DataFrame[0m
[1m Row [0m│[1m AGE    [0m[1m SEX   [0m[1m population [0m
     │[90m Int64? [0m[90m Char? [0m[90m Int64      [0m
─────┼───────────────────────────
   1 │      5  F            9995
   2 │      5  M            9944
   3 │     10  F            9968
   4 │     10  M           10019
   5 │     15  F            9973
   6 │     15  M           10099
   7 │     20  F           15024
   8 │     20  M           14938
  ⋮  │   ⋮       ⋮        ⋮
  26 │     65  M            7995
  27 │     70  F            8329
  28 │     70  M            8315
  29 │     75  F            8198
  30 │     75  M            8205
  31 │     80  F            8158
  32 │     80  M            8114
[36m                  17 rows omitted[0m, [1m8×3 DataFrame[0m
[1m Row [0m│[1m MARITAL_STATUS [0m[1m SEX   [0m[1m population [0m
     │[90m String?        [0m[90m Char? [0m[90m Int64      [0m
─────┼───────────────────────────────────
   1 │ Divorced        F           12011


In [249]:
# Internal validation: modified algorithm's age x sex marginal vs. the input marginal.
validate_table(modified_age_sex, independent_age_sex)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0


=Table statistics=

Statistic value equals: 0.005397113911590847
Table is well fitting at 0.9 and 0.95 confidence interval.


Row,AGE,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64,Char,Int64,Int64?,Float64
1,5,F,9992,9995,-0.0305246
2,5,M,9944,9944,0.0
3,10,F,9970,9968,0.0203714
4,10,M,10021,10019,0.0203213
5,15,F,9974,9973,0.0101837
6,15,M,10099,10099,0.0
7,20,F,15025,15024,0.00837047
8,20,M,14938,14938,0.0
9,25,F,15078,15079,-0.00835653
10,25,M,14959,14959,0.0


In [250]:
# Internal validation: modified algorithm's marital status x sex marginal vs. the input marginal.
validate_table(modified_sex_marital, independent_sex_marital)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 1.0
Percentage of well fitting values at 0.90 confidence interval: 1.0


=Table statistics=

Statistic value equals: 0.00324825892928292
Table is well fitting at 0.9 and 0.95 confidence interval.


Row,MARITAL_STATUS,SEX,population,estimated_population,Z_score
Unnamed: 0_level_1,String?,Char,Int64,Int64?,Float64
1,Divorced,F,12013,12011,0.0187221
2,Divorced,M,12127,12129,-0.0186386
3,Married,F,59910,59914,-0.0188656
4,Married,M,60031,60026,0.0235662
5,Not_married,F,36005,36004,0.00571629
6,Not_married,M,35827,35828,-0.00572798
7,Widowed,F,12105,12108,-0.0279819
8,Widowed,M,11982,11979,0.0281176


In [251]:
# Internal validation: modified algorithm's income marginal vs. the input marginal.
validate_table(modified_income, independent_income)

=Cell statistics=

Percentage of well fitting values at 0.95 confidence interval: 0.0
Percentage of well fitting values at 0.90 confidence interval: 0.0


=Table statistics=

Statistic value equals: 1759.2392056835945
Table is not well fitting.


Row,INCOME,population,estimated_population,Z_score
Unnamed: 0_level_1,Int64?,Int64,Int64?,Float64
1,40000,48034,52539,-22.9834
2,50000,47861,52358,-22.9736
3,60000,47760,44780,15.2359
4,70000,48187,45172,15.3635
5,80000,48158,45150,15.3312


In [252]:
# Total estimated population for every (AGE, INCOME) combination.
# NOTE(review): `modified` as built by generate_joint_distribution has a
# `population` column, while this call reads `estimated_population`. It
# presumably relies on an earlier validate_table call having changed `modified`
# in place, or `res` was intended here - confirm before re-running the notebook
# from a clean state.
combine(groupby(modified, [:AGE, :INCOME]), :estimated_population => sum)

Row,AGE,INCOME,estimated_population_sum
Unnamed: 0_level_1,Int64?,Int64?,Int64
1,20,40000,15006
2,20,50000,14956
3,20,60000,0
4,20,70000,0
5,20,80000,0
6,25,40000,15046
7,25,50000,14992
8,25,60000,0
9,25,70000,0
10,25,80000,0
