### Manually create the input dataframes based on Chinese census data

In [1]:
using DataFrames
using SyntheticPopulation

In [2]:

# Each synthetic individual / household stands for 100,000 real ones (1 / SCALE).
SCALE = 0.00001

# All values below are based on China census data.
individual_popoulation_size = 21890000

# ---------------------------------------------------------------- individuals

# Age x sex marginal: 18 age values (2, 7, ..., 87), each with an 'M' row followed by
# an 'F' row. The raw figures are multiplied by 10000 and then scaled by SCALE.
marginal_ind_age_sex = DataFrame(;
    sex = repeat(['M', 'F']; outer = 18),
    age = repeat(2:5:87; inner = 2),
    population = SCALE .* 10000 .* [
        52.6, 49.0,      # age  2 (M, F)
        48.5, 44.8,      # age  7
        33.6, 30.6,      # age 12
        34.6, 28.8,      # age 17
        71.6, 63.4,      # age 22
        99.6, 90.9,      # age 27
        130.9, 119.4,    # age 32
        110.8, 103.5,    # age 37
        83.8, 76.4,      # age 42
        84.2, 77.7,      # age 47
        84.2, 77.8,      # age 52
        82.8, 79.9,      # age 57
        67.7, 71.0,      # age 62
        56.9, 62.6,      # age 67
        31.5, 35.3,      # age 72
        18.5, 23.0,      # age 77
        15.2, 19.7,      # age 82
        12.5, 16.0,      # age 87
    ],
)

# Sex x marital-status marginal: one (M, F) pair per status.
# NOTE(review): the division by 0.00082 presumably undoes a census sampling
# fraction - confirm against the data source.
marginal_ind_sex_maritalstatus = DataFrame(;
    sex = repeat(['M', 'F']; outer = 4),
    maritalstatus = repeat(
        ["Never_married", "Married", "Divorced", "Widowed"]; inner = 2
    ),
    population = SCALE .* [
        1679, 1611,    # Never_married (M, F)
        5859, 5774,    # Married
        140, 206,      # Divorced
        128, 426,      # Widowed
    ] ./ 0.00082,
)

# Income marginal: the scaled population is split evenly over the five income values.
marginal_ind_income = DataFrame(;
    income = [25394, 44855, 63969, 88026, 145915],
    population = fill(individual_popoulation_size * SCALE / 5, 5),
)

# ----------------------------------------------------------------- households

# Household-size marginal: the scaled household total is distributed over sizes 1-5
# by the given shares and rounded to whole households.
household_total_population = 8230000
marginal_hh_size = DataFrame(;
    hh_size = [1, 2, 3, 4, 5],
    population = round.(
        Int, SCALE * household_total_population .* [0.299, 0.331, 0.217, 0.09, 0.063]
    ),
)
nothing #to avoid printing output

### Create dataframe of individuals
Some individual types have a population of 0: at a scale of 0.00001 their expected count is smaller than one individual.

In [3]:

# Build the aggregated individuals table from the three individual-level marginals;
# additional settings are read from the JSON config file passed as `config_file`.
aggregated_individuals = generate_joint_distribution(
    marginal_ind_sex_maritalstatus,
    marginal_ind_income,
    marginal_ind_age_sex;
    config_file = "tutorial_notebooks/config_file.json",
)

┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [193.0, 220.0]
└ @ ProportionalFitting C:\Users\plzurekma\.julia\packages\ProportionalFitting\gNJEu\src\ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting C:\Users\plzurekma\.julia\packages\ProportionalFitting\gNJEu\src\ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [105.0, 92.0]
└ @ ProportionalFitting C:\Users\plzurekma\.julia\packages\ProportionalFitting\gNJEu\src\ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting C:\Users\plzurekma\.julia\packages\ProportionalFitting\gNJEu\src\ipf.jl:130
┌ Info: Inconsistent target margins, converting `X` and `mar` to proportions. Margin totals: [115.0, 94.0]
└ @ ProportionalFitting C:\Users\plzurekma\.julia\packages\ProportionalFitting\gNJEu\src\ipf.jl:61
┌ Info: Converged in 1 iterations.
└ @ ProportionalFitting C:\Users\plzurekma\.julia\packages\ProportionalFitti

Row,id,maritalstatus,income,sex,age,population
Unnamed: 0_level_1,Int64,String?,Int64?,Char?,Int64?,Int64
1,1,Divorced,25394,F,22,0
2,2,Married,25394,F,22,1
3,3,Never_married,25394,F,22,0
4,4,Widowed,25394,F,22,0
5,5,Divorced,44855,F,22,0
6,6,Married,44855,F,22,1
7,7,Never_married,44855,F,22,0
8,8,Widowed,44855,F,22,0
9,9,Divorced,63969,F,22,0
10,10,Married,63969,F,22,1


In [4]:
# Summary statistics restricted to the individual types that actually occur
# (population greater than zero).
describe(filter(:population => >(0), aggregated_individuals))

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,Type
1,id,284.815,2,310.0,590,0,Int64
2,maritalstatus,,Divorced,,Married,20,"Union{Missing, String}"
3,income,73631.8,25394,63969.0,145915,0,"Union{Missing, Int64}"
4,sex,,F,,M,0,"Union{Missing, Char}"
5,age,38.6667,2,37.0,72,0,"Union{Missing, Int64}"
6,population,1.14815,1,1.0,2,0,Int64


### Create dataframe of households

In [5]:

# Generate the aggregated households table from the household-size marginal alone.
# The result has one row per `hh_size`, with an `id` and the scaled `population` count.
aggregated_households = generate_joint_distribution(marginal_hh_size)

Row,id,hh_size,population
Unnamed: 0_level_1,Int64,Int64,Int64
1,1,1,25
2,2,2,27
3,3,3,18
4,4,4,7
5,5,5,5


### Allocate individuals to households

In [6]:
# Allocate the aggregated individuals to the aggregated households. The call returns
# four values: the model, the allocation values, and the disaggregated individuals and
# households tables.
# NOTE(review): the trailing `!` suggests the inputs may be modified in place - confirm
# in the SyntheticPopulation documentation.
(model, allocation_values, disaggregated_individuals, disaggregated_households) =
    assign_and_optimize_individuals_to_households!(
        aggregated_individuals, aggregated_households
    )

Total number of individuals: 155
Total number of households: 82
Allocation started...
Creation of individual constraints started.


Adding individual constraints 100%|██████████████████████| Time: 0:00:00


Creation of individual constraints finished.


Preparing household constraints 1/3 100%|████████████████| Time: 0:00:00
Preparing household constraints 2/3 100%|████████████████| Time: 0:00:00


Preparation for creation of household constraints started.
Preparation for creation of household constraints finished.
Creation of household constraints started.


Preparing household constraints 3/3 100%|████████████████| Time: 0:00:00
Adding household constraints. 100%|██████████████████████| Time: 0:00:02


### Check the outputs of allocation

In [None]:
# Display the `model` object returned by the allocation call.
model

In [None]:
# Display the aggregated individuals table (one row per individual type, with its
# `population` count).
aggregated_individuals

In [None]:
# Display `disaggregated_individuals`, the third value returned by the allocation call.
disaggregated_individuals

In [None]:
# Display the aggregated households table (one row per household size, with its
# `population` count).
aggregated_households

In [None]:
# Display `disaggregated_households`, the fourth value returned by the allocation call.
disaggregated_households

### See the full output of allocation (join of disaggregated_households with aggregated_individuals)

In [None]:
"""
    join_and_rename!(df1::DataFrame, df2::DataFrame, column_name::Symbol)

Left-join `df2` onto `df1`, matching `df1[!, column_name]` against `df2.id`, and rename
every column brought in from `df2` after the role encoded in `column_name`: the `"_id"`
part of `column_name` is replaced by `"_<col>"`, so joining on `:head_id` turns `age`
into `head_age`.

The join uses `matchmissing = :notequal`, so `missing` keys in `df1` are never matched.

Despite the trailing `!`, neither argument is modified; a new `DataFrame` is returned.
"""
function join_and_rename!(df1::DataFrame, df2::DataFrame, column_name::Symbol)
    joined = leftjoin(
        df1, df2; on = column_name => :id, makeunique = true, matchmissing = :notequal
    )

    role = string(column_name)
    # Pick the joined-in columns by name rather than by position, so the result does
    # not depend on `id` being the first column of `df2`.
    for col in names(df2)
        col == "id" && continue
        rename!(joined, Symbol(col) => Symbol(replace(role, "_id" => "_" * col)))
    end

    return joined
end

# Join the individual attributes once per household role, threading the growing table
# through each join. Every pass adds that role's attribute columns (e.g. joining on
# :head_id adds head_age, head_sex, ...).
id_columns = [:head_id, :partner_id, :child1_id, :child2_id, :child3_id]
disaggregated_households_joined =
    foldl(id_columns; init = disaggregated_households) do joined, column_name
        join_and_rename!(joined, aggregated_individuals, column_name)
    end
disaggregated_households_joined

In [None]:
# Summary statistics for every column of the joined households table.
describe(disaggregated_households_joined)

In [None]:
# Households whose head is younger than 20.
# `coalesce` maps a `missing` comparison result to `false`, so the row mask is always
# Boolean even if `head_age` contains missing values (the column's element type allows
# them after the left join).
disaggregated_households_joined[
    coalesce.(disaggregated_households_joined.head_age .< 20, false), :
]

# TODO: needs debugging - possibly a child that was assigned as head because they are
# the only person in the household.

#### Extra tests to validate outputs

In [None]:
# Age gap between every parent and every child. The distinct gaps are printed on one
# labelled line per (parent, child) pair so they can be checked by eye against the
# expected range (between 20 and 40). Pairs where either side is missing are skipped.
for parent in [:head_age, :partner_age]
    for child in [:child1_age, :child2_age, :child3_age]
        age_gaps =
            disaggregated_households_joined[!, parent] .-
            disaggregated_households_joined[!, child]
        println(parent, " - ", child, ": ", unique(skipmissing(age_gaps)))
    end
end
println()

# Every assigned individual must come from an individual type that actually exists,
# i.e. its :population value in aggregated_individuals must be at least 1.
for column in [
    :head_population,
    :partner_population,
    :child1_population,
    :child2_population,
    :child3_population,
]
    println(column, ": ", unique(skipmissing(disaggregated_households_joined[!, column])))
end