In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import cvxpy as cp
import scipy.stats as st
from scipy.stats import uniform,norm
from utils.metropolishastings import MetropolisHastings
from utils.state import State
from utils.drawing import discrete_inverse_trans, \
    transform_into_marginals_clean, DFconditional, draw_from_marginals
from utils.statistical_assessments import stats_assessment
from joblib import Parallel, delayed, cpu_count
import matplotlib
import copy
import warnings
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
import pickle
import tikzplotlib
from datetime import datetime
from matplotlib.cm import get_cmap
import os
warnings.filterwarnings("ignore")

In [2]:
%load_ext autoreload
%autoreload 2

### Identifying the differences

In [3]:
projected_2010_2015 = pd.read_csv("results/proj2010_2015/synthetic_2010_2015_99.csv") #results of projection from 2010 to 2015
real_data_2015 = pd.read_csv("results/merged/bs_1_hh.csv") #our syntehtic data from 2015, we use them as real ones

In [4]:
def compare_hsize_distributions(df_projected, df_real, hsize_column='hsize', cap=6):
    # Cap the hsize values at `cap` for both dataframes
    for df in [df_projected, df_real]:
        df['hsize_new'] = df[hsize_column].apply(lambda x: x if x < cap else cap)
    
    # Group by hsize_new and count
    combinations_projected = df_projected.groupby("hsize_new").size().reset_index(name="Counts_projected")
    combinations_real = df_real.groupby("hsize_new").size().reset_index(name="Counts_real")
    
    # Merge the grouped results
    merged_df = pd.merge(combinations_projected, combinations_real, on="hsize_new", how="outer")
    
    # Fill missing values with 0 and cast to int
    merged_df["Counts_projected"] = merged_df["Counts_projected"].fillna(0).astype(int)
    merged_df["Counts_real"] = merged_df["Counts_real"].fillna(0).astype(int)
    
    # Calculate total counts
    total_counts_projected = merged_df["Counts_projected"].sum()
    total_counts_real = merged_df["Counts_real"].sum()
    
    # Calculate probabilities
    merged_df["Prob_projected"] = merged_df["Counts_projected"] / total_counts_projected
    merged_df["Prob_real"] = merged_df["Counts_real"] / total_counts_real
    
    return merged_df

In [5]:
merged_df = compare_hsize_distributions(projected_2010_2015,real_data_2015)

In [6]:
merged_df

Unnamed: 0,hsize_new,Counts_projected,Counts_real,Prob_projected,Prob_real
0,1,24037,19372,0.162384,0.154685
1,2,43128,40412,0.291354,0.322689
2,3,26226,22161,0.177172,0.176955
3,4,34540,27756,0.233337,0.221631
4,5,14890,10520,0.10059,0.084002
5,6,5205,5014,0.035163,0.040037


### Calculating adiitional observations between projected dataset and new data

In [7]:
'''Since GS cannot delete data but only add, we have to find one positive feasible solution that gives us information on how much observations we should add per household size category. 
In other words we obtain vector where each element represents counts per each household size category that should be added so we achieve probability distribution of new data. '''
def calculate_frequencies(prob_dist_projected, prob_dist_new, counts_projected, counts_new):
    

    c_total = np.sum(counts_projected)
    N = len(prob_dist_new)

    M = np.diag(prob_dist_new) @ np.ones(N).reshape(-1,1) @ np.ones(N).reshape(1,-1) - np.eye(N)

    eps = 0.01
    L_inq = np.eye(N)
    r_inq = eps*np.ones(N)

    z = cp.Variable(N)

    prob = cp.Problem(cp.Minimize(1), [L_inq @ z >= r_inq, M @ z == -M @ counts_projected])
    prob.solve()

    z_array = np.squeeze(np.array(z.value))

    print(z_array)

    return z_array

In [8]:
adding_counts = calculate_frequencies(np.array(merged_df['Prob_projected']), np.array(merged_df['Prob_real']), np.array(merged_df['Counts_projected']), np.array(merged_df['Counts_real']))

[ 4800.84108586 17030.72568459  6763.64465743  6778.55859897
   770.44229936  2259.01689059]


In [9]:
#this is double-checking if adding these quantities actually leads to the real probability distribution
def compare_adjusted_probs(merged_df, adding_counts):
    # Extract projected counts
    c = np.array(merged_df['Counts_projected'])

    # Compute total
    total = np.sum(c) + np.sum(adding_counts)

    # Compute adjusted probabilities
    adjusted_probs = (c + adding_counts) / total

    # Get real probabilities from dataframe
    real_probs = np.array(merged_df['Prob_real'])

    # Compare with a tolerance (to avoid floating-point issues)
    return np.allclose(adjusted_probs, real_probs, atol=1e-6)

In [10]:
is_equal = compare_adjusted_probs(merged_df, adding_counts)
print(is_equal)  # True or False

True


### Gibbs Sampler for adding households 


#### The reason we chose to add based on household size is because we can modify the number of draws for the Generation method that actually represents the households size distribution

#### Thus, we have to perform the following steps:
###### 1. Change the number of draws according to the output of calculate_frequencies() function instead of href, also add initial states for each hsize group
###### 2. We will obtain the subsample that should be concantenated to the projected one

###### 3. We extend the warmup period as much as needed to achieve the convergence, then discard the draws and draw exact desired number of draws from the unique joint distribution (note that in Generation code, the number of draws would inflate until we don't reach the unique joint distribution, so to avoid the post-processing, we changed the MetropolistHasting function)

#### The code is in the Generation_subsample_clean.ipynb, results in results/resampling_generation


#### Output csv: small sample that should be added to projected sample from this script

### Concatenation

In [21]:
gs = pd.read_csv("results/resampling_generation/synthetic_individuals.csv") #loading subsample
projection_2010_2015_concat = projected_2010_2015 #loading projected sample

In [23]:
gs['hsize'].value_counts().sort_index() #same as the number of draws given by calculate_frequencies function

1     4800
2    17030
3     6762
4     6778
5      770
6     2258
Name: hsize, dtype: int64

In [24]:
working_attributes = ["hid","htype","nbcars_agg","hsize","age_discrete","gender","marital_status","employment","driving_licence"]
projection_2010_2015_concat = projection_2010_2015_concat[working_attributes].copy()

In [25]:
gs["hid"] = gs["hid"]+max(projection_2010_2015_concat["hid"]) #so we don't have duplicated ids with the real data

In [26]:
final_dataset = pd.concat([projection_2010_2015_concat, gs], ignore_index=True)

In [27]:
final_dataset #data size inflated

Unnamed: 0,hid,htype,nbcars_agg,hsize,age_discrete,gender,marital_status,employment,driving_licence
0,0,10,1,1,6,2,3,4,1
1,1,10,1,1,6,2,3,1,1
2,2,10,0,1,6,2,3,4,2
3,3,10,0,1,6,2,3,4,2
4,7,10,0,1,6,2,3,4,2
...,...,...,...,...,...,...,...,...,...
186419,113692,220,3,6,1,1,1,5,2
186420,113693,220,1,6,5,1,2,1,1
186421,113694,230,1,6,5,1,2,1,1
186422,113695,220,2,6,4,2,2,1,1


In [28]:
final_dataset.to_csv("results/resampled_2015_GS.csv", index = False)

In [29]:
compare_hsize_distributions(final_dataset, real_data_2015) #the probabilities the same, although the counts are different

Unnamed: 0,hsize_new,Counts_projected,Counts_real,Prob_projected,Prob_real
0,1,28837,19372,0.154685,0.154685
1,2,60158,40412,0.322695,0.322689
2,3,32988,22161,0.176951,0.176955
3,4,41318,27756,0.221635,0.221631
4,5,15660,10520,0.084002,0.084002
5,6,7463,5014,0.040032,0.040037


### Uniformly deleting

In [33]:
# Calculate the total excess population
projected_total = len(projected_2010_2015)
excess_to_delete = len(final_dataset) - projected_total

# Check if excess_to_delete is a positive number
if excess_to_delete > 0:
    print(f"Excess population to delete: {excess_to_delete}")
else:
    print("No excess population to delete. Final sample already matches projected size.")

# Get the household size counts from the augmented dataset
augmented_counts = final_dataset['hsize'].value_counts().sort_index()

# Calculate the proportions for each household size category
delete_share = augmented_counts / augmented_counts.sum()

# Calculate how many observations to remove for each household size category
to_delete = np.round(delete_share * excess_to_delete).astype(int)

# Step 5: Perform deletion in final_dataset
final_dataset_adjusted = []

# Iterate over each household size category
for size, delete_count in to_delete.items():
    category_data = final_dataset[final_dataset['hsize'] == size]
    
    # If we need to delete any rows, sample them randomly and remove
    if delete_count > 0:
        indices_to_delete = category_data.sample(n=delete_count, random_state=42).index
        category_data = category_data.drop(indices_to_delete)
    
    final_dataset_adjusted.append(category_data)

# Concatenate the final adjusted dataset
final_dataset_adjusted = pd.concat(final_dataset_adjusted)

# Check if the final size matches the original projected size
final_size = len(final_dataset_adjusted)
print(f"Final Adjusted Count: {final_size}")
print(f"Final sample size matches original projected: {final_size == projected_total}")

# Step 6: Compare final probabilities with real probabilities
final_counts = final_dataset_adjusted['hsize'].value_counts().sort_index()
final_probs = final_counts / final_counts.sum()

# Real probabilities (you should have these from the real dataset)
real_counts = np.array(real_data_2015.hsize_new.value_counts().sort_index())  # Example probabilities
real_total = real_counts.sum()
real_probs = real_counts / real_total

print("\n--- Comparison Between Final and Real Probabilities ---")
for size, final_prob, real_prob in zip(range(1, 7), final_probs, real_probs):
    print(f"Household Size {size}: Real Prob = {real_prob:.4f}, Final Prob = {final_prob:.4f}, Diff = {abs(real_prob - final_prob):.4f}")

Excess population to delete: 38398
Final Adjusted Count: 148026
Final sample size matches original projected: True

--- Comparison Between Final and Real Probabilities ---
Household Size 1: Real Prob = 0.1547, Final Prob = 0.1547, Diff = 0.0000
Household Size 2: Real Prob = 0.3227, Final Prob = 0.3227, Diff = 0.0000
Household Size 3: Real Prob = 0.1770, Final Prob = 0.1769, Diff = 0.0000
Household Size 4: Real Prob = 0.2216, Final Prob = 0.2216, Diff = 0.0000
Household Size 5: Real Prob = 0.0840, Final Prob = 0.0840, Diff = 0.0000
Household Size 6: Real Prob = 0.0400, Final Prob = 0.0328, Diff = 0.0072


### Example from the paper

In [34]:
import numpy as np
import pandas as pd
import cvxpy as cp

# Function to compute added frequencies using constrained optimization
def calculate_frequencies(prob_dist_projected, prob_dist_new, counts_projected, counts_new):
    c_total = np.sum(counts_projected)
    N = len(prob_dist_new)

    M = np.diag(prob_dist_new) @ np.ones(N).reshape(-1,1) @ np.ones(N).reshape(1,-1) - np.eye(N)

    eps = 0.01
    L_inq = np.eye(N)
    r_inq = eps * np.ones(N)

    z = cp.Variable(N)

    prob = cp.Problem(cp.Minimize(1), [L_inq @ z >= r_inq, M @ z == -M @ counts_projected])
    prob.solve()

    z_array = np.squeeze(np.array(z.value))
    return np.round(z_array).astype(int)

# Inputs: projected and real counts
projected_counts = np.array([23287, 43760, 26357, 33620, 14195, 5282])
real_counts = np.array([19373, 40286, 21786, 28237, 10320, 5649])
household_sizes = np.arange(1, 7)

### STEP 1: Print projected sample
projected_total = projected_counts.sum()
projected_probs = projected_counts / projected_total
print("\n--- Step 1: Projected Sample ---")
for size, count, prob in zip(household_sizes, projected_counts, projected_probs):
    print(f"Household Size {size}: Count = {count}, Probability = {prob:.4f}")
print(f"Total Projected Count: {projected_total}")

### STEP 2: Print real sample
real_total = real_counts.sum()
real_probs = real_counts / real_total
print("\n--- Step 2: Real Sample ---")
for size, count, prob in zip(household_sizes, real_counts, real_probs):
    print(f"Household Size {size}: Count = {count}, Probability = {prob:.4f}")
print(f"Total Real Count: {real_total}")

### STEP 3: Add frequencies to match real probabilities
added_counts = calculate_frequencies(projected_probs, real_probs, projected_counts, real_counts)
augmented_counts = projected_counts + added_counts
augmented_total = augmented_counts.sum()
augmented_probs = augmented_counts / augmented_total

print("\n--- Step 3: Augmentation ---")
for size, proj, add, aug, prob in zip(household_sizes, projected_counts, added_counts, augmented_counts, augmented_probs):
    print(f"Household Size {size}: Projected = {proj}, Added = {add}, Augmented = {aug}, Aug. Prob = {prob:.4f}")
print(f"Total Augmented Count: {augmented_total}")
print(f"Difference in count from projected: {augmented_total - projected_total}")
print(f"Difference in count from real: {augmented_total - real_total}")

### STEP 4: Delete to restore original projected total size
excess_to_delete = augmented_total - projected_total
delete_share = augmented_counts / augmented_total
to_delete = np.round(delete_share * excess_to_delete).astype(int)
final_counts = augmented_counts - to_delete
final_total = final_counts.sum()
final_probs = final_counts / final_total

print("\n--- Step 4: Deletion and Final Sample ---")
for size, final_count, final_prob in zip(household_sizes, final_counts, final_probs):
    print(f"Household Size {size}: Final Count = {final_count}, Final Prob = {final_prob:.4f}")
print(f"Total Final Count: {final_total}")
print(f"Final sample size matches original projected: {final_total == projected_total}")

### STEP 5: Compare final probabilities with real
print("\n--- Step 5: Comparison Between Final and Real Probabilities ---")
for size, real_p, final_p in zip(household_sizes, real_probs, final_probs):
    print(f"Household Size {size}: Real Prob = {real_p:.4f}, Final Prob = {final_p:.4f}, Diff = {abs(real_p - final_p):.4f}")



--- Step 1: Projected Sample ---
Household Size 1: Count = 23287, Probability = 0.1590
Household Size 2: Count = 43760, Probability = 0.2987
Household Size 3: Count = 26357, Probability = 0.1799
Household Size 4: Count = 33620, Probability = 0.2295
Household Size 5: Count = 14195, Probability = 0.0969
Household Size 6: Count = 5282, Probability = 0.0361
Total Projected Count: 146501

--- Step 2: Real Sample ---
Household Size 1: Count = 19373, Probability = 0.1542
Household Size 2: Count = 40286, Probability = 0.3206
Household Size 3: Count = 21786, Probability = 0.1734
Household Size 4: Count = 28237, Probability = 0.2247
Household Size 5: Count = 10320, Probability = 0.0821
Household Size 6: Count = 5649, Probability = 0.0450
Total Real Count: 125651

--- Step 3: Augmentation ---
Household Size 1: Projected = 23287, Added = 3658, Augmented = 26945, Aug. Prob = 0.1542
Household Size 2: Projected = 43760, Added = 12272, Augmented = 56032, Aug. Prob = 0.3206
Household Size 3: Projected