In [1]:
import numpy as np
import ot    # pip install pot
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import sys
sys.path.append("../src")
from clustering_methods import wbarycenter_clustering, wpairwise_clustering
from utils import reconstruct_joint_distribution_ot, compute_and_save_distance_matrix, compute_and_save_distance_matrix2

from tqdm.notebook import tqdm
import geopandas as gpd


In [2]:
# Read the income-decile table (semicolon-separated, comma used as decimal mark)
df_income_temp = pd.read_csv(
    '../data/BASE_TD_FILO_DEC_IRIS_2020.csv',
    sep=';',
    decimal=',',
)

# Columns holding the income decile values, in the order used throughout the notebook
decile_cols = [
    'DEC_D120', 'DEC_D220', 'DEC_D320', 'DEC_D420', 'DEC_MED20',
    'DEC_D620', 'DEC_D720', 'DEC_D820', 'DEC_D920',
]

# Force the decile columns to numeric; any value that cannot be parsed becomes NaN
df_income_temp[decile_cols] = df_income_temp[decile_cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

# Discard every row with at least one missing decile
# (1319 rows according to the original author's note) and renumber the index
df_income = (
    df_income_temp
    .dropna(subset=decile_cols)
    .reset_index(drop=True)
)

# Decile values as a 2-D NumPy array whose rows follow df_income
distributions = df_income[decile_cols].to_numpy()
n_distributions = len(distributions)

# Shared constants reused by later cells: one colour and one label per age group
colors = ['cornflowerblue', 'forestgreen', 'red', 'deeppink', 'orange', 'brown', 'purple']
age_groups = ["0_17", "18_29", "30_39", "40_49", "50_64", "65_74", "75P"]

In [3]:
# Spreadsheet age-bracket labels -> short codes used in `age_groups`
renaming_dict = {
    "Moins de 18 ans": "0_17",
    "De 18 à 29 ans": "18_29",
    "De 30 à 39 ans": "30_39",
    "De 40 à 49 ans": "40_49",
    "De 50 à 64 ans": "50_64",
    "De 65 à 74 ans": "65_74",
    "75 ans et plus": "75P"
}

# Single pipeline:
#   1. read the sheet (the first 3 rows are skipped),
#   2. keep the age label and the 2020 column, renamed to MEDIAN_INCOME,
#   3. recode the age labels into AGE_GROUP,
#   4. keep only rows whose code is one of `age_groups` (drops the non-data lines),
#   5. keep the two columns of interest.
df_income_age = (
    pd.read_excel("../data/reve-niv-vie-individu-age-med.xlsx", skiprows=3)
    .loc[:, ["Tranche d’âge", '2020³ ⁴']]
    .rename(columns={'2020³ ⁴': "MEDIAN_INCOME"})
    .assign(AGE_GROUP=lambda frame: frame["Tranche d’âge"].replace(renaming_dict))
    .loc[lambda frame: frame["AGE_GROUP"].isin(age_groups)]
    .reset_index(drop=True)
    [["AGE_GROUP", "MEDIAN_INCOME"]]
)


In [4]:
# Read the first sheet of the population workbook (the first 5 rows are skipped)
df_pop_temp = pd.read_excel(
    "../data/base-ic-evol-struct-pop-2020.xlsx",
    sheet_name=0,
    skiprows=5,
)

# NOTE(review): the arithmetic below assumes the P20_POPxxyy suffix encodes the
# age range xx-yy (and P20_POPxxP means "xx and over") -- confirm against the
# data dictionary.

# 0-17: sum of the four youngest published bands
df_pop_temp["AGE_0_17"] = (
    df_pop_temp["P20_POP0002"]
    + df_pop_temp["P20_POP0305"]
    + df_pop_temp["P20_POP0610"]
    + df_pop_temp["P20_POP1117"]
)

# 18-29: (0-14 plus 15-29) minus 0-17
df_pop_temp["AGE_18_29"] = (
    df_pop_temp["P20_POP0014"] + df_pop_temp["P20_POP1529"]
    - df_pop_temp["AGE_0_17"]
)

# 30-39: (25-39 plus 18-24) minus 18-29
df_pop_temp["AGE_30_39"] = (
    df_pop_temp["P20_POP2539"] + df_pop_temp["P20_POP1824"]
    - df_pop_temp["AGE_18_29"]
)

# 65-74: 65-and-over minus 75-and-over (the 75+ band itself is used as published)
df_pop_temp["AGE_65_74"] = df_pop_temp["P20_POP65P"] - df_pop_temp["P20_POP75P"]

# 45-54 by differencing two cumulative totals: (0-54) minus (0-44).
# It is only an auxiliary band used to split 40-54 and extend 55-64 below.
df_pop_temp["AGE_45_54"] = (
    df_pop_temp["AGE_0_17"] + df_pop_temp["P20_POP1824"] + df_pop_temp["P20_POP2539"] + df_pop_temp["P20_POP4054"]
    - (df_pop_temp["P20_POP0014"] + df_pop_temp["P20_POP1529"] + df_pop_temp["P20_POP3044"])
)

# Half of the 45-54 band is attributed to each side of age 50
half_of_45_54 = 0.5 * df_pop_temp["AGE_45_54"]
df_pop_temp["AGE_40_49"] = df_pop_temp["P20_POP4054"] - half_of_45_54  # approx. 40-49
df_pop_temp["AGE_50_64"] = df_pop_temp["P20_POP5564"] + half_of_45_54  # approx. 50-64

# Expose every band under the short label used in `age_groups`
band_sources = {
    "0_17": "AGE_0_17",
    "18_29": "AGE_18_29",
    "30_39": "AGE_30_39",
    "40_49": "AGE_40_49",
    "50_64": "AGE_50_64",
    "65_74": "AGE_65_74",
    "75P": "P20_POP75P",
}
for short_label, source_column in band_sources.items():
    df_pop_temp[short_label] = df_pop_temp[source_column]

# Identification columns followed by one column per age group
final_cols = ["IRIS", "REG", "DEP", "LIBCOM", "LIBIRIS", *age_groups]

df_pop = df_pop_temp[final_cols]


In [5]:
# Look up (not compute) the median income of each age group, ordered like
# `age_groups`, as a float array so the OT routine never receives object dtype.
age_income_medians = (
    df_income_age.set_index("AGE_GROUP")
    .loc[age_groups, "MEDIAN_INCOME"]
    .to_numpy(dtype=float)
)

# IRIS identifiers present in both tables, in sorted order
common_iris = sorted(set(df_income["IRIS"]).intersection(df_pop["IRIS"]))

# Index both tables by IRIS once, instead of scanning each full table with a
# boolean mask on every loop iteration (which made the loop quadratic).
# drop_duplicates keeps the first row per IRIS, matching the previous
# "take the first matching row" behaviour.
income_by_iris = df_income.drop_duplicates(subset="IRIS").set_index("IRIS")
pop_by_iris = df_pop.drop_duplicates(subset="IRIS").set_index("IRIS")

# Row i of both matrices corresponds to common_iris[i]. Selecting only the
# numeric columns before converting yields float arrays (a whole-row Series
# mixing strings and numbers would be object dtype).
income_decile_matrix = income_by_iris.loc[common_iris, decile_cols].to_numpy(dtype=float)
age_count_matrix = pop_by_iris.loc[common_iris, age_groups].to_numpy(dtype=float)

# Compute joint distributions
joint_distributions = []
iris_ids = []
skipped_iris = []  # IRIS whose age counts cannot be normalised

for iris, age_counts, income_deciles in tqdm(
    zip(common_iris, age_count_matrix, income_decile_matrix),
    total=len(common_iris),
):
    # Age group weights (normalized). A zero or NaN total would produce NaN
    # weights, so such an IRIS is recorded and skipped instead.
    total_population = age_counts.sum()
    if not total_population > 0:
        skipped_iris.append(iris)
        continue
    age_weights = age_counts / total_population

    # Joint distribution returned by the project helper for this IRIS
    pi = reconstruct_joint_distribution_ot(age_weights, income_deciles, age_income_medians)

    joint_distributions.append(pi)
    iris_ids.append(iris)

if skipped_iris:
    print(f"Skipped {len(skipped_iris)} IRIS with a non-positive or missing population total")

# Stack along a new leading axis: entry k belongs to iris_ids[k]
# (presumably giving shape (n_iris, n_age_groups, n_income_deciles) -- confirm
# against reconstruct_joint_distribution_ot).
joint_distributions = np.stack(joint_distributions)


  0%|          | 0/14703 [00:00<?, ?it/s]

In [6]:
# Pairwise distance matrix on a subset only: the first N_DISTANCE_SUBSET joint
# distributions. The size is a named constant so it can be tuned in one place.
N_DISTANCE_SUBSET = 1000

# Keep the returned matrix in a variable so later cells can reuse it without
# recomputing; the last line still displays it as before.
distance_matrix_subset = compute_and_save_distance_matrix(joint_distributions[:N_DISTANCE_SUBSET])
distance_matrix_subset

Computing pairwise distance matrix...:   0%|          | 0/1000 [00:00<?, ?it/s]

array([[0.        , 1.75111964, 1.90899429, ..., 2.73203544, 2.56326438,
        5.52496944],
       [1.75111964, 0.        , 1.94374102, ..., 2.13301804, 2.44706484,
        4.66401493],
       [1.90899429, 1.94374102, 0.        , ..., 2.2127974 , 2.40139508,
        4.70093659],
       ...,
       [2.73203544, 2.13301804, 2.2127974 , ..., 0.        , 1.31374488,
        3.44032596],
       [2.56326438, 2.44706484, 2.40139508, ..., 1.31374488, 0.        ,
        2.51698014],
       [5.52496944, 4.66401493, 4.70093659, ..., 3.44032596, 2.51698014,
        0.        ]], shape=(1000, 1000))

In [8]:
# Run the second distance-matrix helper on ALL joint distributions (no slicing,
# unlike the 1000-item subset passed to compute_and_save_distance_matrix).
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt and the
# execution counter jumps from 6 to 8 -- presumably the full computation is too
# slow to finish interactively. Confirm, and consider guarding it behind a cache
# so that Restart & Run All stays feasible.
compute_and_save_distance_matrix2(joint_distributions)

KeyboardInterrupt: 