# Clustering over INCOME and AGE across the IRIS

## Import

In [1]:
!pip install openpyxl # for pd.read_excel
!pip install ipywidgets # for tqdm.notebook




In [2]:
import numpy as np
import ot    # pip install pot
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import sys
sys.path.append("../src")
from clustering_methods import wbarycenter_clustering_nd
from utils import reconstruct_joint_distribution_ot

from tqdm.notebook import tqdm
import geopandas as gpd


### Income Deciles per IRIS

In [3]:
# INCOME DECILES PER IRIS

# Read the raw table (semicolon-separated, comma as decimal mark)
df_income_temp = pd.read_csv(
    '../data/BASE_TD_FILO_DEC_IRIS_2020.csv',
    sep=';',
    decimal=',',
)

# The nine decile columns, from D1 to D9 (the 5th one is stored as the median)
decile_cols = [
    'DEC_D120', 'DEC_D220', 'DEC_D320', 'DEC_D420',
    'DEC_MED20',
    'DEC_D620', 'DEC_D720', 'DEC_D820', 'DEC_D920',
]

# Force every decile column to numeric; anything unparsable becomes NaN
for decile_col in decile_cols:
    df_income_temp[decile_col] = pd.to_numeric(df_income_temp[decile_col], errors='coerce')

# Keep only the rows whose nine deciles are all present
# (the original note reports 1319 rows being discarded here)
has_all_deciles = df_income_temp[decile_cols].notna().all(axis=1)
df_income = df_income_temp.loc[has_all_deciles].reset_index(drop=True)

# One row per IRIS, one column per decile
distributions = df_income.loc[:, decile_cols].to_numpy()
n_distributions = len(distributions)

# Shared helpers used further down
support = np.arange(1, 10)  # D1 to D9
age_groups = ["0_17", "18_29", "30_39", "40_49", "50_64", "65_74", "75P"]
colors = ['cornflowerblue', 'forestgreen', 'red', 'deeppink', 'orange', 'brown', 'purple']

### Median Income per age

In [4]:
# INCOME PER AGE GROUP
# (the workbook name ends in "-med", so these are presumably medians — confirm)

# Read the workbook, ignoring its first three rows
df_income_age = pd.read_excel(
    "../data/reve-niv-vie-individu-age-med.xlsx",
    skiprows=3,
)


In [5]:
# NOTE: this cell rebinds `df_income_age`, so it expects the freshly loaded
# workbook as input; re-run the loading cell before re-running this one.

# Mapping from the workbook's age labels to the short codes listed in `age_groups`
renaming_dict = {
    "Moins de 18 ans": "0_17",
    "De 18 à 29 ans": "18_29",
    "De 30 à 39 ans": "30_39",
    "De 40 à 49 ans": "40_49",
    "De 50 à 64 ans": "50_64",
    "De 65 à 74 ans": "65_74",
    "75 ans et plus": "75P"
}

# Age label + 2020 figure only, with the 2020 column given a readable name
age_table = df_income_age[["Tranche d’âge", '2020³ ⁴']].rename(
    columns={'2020³ ⁴': "MEDIAN_INCOME"}
)

# Translate the age labels into the short codes
age_table = age_table.assign(
    AGE_GROUP=age_table["Tranche d’âge"].replace(renaming_dict)
)

# Rows whose label is not one of the target groups are metadata lines: drop them,
# and keep only the two columns used downstream
is_target_group = age_table["AGE_GROUP"].isin(age_groups)
df_income_age = (
    age_table.loc[is_target_group, ["AGE_GROUP", "MEDIAN_INCOME"]]
    .reset_index(drop=True)
)


### Population per age 

To apply our algorithms using income by age group, we need to estimate the population in each corresponding age range for every IRIS. However, the INSEE population dataset provides age groupings that do not exactly match those used in the national income-by-age statistics. To reconcile the two, we reconstruct the target age groups — (0–17), (18–29), (30–39), (40–49), (50–64), (65–74), and 75+ — by combining and subtracting the groups available in the population dataset. Where no exact combination exists, we rely on an approximation: the 45–54 group (itself obtained by subtraction) is assumed to split evenly, with half assigned to 40–49 and half to 50–64. This harmonization step is essential to later compute joint distributions of age and income at the IRIS level.

In [6]:
# POPULATION PER AGE

# Read the first sheet of the workbook, ignoring its first five rows
df_pop_temp = pd.read_excel(
    "../data/base-ic-evol-struct-pop-2020.xlsx",
    sheet_name=0,
    skiprows=5,
)


In [7]:
# Short alias only: every derived column below is still written onto df_pop_temp
pop = df_pop_temp

# --- Groups obtained by adding / subtracting the published columns ---
ages_0_17 = pop["P20_POP0002"] + pop["P20_POP0305"] + pop["P20_POP0610"] + pop["P20_POP1117"]
ages_18_29 = pop["P20_POP0014"] + pop["P20_POP1529"] - ages_0_17
ages_30_39 = pop["P20_POP2539"] + pop["P20_POP1824"] - ages_18_29
ages_65_74 = pop["P20_POP65P"] - pop["P20_POP75P"]
ages_75_plus = pop["P20_POP75P"]  # published as-is, nothing to derive

# --- Helper group 45–54, used to split 40–49 from 50–64 ---
ages_45_54 = (
    ages_0_17 + pop["P20_POP1824"] + pop["P20_POP2539"] + pop["P20_POP4054"]
    - (pop["P20_POP0014"] + pop["P20_POP1529"] + pop["P20_POP3044"])
)

# Approximation: half of the 45–54 group is attributed to each side of age 50
ages_40_49 = pop["P20_POP4054"] - 0.5 * ages_45_54  # 40–54 minus half of 45–54
ages_50_64 = pop["P20_POP5564"] + 0.5 * ages_45_54  # 55–64 plus half of 45–54

# Write the intermediate AGE_* columns, then the short-named columns listed in
# `age_groups` (same columns, same creation order as before)
derived_columns = {
    "AGE_0_17": ages_0_17,
    "AGE_18_29": ages_18_29,
    "AGE_30_39": ages_30_39,
    "AGE_65_74": ages_65_74,
    "AGE_45_54": ages_45_54,
    "AGE_40_49": ages_40_49,
    "AGE_50_64": ages_50_64,
    "0_17": ages_0_17,
    "18_29": ages_18_29,
    "30_39": ages_30_39,
    "40_49": ages_40_49,
    "50_64": ages_50_64,
    "65_74": ages_65_74,
    "75P": ages_75_plus,
}
for column_name, column_values in derived_columns.items():
    pop[column_name] = column_values

# Identification columns followed by the seven harmonised age groups
final_cols = ["IRIS", "REG", "DEP", "LIBCOM", "LIBIRIS"] + age_groups

df_pop = df_pop_temp[final_cols]


## Joint Age-Income Distributions

### Reconstructing the joint age–income distribution

For each IRIS, we reconstruct a joint probability distribution π over age groups and income levels. The marginal distributions are:
* the age distribution of the IRIS (based on population counts per age group), and
* a uniform distribution over the income deciles observed in the IRIS.

We use optimal transport to find the coupling π that best aligns the income levels with the national median income for each age group, minimizing the cost:

$$
C_{i,j} = |R_j - m(T_i)|
$$

where $R_j$ is the j-th income level (decile), and $m(T_i)$ is the national median income for age group $T_i$.

The result is a joint distribution π (a matrix), interpreted as the most plausible alignment between age and income in the IRIS, under minimal deviation from national trends.

### Computation for Each IRIS

In [8]:
# Show the labels found in the income table next to the labels we expect
print(df_income_age["AGE_GROUP"].unique())
print(age_groups)

# Fail fast instead of relying on a visual comparison: the next step looks up
# one median per entry of `age_groups`, so every group must be present, and
# present only once (`.unique()` above would hide repeated labels).
available_age_groups = set(df_income_age["AGE_GROUP"])
missing_age_groups = [group for group in age_groups if group not in available_age_groups]
assert not missing_age_groups, f"Age groups missing from the income table: {missing_age_groups}"
assert df_income_age["AGE_GROUP"].is_unique, "Duplicated age groups in the income table"


['0_17' '18_29' '30_39' '40_49' '50_64' '65_74' '75P']
['0_17', '18_29', '30_39', '40_49', '50_64', '65_74', '75P']


In [9]:
# National median income per age group, ordered like `age_groups`.
# dtype=float: the column comes from a spreadsheet that also contained
# non-data rows, so it may be stored with an object dtype.
age_income_medians = (
    df_income_age.set_index("AGE_GROUP")
    .loc[age_groups, "MEDIAN_INCOME"]
    .to_numpy(dtype=float)
)

# IRIS identifiers present in both datasets (sorted for a stable order)
common_iris = sorted(set(df_income["IRIS"]).intersection(df_pop["IRIS"]))

# Index both tables by IRIS once, instead of scanning the full frames for every
# identifier inside the loop. drop_duplicates keeps the first row of each IRIS,
# i.e. the row a per-IRIS filter followed by `.iloc[0]` would select.
income_by_iris = df_income.drop_duplicates(subset="IRIS").set_index("IRIS")
pop_by_iris = df_pop.drop_duplicates(subset="IRIS").set_index("IRIS")

# Numeric matrices aligned with `common_iris` (row k <-> common_iris[k]).
# Extracting whole columns keeps a float dtype; pulling single rows out of a
# frame that also holds text columns would yield object arrays.
decile_matrix = income_by_iris.loc[common_iris, decile_cols].to_numpy(dtype=float)
age_count_matrix = pop_by_iris.loc[common_iris, age_groups].to_numpy(dtype=float)

# Compute joint distributions
joint_distributions = []
iris_ids = []       # IRIS code of each entry of `joint_distributions`
skipped_iris = []   # IRIS for which no valid age marginal could be built

for row_idx, iris in enumerate(tqdm(common_iris)):
    # Some age groups are derived by subtraction, so a count can come out
    # marginally negative; a probability vector cannot hold negative mass.
    age_counts = np.clip(age_count_matrix[row_idx], 0.0, None)

    # Age group weights (normalized); skip the IRIS when they are undefined
    total_population = age_counts.sum()
    if not np.isfinite(total_population) or total_population <= 0:
        skipped_iris.append(iris)
        continue
    age_weights = age_counts / total_population

    # Income decile values
    income_deciles = decile_matrix[row_idx]

    # Compute the joint distribution π using optimal transport
    pi = reconstruct_joint_distribution_ot(age_weights, income_deciles, age_income_medians)

    joint_distributions.append(pi)
    iris_ids.append(iris)

if skipped_iris:
    print(f"Skipped {len(skipped_iris)} IRIS without a usable age distribution")

if not joint_distributions:
    raise ValueError("No joint distribution could be computed: no usable IRIS shared by the income and population tables")

# Stack the joint distributions into a 3D array: (n_iris, n_age_groups, n_income_deciles)
joint_distributions = np.stack(joint_distributions)


  0%|          | 0/14703 [00:00<?, ?it/s]

In [10]:
# Clustering settings gathered in one place so they are easy to spot and tune.
clustering_params = {
    "n_clusters": 3,
    "n_iter": 5,
    "reg": 0.1,  # presumably the entropic regularisation strength — confirm in clustering_methods
}

# Cluster the per-IRIS joint distributions; the call returns the cluster
# assignments together with the barycenters.
assignments, barycenters = wbarycenter_clustering_nd(
    data=joint_distributions,  # (n_iris, 7, 9)
    **clustering_params,
)


Wasserstein clustering iterations:   0%|          | 0/5 [00:00<?, ?it/s]

  u = (u.T * geometricBar(weights, UKv)).T / UKv


Reassigning clusters:   0%|          | 0/14703 [00:00<?, ?it/s]

Reassigning clusters:   0%|          | 0/14703 [00:00<?, ?it/s]

Reassigning clusters:   0%|          | 0/14703 [00:00<?, ?it/s]

Reassigning clusters:   0%|          | 0/14703 [00:00<?, ?it/s]

Reassigning clusters:   0%|          | 0/14703 [00:00<?, ?it/s]