# Join Grid-to-District Mapping with Demographics

Produces a cell-level demographic dataset by joining the grid-to-district mapping with
district-level demographic data. The result allows direct lookup of demographic features
by grid cell `(x_grid, y_grid)`, which is needed for the demographic context-aware g(D)
estimator used in the causal fairness term F_causal.

**Inputs**:
- `source_data/grid_to_district_mapping.pkl` — cell-to-district mapping (from `clean_grid_to_district_mapping.ipynb`)
- `source_data/all_demographics_by_district.csv` — district-level demographic data

**Outputs**:
- `source_data/cell_demographics.pkl` — cell-level demographic features (grid array + dict lookups)
- `source_data/cell_demographics.sample.json` — human-readable schema sample (git-tracked)

## 1. Load source data

In [1]:
import pandas as pd
import numpy as np
import pickle
import json
from pathlib import Path

# Resolve the project root from the current working directory. When the
# notebook is launched from a directory named `demographic_data` the root is
# two levels up; from a directory named `data` it is one level up; otherwise
# the working directory itself is taken as the root.
_LEVELS_BELOW_ROOT = {'demographic_data': 2, 'data': 1}

PROJECT_ROOT = Path.cwd()
for _ in range(_LEVELS_BELOW_ROOT.get(PROJECT_ROOT.name, 0)):
    PROJECT_ROOT = PROJECT_ROOT.parent

# All inputs and outputs of this notebook live under <root>/source_data.
_SOURCE_DATA_DIR = PROJECT_ROOT / 'source_data'
GRID_MAPPING_PKL = _SOURCE_DATA_DIR / 'grid_to_district_mapping.pkl'
DEMOGRAPHICS_CSV = _SOURCE_DATA_DIR / 'all_demographics_by_district.csv'
OUTPUT_PKL = _SOURCE_DATA_DIR / 'cell_demographics.pkl'
OUTPUT_JSON = _SOURCE_DATA_DIR / 'cell_demographics.sample.json'

# Grid dimensions used for the first two axes of the demographics array.
GRID_ROWS = 48
GRID_COLS = 90

In [2]:
# Load grid-to-district mapping.
# NOTE: unpickling runs code embedded in the file, so only load a mapping
# file generated by this project.
with open(GRID_MAPPING_PKL, 'rb') as f:
    grid_mapping = pickle.load(f)

# Unpack the entries used by this notebook. `cell_to_district` maps an
# (x, y) cell to a district name; `valid_mask` is indexed as [x, y] and is
# truthy for mapped cells.
cell_to_district = grid_mapping['cell_to_district']
valid_cells = grid_mapping['valid_cells']
valid_mask = grid_mapping['valid_mask']
district_names = grid_mapping['district_names']
district_to_id = grid_mapping['district_to_id']
district_id_grid = grid_mapping['district_id_grid']

# GRID_ROWS/GRID_COLS are hard-coded in this notebook rather than read from
# the mapping, so fail early if the loaded mask was built for another grid.
if valid_mask.shape != (GRID_ROWS, GRID_COLS):
    raise ValueError(
        f'valid_mask has shape {valid_mask.shape}, '
        f'expected {(GRID_ROWS, GRID_COLS)}'
    )

print(f'Grid mapping: {len(cell_to_district)} mapped cells, '
      f'{len(district_names)} districts')

Grid mapping: 2605 mapped cells, 10 districts


In [3]:
# Read the district-level demographics; the first CSV column is the row index.
demo_df = pd.read_csv(DEMOGRAPHICS_CSV, index_col=0)
n_districts, n_columns = demo_df.shape
print(f'Demographics: {n_districts} districts, {n_columns} columns')

district_list = sorted(demo_df['DistrictName'].values)
print(f'Districts: {district_list}')

# Every column other than the DistrictName join key is treated as a feature.
feature_cols = [name for name in demo_df.columns if name != 'DistrictName']
print(f'\nFeature columns ({len(feature_cols)}):')
for col, dtype in demo_df[feature_cols].dtypes.items():
    print(f'  {col}: {dtype}')

# Last expression of the cell: renders a preview of the table.
demo_df.head()

Demographics: 10 districts, 14 columns
Districts: ["Bao'an", 'Dapeng', 'Futian', 'Guangming', 'Longgang', 'Longhua', 'Luohu', 'Nanshan', 'Pingshan', 'Yantian']

Feature columns (13):
  AreaKm2: float64
  YearEndPermanentPop10k: float64
  RegisteredPermanentPop10k: float64
  NonRegisteredPermanentPop10k: float64
  PopDensityPerKm2: int64
  HouseholdRegisteredPop10k: float64
  MalePop10k: float64
  FemalePop10k: float64
  SexRatio100: float64
  EmployeeCompensation100MYuan: float64
  AvgEmployedPersons: int64
  AvgHousingPricePerSqM: float64
  GDPin10000Yuan: int64


Unnamed: 0,DistrictName,AreaKm2,YearEndPermanentPop10k,RegisteredPermanentPop10k,NonRegisteredPermanentPop10k,PopDensityPerKm2,HouseholdRegisteredPop10k,MalePop10k,FemalePop10k,SexRatio100,EmployeeCompensation100MYuan,AvgEmployedPersons,AvgHousingPricePerSqM,GDPin10000Yuan
0,Futian,78.66,150.17,95.35,54.82,19091,98.97,49.4,49.57,99.65,552.17,557197,48902.66667,35572870
1,Luohu,78.75,100.4,59.18,41.22,12749,61.19,30.54,30.65,99.61,181.27,174393,48108.0,19724939
2,Yantian,74.91,22.65,6.66,15.98,3024,7.2,3.62,3.58,101.11,31.56,30965,50334.33333,5375327
3,Nanshan,187.47,135.63,81.02,54.61,7235,85.79,45.55,40.24,113.21,462.45,329039,49847.83333,38452711
4,Bao'an,396.61,301.71,47.75,253.96,7607,50.94,26.07,24.87,104.76,164.28,186976,50647.66667,30038215


## 2. Validate join keys

Verify that every district in the grid mapping has a corresponding row in the demographic data.

In [4]:
# Build district name -> demographic row lookup.
# verify_integrity=True raises ValueError if a DistrictName occurs more than
# once. Without it, a duplicated district would make
# `demo_by_name.loc[name, feature_cols]` return a DataFrame instead of a
# single row and break the cell-level join with an obscure shape error.
demo_by_name = demo_df.set_index('DistrictName', verify_integrity=True)

# Check that all mapped districts exist in demographic data
mapped_districts = set(cell_to_district.values())
demo_districts = set(demo_by_name.index)

# A mapped district without demographics cannot be joined: abort.
missing = mapped_districts - demo_districts
if missing:
    raise ValueError(f'Districts in grid mapping but not in demographics: {missing}')

# Demographic rows with no grid cells are harmless; just report them.
extra = demo_districts - mapped_districts
if extra:
    print(f'Note: Districts in demographics with no grid cells: {extra}')

print(f'Join keys validated: {len(mapped_districts)} mapped districts, '
      f'all found in demographic data.')

Join keys validated: 10 mapped districts, all found in demographic data.


## 3. Build cell-level demographic data

For each valid grid cell, look up its district and copy that district's demographic features.
All cells in the same district get identical demographic values (district-level granularity).

In [5]:
# Builds three views of the same district-level data:
#   1. demographics_grid: numpy array (GRID_ROWS, GRID_COLS, n_features),
#      NaN for unmapped cells, district demographics for mapped cells
#   2. cell_to_demographics: dict {(x, y): {feature: value, ...}},
#      only for mapped cells
#   3. district_demographics: dict {district_name: {feature: value, ...}},
#      one entry per district for compact lookup
n_features = len(feature_cols)
demographics_grid = np.full((GRID_ROWS, GRID_COLS, n_features), np.nan, dtype=np.float64)

# Every cell of a district receives the same values, so do the pandas lookup
# once per district instead of once (or twice) per cell.
district_demographics = {}
district_vectors = {}
for district_name in demo_by_name.index:
    row = demo_by_name.loc[district_name, feature_cols]
    district_demographics[district_name] = row.to_dict()
    district_vectors[district_name] = row.values

# Single pass over the mapped cells fills both the array and the dict lookup.
cell_to_demographics = {}
for (x, y), district_name in cell_to_district.items():
    demographics_grid[x, y, :] = district_vectors[district_name]
    # Copy so each cell owns its dict; mutating one cell's entry must not
    # affect other cells or district_demographics.
    cell_to_demographics[(x, y)] = dict(district_demographics[district_name])

print(f'demographics_grid: shape {demographics_grid.shape}')
print(f'  Valid cells (non-NaN): {(~np.isnan(demographics_grid[:, :, 0])).sum()}')
print(f'  Unmapped cells (NaN): {np.isnan(demographics_grid[:, :, 0]).sum()}')
print(f'cell_to_demographics: {len(cell_to_demographics)} entries')
print(f'district_demographics: {len(district_demographics)} entries')

demographics_grid: shape (48, 90, 13)
  Valid cells (non-NaN): 2605
  Unmapped cells (NaN): 1715
cell_to_demographics: 2605 entries
district_demographics: 10 entries


## 4. Validate output

In [6]:
# Every cell flagged in valid_mask must be fully populated, and every other
# cell must be entirely NaN.
for x in range(GRID_ROWS):
    for y in range(GRID_COLS):
        nan_flags = np.isnan(demographics_grid[x, y])
        if not valid_mask[x, y]:
            assert nan_flags.all(), \
                f'Unmapped cell ({x},{y}) should be all NaN'
            continue
        assert not nan_flags.any(), \
            f'Mapped cell ({x},{y}) has NaN features'

# Compare the first five mapped cells with the district rows they came from.
first_five = list(cell_to_district.items())[:5]
for (x, y), district_name in first_five:
    expected = demo_by_name.loc[district_name, feature_cols].values.astype(np.float64)
    actual = demographics_grid[x, y, :]
    assert np.allclose(expected, actual), \
        f'Value mismatch at ({x},{y}) for district {district_name}'

# Group cells by district, then require every cell of a district to match
# that district's first cell.
from collections import defaultdict
cells_by_district = defaultdict(list)
for cell, name in cell_to_district.items():
    cells_by_district[name].append(cell)

for name, cells in cells_by_district.items():
    first, *others = cells
    ref = demographics_grid[first[0], first[1], :]
    for x, y in others:
        assert np.allclose(ref, demographics_grid[x, y, :]), \
            f'Intra-district mismatch in {name} between {first} and ({x},{y})'

# Geographic sanity: report the district and density of cell (0, 0), if mapped.
origin = (0, 0)
if origin in cell_to_district:
    d = cell_to_district[origin]
    pop = cell_to_demographics[origin]['PopDensityPerKm2']
    print(f'Cell (0, 0) [south-west]: {d}, density={pop:,.0f}/km\u00b2')

# Per-feature min/max over the mapped cells only.
print(f'\nFeature ranges across valid cells:')
valid_data = demographics_grid[valid_mask]
for col, vals in zip(feature_cols, valid_data.T):
    print(f'  {col}: [{vals.min():.2f}, {vals.max():.2f}]')

print(f'\nAll validation checks passed.')

Cell (0, 0) [south-west]: Nanshan, density=7,235/km²

Feature ranges across valid cells:
  AreaKm2: [74.91, 396.61]
  YearEndPermanentPop10k: [14.09, 301.71]
  RegisteredPermanentPop10k: [3.93, 95.35]
  NonRegisteredPermanentPop10k: [10.16, 253.96]
  PopDensityPerKm2: [477.00, 19091.00]
  HouseholdRegisteredPop10k: [3.97, 98.97]
  MalePop10k: [2.13, 49.40]
  FemalePop10k: [1.84, 49.57]
  SexRatio100: [99.61, 115.02]
  EmployeeCompensation100MYuan: [2.58, 552.17]
  AvgEmployedPersons: [4526.00, 557197.00]
  AvgHousingPricePerSqM: [48108.00, 51138.67]
  GDPin10000Yuan: [3074578.00, 38452711.00]

All validation checks passed.


## 5. Save pickle file

| Key | Type | Description |
|-----|------|-------------|
| `demographics_grid` | `np.ndarray (48,90,13) float64` | Demographic features per cell; NaN for unmapped cells |
| `feature_names` | `list[str]` | Column names for the feature dimension (length 13) |
| `cell_to_demographics` | `dict{(int,int): dict}` | (x,y) to {feature: value} for mapped cells |
| `district_demographics` | `dict{str: dict}` | District name to {feature: value} (compact, 10 entries) |
| `district_names` | `list[str]` | Canonical district ordering (from grid mapping) |
| `district_to_id` | `dict{str: int}` | District name to numeric ID |

In [7]:
# Bundle the cell-level arrays/dicts together with the district ordering and
# IDs carried over from the grid mapping.
output_data = dict(
    demographics_grid=demographics_grid,
    feature_names=feature_cols,
    cell_to_demographics=cell_to_demographics,
    district_demographics=district_demographics,
    district_names=district_names,
    district_to_id=district_to_id,
)

# Create the output directory if needed, then write the pickle.
OUTPUT_PKL.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PKL, 'wb') as f:
    pickle.dump(output_data, f, protocol=pickle.HIGHEST_PROTOCOL)

pkl_size = OUTPUT_PKL.stat().st_size
print(f'Saved: {OUTPUT_PKL}')
print(f'Size: {pkl_size:,} bytes')

Saved: /home/robert/FAMAIL/source_data/cell_demographics.pkl
Size: 850,198 bytes


## 6. Save JSON sample file

In [8]:
def _rounded(value):
    """Return `value` as a plain float rounded to 4 decimals (JSON-friendly)."""
    return round(float(value), 4)


# Pick a few example cells: the first three mapped cells in sorted order and
# the first two unmapped cells in row-major order.
sample_cell = sorted(valid_cells)[:3]
sample_unmapped = [
    (x, y)
    for x in range(GRID_ROWS)
    for y in range(GRID_COLS)
    if (x, y) not in valid_cells
][:2]

# The dict-based samples only show the first four features to stay compact.
leading_features = feature_cols[:4]

grid_sample_mapped = {
    f'[{x}][{y}]': {
        col: _rounded(demographics_grid[x, y, i])
        for i, col in enumerate(feature_cols)
    }
    for x, y in sample_cell
}
grid_sample_unmapped = {f'[{x}][{y}]': 'all NaN' for x, y in sample_unmapped}

cell_sample = {
    f'({x}, {y})': {
        col: _rounded(cell_to_demographics[(x, y)][col])
        for col in leading_features
    }
    for x, y in sample_cell
}

district_sample = {
    name: {
        col: _rounded(district_demographics[name][col])
        for col in leading_features
    }
    for name in sorted(district_demographics.keys())[:3]
}

json_data = {
    '_description': (
        'Schema sample for cell_demographics.pkl. '
        'Cell-level demographic features joined from district-level data.'
    ),
    '_sources': {
        'grid_mapping': 'source_data/grid_to_district_mapping.pkl',
        'demographics': 'source_data/all_demographics_by_district.csv'
    },
    '_note': (
        'All cells in the same district have identical demographic values. '
        'This is district-level granularity mapped to cell level.'
    ),
    'feature_names': feature_cols,
    'demographics_grid': {
        'dtype': 'float64',
        'shape': [GRID_ROWS, GRID_COLS, n_features],
        'unmapped_value': 'NaN',
        'sample_mapped': grid_sample_mapped,
        'sample_unmapped': grid_sample_unmapped
    },
    'cell_to_demographics (sample)': cell_sample,
    'district_demographics': district_sample,
    'district_names': district_names,
    'district_to_id': district_to_id
}

# ensure_ascii=False keeps any non-ASCII text readable in the tracked file.
with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
    json.dump(json_data, f, indent=2, ensure_ascii=False)

print(f'Saved: {OUTPUT_JSON}')

Saved: /home/robert/FAMAIL/source_data/cell_demographics.sample.json


## 7. Verify saved files can be loaded

In [9]:
# Reload and verify pickle (a file this notebook has just written).
with open(OUTPUT_PKL, 'rb') as f:
    loaded = pickle.load(f)

print('Pickle contents:')
print(f'  Keys: {list(loaded.keys())}')
print(f'  demographics_grid: {loaded["demographics_grid"].shape} '
      f'{loaded["demographics_grid"].dtype}')
print(f'  feature_names: {loaded["feature_names"]}')
print(f'  cell_to_demographics: {len(loaded["cell_to_demographics"])} entries')
print(f'  district_demographics: {len(loaded["district_demographics"])} districts')

# Demo: look up demographics for a cell (the first mapped cell in sorted
# order), once through the array and once through the dict.
test_cell = sorted(valid_cells)[0]
x, y = test_cell
print(f'\nDemo lookup for cell ({x}, {y}):')
print(f'  District: {cell_to_district[(x, y)]}')
print(f'  Via grid array: {dict(zip(loaded["feature_names"], loaded["demographics_grid"][x, y]))}')
print(f'  Via dict: {loaded["cell_to_demographics"][(x, y)]}')

# Demo: get all valid cells as feature matrix. The mask is rebuilt from the
# saved array itself (non-NaN first feature), so no separate mask is needed.
valid_mask_loaded = ~np.isnan(loaded['demographics_grid'][:, :, 0])
feature_matrix = loaded['demographics_grid'][valid_mask_loaded]
print(f'\nFeature matrix for valid cells: {feature_matrix.shape}')

# Reload JSON. The file is written as UTF-8 with ensure_ascii=False, so it
# must be read back as UTF-8 rather than the platform default encoding.
with open(OUTPUT_JSON, 'r', encoding='utf-8') as f:
    json_loaded = json.load(f)
print(f'\nJSON sample keys: {list(json_loaded.keys())}')

Pickle contents:
  Keys: ['demographics_grid', 'feature_names', 'cell_to_demographics', 'district_demographics', 'district_names', 'district_to_id']
  demographics_grid: (48, 90, 13) float64
  feature_names: ['AreaKm2', 'YearEndPermanentPop10k', 'RegisteredPermanentPop10k', 'NonRegisteredPermanentPop10k', 'PopDensityPerKm2', 'HouseholdRegisteredPop10k', 'MalePop10k', 'FemalePop10k', 'SexRatio100', 'EmployeeCompensation100MYuan', 'AvgEmployedPersons', 'AvgHousingPricePerSqM', 'GDPin10000Yuan']
  cell_to_demographics: 2605 entries
  district_demographics: 10 districts

Demo lookup for cell (0, 0):
  District: Nanshan
  Via grid array: {'AreaKm2': np.float64(187.47), 'YearEndPermanentPop10k': np.float64(135.63), 'RegisteredPermanentPop10k': np.float64(81.02), 'NonRegisteredPermanentPop10k': np.float64(54.61), 'PopDensityPerKm2': np.float64(7235.0), 'HouseholdRegisteredPop10k': np.float64(85.79), 'MalePop10k': np.float64(45.55), 'FemalePop10k': np.float64(40.24), 'SexRatio100': np.float64(