# Appuyer sur le bouton "Play"

In [None]:
import altair as alt
import geopandas as gpd
import pandas as pd

# 2024 ANES dataset (raw CSV hosted on GitHub)
ANES_URL = 'https://raw.githubusercontent.com/datamisc/ts-2024/main/data.csv'
# Minimum number of respondents a state needs to be kept; None keeps every state
N_THRESHOLD = None
# ANES variable code -> short column name used in this notebook
VARS = {
    "V243001": 'state',  # state
    "V241551": 'gender',  # gender
    # "V241003",  # sex (only 20 % of the sample)
    # 'V241610',  # e.g., political knowledge catch
    "V241612": 'polk_years',  # e.g., "How many years in full term for US Senator"
    "V241613": 'polk_spend',  # e.g., "Federal gov spending least"
    "V241614": 'polk_house',  # party with most members in House
    "V241615": 'polk_senat',   # party with most members in Senate
}
# Knowledge item -> response codes scored as a correct answer
CORRECT_ANSWERS = {
    'polk_years': [6],
    'polk_spend': [4],
    'polk_house': [2],
    'polk_senat': [1],
}


def make_data(raw=None):
    """Compute the political-knowledge gender gap for each state.

    Each knowledge item is scored 1 when the response code is listed in
    ``CORRECT_ANSWERS`` and 0 otherwise.  Any other code, including the
    negative "invalid" codes (-9, -7, ...), therefore counts as an incorrect
    answer.  A respondent's ``polk_score`` is the share of items answered
    correctly.

    Parameters
    ----------
    raw : pandas.DataFrame, optional
        Frame holding at least the columns named by the keys of ``VARS``.
        When omitted, the data is downloaded from ``ANES_URL``.  The frame
        is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per state with the columns ``state``, ``gender_gap`` (mean
        score of gender code 1 minus mean score of gender code 2) and
        ``count`` (number of respondents used), sorted by ``gender_gap`` in
        descending order.
    """
    if raw is None:
        # NOTE(review): read as gzip although the URL ends in .csv; confirm
        # the remote file really is gzip-compressed.
        raw = pd.read_csv(ANES_URL, compression='gzip')
    df = raw[list(VARS)].rename(columns=VARS)

    # Keep male (1) / female (2) respondents; copy so the column assignments
    # below never write into a view of the raw frame.
    df = df[df['gender'].between(1, 2)].copy()

    if N_THRESHOLD:
        # Keep states with at least N_THRESHOLD respondents
        state_sizes = df['state'].value_counts()
        big_states = state_sizes[state_sizes >= N_THRESHOLD].index
        df = df[df['state'].isin(big_states)].copy()

    # Recode correct answers as 1, everything else as 0
    for column, correct_values in CORRECT_ANSWERS.items():
        df[column] = df[column].isin(correct_values).astype(int)
    items = list(CORRECT_ANSWERS)
    df['polk_score'] = df[items].sum(axis=1) / len(items)

    stats = df.groupby(['state', 'gender'])['polk_score'].agg(['mean', 'count'])
    # reindex guarantees both gender columns exist even if one code is absent
    means = stats['mean'].unstack().reindex(columns=[1, 2])
    gap = pd.DataFrame({
        'gender_gap': means[1] - means[2],
        'count': stats['count'].unstack().sum(axis=1),
    })
    gap = gap.sort_values('gender_gap', ascending=False)

    return gap.rename_axis('state').reset_index()[['state', 'gender_gap', 'count']]

# Per-state political-knowledge gender gap computed from the ANES data
df_gap = make_data()

# Uncomment to render the chart in the browser instead of inline
# (handy when working from a terminal editor such as vim).
# alt.renderers.enable("browser")

# NOTE: keep the chart title and subtitle wording in sync with the variable
# that is merged onto the map below.
# US states hexgrid (GeoJSON)
data_hex_url = (
    "https://raw.githubusercontent.com/holtzy/"
    "R-graph-gallery/refs/heads/master/DATA/us_states_hexgrid.geojson.json"
)

gdf = gpd.read_file(data_hex_url)
# Rename the key so it can be joined with df_gap on 'state'
# (presumably both hold the same state codes; verify if states go missing)
gdf = gdf.rename(columns={"iso3166_2": "state"})

# Compute centroids once; they position the text labels
centroids = gdf.geometry.centroid
gdf['centroid_lon'] = centroids.x
gdf['centroid_lat'] = centroids.y

# Merge Gap Variable (left join: states absent from df_gap keep NaN)
gdf = gdf.merge(df_gap, on='state', how='left')

# Flip the sign of the gap (make_data computes gender code 1 minus code 2)
gdf['gender_gap'] = -gdf['gender_gap']

# "<state>\n<gap>%" label; states without a gap value show the state code
# only instead of a misleading "nan%".
gap_pct = (gdf['gender_gap'] * 100).round(1)
gdf['label'] = gdf['state'].where(
    gap_pct.isna(),
    gdf['state'] + "\n" + gap_pct.astype(str) + "%",
)

# Chart Prep

# Symmetric colour domain around zero (NaN gaps are ignored by max())
max_abs_gap = float(gdf['gender_gap'].abs().max())
domain_range = [-max_abs_gap, max_abs_gap]

# Chart size in pixels; the source note is positioned relative to it
CHART_WIDTH = 800
CHART_HEIGHT = 500

## Hexes Layer: one hexagon per row of gdf, coloured by the gender gap
hexes = (
    alt.Chart(gdf)
    .mark_geoshape(stroke="white", strokeWidth=3)
    .encode(
        color=alt.Color(
            "gender_gap:Q",
            # Diverging scale centred on 0 over the symmetric domain
            scale=alt.Scale(scheme="redgrey", domainMid=0, domain=domain_range),
            legend=alt.Legend(title=["Inégalité de", "Genre"])
        ),
        tooltip=["state:N", alt.Tooltip("gender_gap:Q", format=".1%"), "count:Q"]
    )
)

## Labels Layer: state code drawn at each hexagon centroid
hex_labels = (
    alt.Chart(gdf)
    .mark_text(
        fontSize=14,
        fontWeight="bold",
        color="black",
        align="center",
        baseline="middle"
    )
    .encode(
        longitude="centroid_lon:Q",
        latitude="centroid_lat:Q",
        text="state:N"
    )
)

## Reading example: take the Massachusetts figure from the data so the
## subtitle cannot drift away from what the map shows.
sign_note = "Les valeurs négatives indiquent un biais en faveur des hommes."
ma_gap = gdf.loc[gdf["state"] == "MA", "gender_gap"].dropna()
if ma_gap.empty:
    # No usable value for Massachusetts: keep only the sign explanation
    subtitle = [sign_note]
else:
    # Percentage points with a French decimal comma, e.g. "-18,7"
    ma_gap_text = f"{ma_gap.iloc[0] * 100:.1f}".replace(".", ",")
    subtitle = [
        "Au Massachusetts, en 2024, l'écart de connaissance politique entre les hommes et les femmes",
        f"est de {ma_gap_text} points de pourcentage. {sign_note}"
    ]

## Title block
chart_title = alt.TitleParams(
    "Inégalités de connaissance politique aux États-Unis selon le genre",
    subtitle=subtitle,
    fontSize=20,
    subtitleFontSize=14,
    anchor="start",
    fontWeight="bold"
)

## Source note, right-aligned at a fixed pixel position just below the map
source_text = alt.Chart().mark_text(
    align='right',
    baseline='bottom',
    fontSize=12,
    color='gray'
).encode(
    text=alt.value("Source: 2024 American National Election Study"),
    x=alt.value(CHART_WIDTH - 10),
    y=alt.value(CHART_HEIGHT + 15),
)

hexmap = (hexes + hex_labels + source_text).project(
    type="mercator"
).properties(
    width=CHART_WIDTH,
    height=CHART_HEIGHT,
    title=chart_title
).configure_view(stroke=None)

# Last expression of the cell: displays the chart in the notebook
hexmap

In [None]:
# Hack-time try it out!
