In [26]:
import altair as alt
import pandas as pd
import numpy as np
# Route chart data through Altair's 'json' data transformer for every chart below.
# NOTE(review): presumably chosen so the data is not embedded inline in each
# chart spec — confirm this is still wanted.
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

## 0. Data Preprocessing

In [27]:
# Read the semicolon-delimited source file into `df` and preview the first rows.
df = pd.read_csv('dpt2020.csv', delimiter=';')
df.head()

Unnamed: 0,sexe,preusuel,annais,dpt,nombre
0,1,_PRENOMS_RARES,1900,2,7
1,1,_PRENOMS_RARES,1900,4,9
2,1,_PRENOMS_RARES,1900,5,8
3,1,_PRENOMS_RARES,1900,6,23
4,1,_PRENOMS_RARES,1900,7,9


In [28]:
# Coerce the year and count columns to numeric values; entries that cannot be
# parsed become NaN instead of raising.
for numeric_column in ('annais', 'nombre'):
    df[numeric_column] = pd.to_numeric(df[numeric_column], errors='coerce')

# Keep only rows that have both a count and a name, then drop the aggregated
# '_PRENOMS_RARES' bucket.
# NOTE(review): rows whose 'annais' could not be parsed are NOT removed here;
# they stay in `df` with a NaN year.
df = (
    df.dropna(subset=['nombre', 'preusuel'])
      .loc[lambda frame: frame['preusuel'] != '_PRENOMS_RARES']
)

print(df.shape)
df.head()

(3705515, 5)


Unnamed: 0,sexe,preusuel,annais,dpt,nombre
10882,1,A,,XX,27
10883,1,AADAM,,XX,30
10884,1,AADEL,,XX,56
10885,1,AADIL,1983.0,84,3
10886,1,AADIL,1992.0,92,3


## Visualization 1

In [29]:
# Input table for viz 1: total 'nombre' per (year, name) pair, i.e. summed
# over every department and both genders.
df_grouped = df.groupby(['annais', 'preusuel'], as_index=False)['nombre'].sum()

# Get the top 15 names for each year to make the visualization manageable
def get_top_names_by_year(data, top_n=15):
    """Return, for every year, the ``top_n`` rows with the largest 'nombre'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns 'annais' (the year used as grouping
        key) and 'nombre' (the value rows are ranked by).
    top_n : int, default 15
        Maximum number of rows kept per year.

    Returns
    -------
    pandas.DataFrame
        The selected rows, year after year in order of each year's first
        appearance in ``data``; within a year rows are ordered by 'nombre'
        descending. The index is reset to 0..n-1. Rows whose 'annais' is
        missing are never selected. If there is nothing to rank (empty
        input, or no non-missing year) an empty frame with the same columns
        is returned.
    """
    # One pass over the frame: sort=False keeps years in first-appearance
    # order, and groupby skips missing keys.
    top_per_year = [
        year_rows.nlargest(top_n, 'nombre')
        for _, year_rows in data.groupby('annais', sort=False)
    ]
    if not top_per_year:
        # pd.concat raises ValueError on an empty list; return an empty
        # frame with the input's columns instead.
        return data.iloc[0:0].reset_index(drop=True)
    return pd.concat(top_per_year, ignore_index=True)

# Per-year top-15 table that feeds the chart in Visualization 1.
df_top = get_top_names_by_year(data=df_grouped, top_n=15)


In [30]:
# Build a fixed name -> hex colour lookup so a name keeps the same colour
# whichever year is displayed.
unique_names = df_top['preusuel'].unique()
np.random.seed(42)  # fixed seed: the same colours are drawn on every run
colors = []
for _ in unique_names:
    rgb_value = np.random.randint(0, 16777215)  # upper bound is exclusive
    colors.append(f'#{rgb_value:06x}')
color_mapping = dict(zip(unique_names, colors))

# Store each row's colour alongside the data.
df_top['color'] = df_top['preusuel'].map(color_mapping)

# Create the interactive histogram with time slider.
# The slider covers exactly the range of years present in df_top.
year_min = int(df_top['annais'].min())
year_max = int(df_top['annais'].max())
slider = alt.binding_range(min=year_min, max=year_max, step=1, name='Year: ')

# Start on the year 2000 when the data covers it; otherwise clamp to the
# nearest end of the range so the initial selection always lies inside the
# slider's bounds.
initial_year = min(max(2000, year_min), year_max)
select_year = alt.param(bind=slider, value=initial_year)

# Base chart shared by the bar and label layers:
#   1. keep only the rows of the year picked on the slider,
#   2. rank those rows by 'nombre' (largest first),
#   3. keep the rows ranked 15 or better.
year_is_selected = alt.datum.annais == select_year
base = (
    alt.Chart(df_top)
    .add_params(select_year)
    .transform_filter(year_is_selected)
    .transform_window(
        rank='rank(nombre)',
        sort=[alt.SortField('nombre', order='descending')],
    )
    .transform_filter(alt.datum.rank <= 15)
)

# Main histogram: one horizontal bar per name, bar length = number of births.
births_x = alt.X(
    'nombre:Q',
    title='Number of Births',
    scale=alt.Scale(nice=True),
)
names_y = alt.Y(
    'preusuel:N',
    title='Name',
    sort=alt.SortField('nombre', order='descending'),
)
# Explicit domain/range pairs every name with its colour from color_mapping.
name_color = alt.Color(
    'preusuel:N',
    scale=alt.Scale(
        domain=list(unique_names),
        range=list(color_mapping.values()),
    ),
    legend=None,
)
histogram_title = alt.TitleParams(
    text=['Baby Name Popularity by Year', 'Top 15 Names'],
    fontSize=16,
    anchor='start',
)

histogram = (
    base.mark_bar(stroke='white', strokeWidth=1)
    .encode(
        x=births_x,
        y=names_y,
        color=name_color,
        tooltip=['preusuel:N', 'nombre:Q', 'annais:O'],
    )
    .properties(width=600, height=400, title=histogram_title)
)

# Value labels: print each bar's count just to the right of the bar end.
label_mark = dict(
    align='left',
    baseline='middle',
    dx=3,
    fontSize=10,
    color='black',
)
text = base.mark_text(**label_mark).encode(
    x=alt.X('nombre:Q'),
    # Same ordering as the bars so every label sits on its own bar.
    y=alt.Y('preusuel:N', sort=alt.SortField('nombre', order='descending')),
    text=alt.Text('nombre:Q', format='.0f'),
)


# Layer the labels over the bars and give the combined chart its title.
chart_title = alt.TitleParams(
    text='Evolution of Baby Names Over Time',
    subtitle='Use the slider to explore different years',
    fontSize=18,
    subtitleFontSize=12,
    anchor='start',
    offset=20,
)
layered = alt.layer(histogram, text).resolve_scale(color='independent')
chart = layered.properties(title=chart_title)

chart.show()