In [21]:
import altair as alt
import pandas as pd

## 0. Data Preprocessing

In [22]:
# Load the dataset
df = pd.read_csv('dpt2020.csv', sep=';')
df.head()

Unnamed: 0,sexe,preusuel,annais,dpt,nombre
0,1,_PRENOMS_RARES,1900,2,7
1,1,_PRENOMS_RARES,1900,4,9
2,1,_PRENOMS_RARES,1900,5,8
3,1,_PRENOMS_RARES,1900,6,23
4,1,_PRENOMS_RARES,1900,7,9


In [None]:
# Convert annais to integer and nombre to numeric. Discard errors in conversion
df['annais'] = pd.to_numeric(df['annais'], errors='coerce')
df['nombre'] = pd.to_numeric(df['nombre'], errors='coerce')

# Remove rows with missing values
df = df.dropna(subset=['nombre', 'preusuel'])

# Remove _PRENOMS_RARES	
df = df[df['preusuel'] != '_PRENOMS_RARES']

print(df.shape)
df.head()

(3705515, 5)


Unnamed: 0,sexe,preusuel,annais,dpt,nombre
10882,1,A,,XX,27
10883,1,AADAM,,XX,30
10884,1,AADEL,,XX,56
10885,1,AADIL,1983.0,84,3
10886,1,AADIL,1992.0,92,3


## Visualization 1

In [24]:
# Data preprocessing for viz 1
# Group by year and name, sum the counts across all departments and genders
df_grouped = df.groupby(['annais', 'preusuel'])['nombre'].sum().reset_index()

# Get the top 15 names for each year to make the visualization manageable
def get_top_names_by_year(data, top_n=15):
    result = []
    for year in data['annais'].unique():
        year_data = data[data['annais'] == year].nlargest(top_n, 'nombre')
        result.append(year_data)
    return pd.concat(result, ignore_index=True)

df_top = get_top_names_by_year(df_grouped, top_n=15)


In [None]:
# Create dropdown for selecting specific names to highlight
name_dropdown = alt.binding_select(
    options=['All'] + sorted(top_names_overall),
    name='Focus on Name: '
)
selected_name = alt.param(bind=name_dropdown, value='All')

# Main chart with all lines
main_chart = alt.Chart(df_timeseries).add_params(
    selected_name
).mark_line(
    strokeWidth=2,
    point=alt.OverlayMarkDef(size=30)
).encode(
    x=alt.X('annais:O', 
            title='Year',
            axis=alt.Axis(labelAngle=45)),
    y=alt.Y('nombre:Q', 
            title='Number of Births',
            scale=alt.Scale(nice=True)),
    color=alt.Color('preusuel:N', 
                   title='Name',
                   scale=alt.Scale(scheme='category20')),
    opacity=alt.condition(
        (alt.datum.preusuel == selected_name) | (selected_name == 'All'),
        alt.value(0.9),
        alt.value(0.2)
    ),
    strokeWidth=alt.condition(
        alt.datum.preusuel == selected_name,
        alt.value(4),
        alt.value(2)
    ),
    tooltip=['preusuel:N', 'annais:O', 'nombre:Q']
).properties(
    width=800,
    height=500
)

# Add text labels for the end of lines
end_labels = alt.Chart(df_timeseries).add_params(
    selected_name
).mark_text(
    align='left',
    dx=5,
    fontSize=10,
    fontWeight='bold'
).encode(
    x=alt.X('annais:O'),
    y=alt.Y('nombre:Q'),
    text='preusuel:N',
    color=alt.Color('preusuel:N', scale=alt.Scale(scheme='category20')),
    opacity=alt.condition(
        (alt.datum.preusuel == selected_name) | (selected_name == 'All'),
        alt.value(1.0),
        alt.value(0.0)
    )
).transform_aggregate(
    max_year='max(annais)',
    groupby=['preusuel']
).transform_filter(
    alt.datum.annais == alt.datum.max_year
)

# Combine the charts
timeseries_chart = (main_chart + end_labels).resolve_scale(
    color='independent'
).properties(
    title=alt.TitleParams(
        text='Baby Name Popularity Trends Over Time',
        subtitle='Use dropdown to focus on specific names - showing top 20 most popular names overall',
        fontSize=18,
        subtitleFontSize=12,
        anchor='start',
        offset=20
    )
)

timeseries_chart.show()

## Visualization 2

In [26]:
## Visualization 2

# Data preprocessing for viz 2
# Get the most popular names overall to focus on
top_names_overall = df_grouped.groupby('preusuel')['nombre'].sum().nlargest(20).index.tolist()

# Filter data for these top names
df_timeseries = df_grouped[df_grouped['preusuel'].isin(top_names_overall)]

In [30]:
# Create dropdown for selecting specific names to highlight
name_dropdown = alt.binding_select(
    options=['All'] + sorted(top_names_overall),
    name='Focus on Name: '
)
selected_name = alt.param(bind=name_dropdown, value='All')

# Main chart with all lines
main_chart = alt.Chart(df_timeseries).add_params(
    selected_name
).mark_line(
    strokeWidth=2,
    point=alt.OverlayMarkDef(size=30)
).encode(
    x=alt.X('annais:O', 
            title='Year',
            axis=alt.Axis(labelAngle=45)),
    y=alt.Y('nombre:Q', 
            title='Number of Births',
            scale=alt.Scale(nice=True)),
    color=alt.Color('preusuel:N', 
                   title='Name',
                   scale=alt.Scale(scheme='category20')),
    opacity=alt.condition(
        (alt.datum.preusuel == selected_name) | (selected_name == 'All'),
        alt.value(0.9),
        alt.value(0.2)
    ),
    strokeWidth=alt.condition(
        alt.datum.preusuel == selected_name,
        alt.value(4),
        alt.value(2)
    ),
    tooltip=['preusuel:N', 'annais:O', 'nombre:Q']
).properties(
    width=800,
    height=500
)

# Add text labels for the end of lines
end_labels = alt.Chart(df_timeseries).add_params(
    selected_name
).mark_text(
    align='left',
    dx=5,
    fontSize=10,
    fontWeight='bold'
).encode(
    x=alt.X('annais:O'),
    y=alt.Y('nombre:Q'),
    text='preusuel:N',
    color=alt.Color('preusuel:N', scale=alt.Scale(scheme='category20')),
    opacity=alt.condition(
        (alt.datum.preusuel == selected_name) | (selected_name == 'All'),
        alt.value(1.0),
        alt.value(0.0)
    )
).transform_aggregate(
    max_year='max(annais)',
    groupby=['preusuel']
).transform_filter(
    alt.datum.annais == alt.datum.max_year
)

# Combine the charts
timeseries_chart = (main_chart + end_labels).resolve_scale(
    color='independent'
).properties(
    title=alt.TitleParams(
        text='Baby Name Popularity Trends Over Time',
        subtitle='Use dropdown to focus on specific names - showing top 20 most popular names overall',
        fontSize=18,
        subtitleFontSize=12,
        anchor='start',
        offset=20
    )
)

timeseries_chart.show()