# IMT.fi Customer Segments comparison 2019-2024

Olli Salli, 2025

All rights reserved.

Note! A typical notebook would have commentary and more informative headings; these are OMITTED HERE to avoid leaking business details to those without access to the source data. 

In [None]:
# Departure-date window; bound into the SQL cells as :start_date / :end_date, where it is used with
# BETWEEN (both ends inclusive).
start_date, end_date = '2019-01-01', '2024-12-31'

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Rectangle

from db_connection import configure_pandas_display, get_redshift_engine

# Project helper from db_connection; presumably adjusts pandas display options -- TODO confirm.
configure_pandas_display()

# Project helper from db_connection; the arguments look like a connection profile name and a
# target environment -- TODO confirm against db_connection.
engine = get_redshift_engine('imt_aws_redshift', 'dev')

# Load the `sql` IPython extension (provides %sql / %%sql) and hand it the engine created above.
%load_ext sql
%sql engine
# Return query results as pandas DataFrames; later cells treat the results as DataFrames.
%config SqlMagic.autopandas = True
# Allow :name placeholders in queries (e.g. :start_date) to be filled from notebook variables.
%config SqlMagic.named_parameters = 'enabled'

In [None]:
%%sql --no-execute --save res_with_pg
select *

from dbt_osalli.fct_reservation_funnel as res
inner join dbt_osalli.dim_passengergroup as pg using (reservationid, departureyear)

where departuredate between :start_date and :end_date
and departureyear in (2019, 2024)

-- Only reservations past the departure step. Passenger counts and the adult/child split are defined
-- for many reservations only from there on, so we would otherwise have an overabundance of default
-- 2-adult reservations.
and funnel_progress > 0

## Exploration of individual dimensions

### Biographic

#### Early-stage reservations, broad categories

In [None]:
%%sql --save passgroup_categories_19_24 --with res_with_pg
select
    groupcategory,

    -- reservations per departure year
    count(case when departureyear = 2019 then 1 end) as res_2019,
    count(case when departureyear = 2024 then 1 end) as res_2024,

    -- of those, reservations whose final stage is Confirmed
    count(case when departureyear = 2019 and finalstage = 'Confirmed' then 1 end) as confirmed_2019,
    count(case when departureyear = 2024 and finalstage = 'Confirmed' then 1 end) as confirmed_2024

from res_with_pg
group by groupcategory
order by groupcategory

We can see that started reservations have dropped clearly more in some segments than in others, while the conversion rates of those segments have not dropped to the same extent. It might therefore be possible to fill some of the gaps with suitable marketing targeted at those segments.

#### Later-stage, with exact ages known, more specific categories

In [None]:
%%sql --save passgroup_adultage_categories_19_24 --with res_with_pg
select
    groupcategory,
    adultagecategory,

    -- reservations per departure year
    sum(case when departureyear = 2019 then 1 else 0 end) as res_2019,
    sum(case when departureyear = 2024 then 1 else 0 end) as res_2024,

    -- of those, reservations whose final stage is Confirmed
    sum(case when departureyear = 2019 and finalstage = 'Confirmed' then 1 else 0 end) as confirmed_2019,
    sum(case when departureyear = 2024 and finalstage = 'Confirmed' then 1 else 0 end) as confirmed_2024

from res_with_pg

-- Ages are known when the passenger info step has been completed (or a later step reached), or when
-- the data was picked up from previous reservations through the customer account or an internal salesperson
where allagesknown

group by groupcategory, adultagecategory

order by 1, 2

## Customer segment X destination

In [None]:
%%sql --with res_with_pg agesegm_trip_df << select
    definingtype,
    -- city name when present, otherwise a 'Muut ...' bucket named after the country or the trip type
    coalesce(destinationcityname, 'Muut ' || coalesce(destinationcountryname, definingtype)) as destination,

    tripnumhotelphases,
    tripnumshipphases,
    tripnumflightphases,

    groupcategory,
    adultagecategory,

    -- reservations per departure year
    count(case when departureyear = 2019 then 1 end) as res_2019,
    count(case when departureyear = 2024 then 1 end) as res_2024,

    -- of those, reservations with allagesknown set
    count(case when departureyear = 2019 and allagesknown then 1 end) as ages_2019,
    count(case when departureyear = 2024 and allagesknown then 1 end) as ages_2024,

    -- of those, reservations whose final stage is Confirmed
    count(case when departureyear = 2019 and finalstage = 'Confirmed' then 1 end) as confirmed_2019,
    count(case when departureyear = 2024 and finalstage = 'Confirmed' then 1 end) as confirmed_2024

from res_with_pg as respg
inner join dbt_osalli.dim_trip as trip using (tripid, tripname)

where groupcategory <> 'Other'

group by all

having definingtype is not null and destination is not null

### Most popular destinations, loss of starters by segment

In [None]:
def _with_change_columns(counts):
    """Return a copy of `counts` with `pct_change` and `abs_change` (2019 to 2024) appended.

    `counts` must have `res_2019` and `res_2024` columns. `pct_change` is relative to
    `res_2019`, expressed in percent.
    """
    delta = counts['res_2024'] - counts['res_2019']
    return counts.assign(
        pct_change=delta / counts['res_2019'] * 100,
        abs_change=delta,
    )


_count_columns = ['res_2019', 'res_2024', 'confirmed_2024']

# Change over all rows combined (i.e. weighted by volume), in percent
total_2019 = agesegm_trip_df['res_2019'].sum()
total_2024 = agesegm_trip_df['res_2024'].sum()
global_pct_change = (total_2024 - total_2019) / total_2019 * 100

# The ten destinations with the most reservations over both years combined
agesegm_trip_df['total_res'] = agesegm_trip_df['res_2019'].add(agesegm_trip_df['res_2024'])
_res_by_destination = agesegm_trip_df.groupby('destination')['total_res'].sum()
top_destinations = _res_by_destination.nlargest(10).index.tolist()

# Keep the top destinations as they are and collapse everything else into an "Others" bucket
agesegm_trip_df['destination_group'] = [
    dest if dest in top_destinations else 'Others'
    for dest in agesegm_trip_df['destination']
]

# One row per (definingtype, destination_group, groupcategory) with counts and their change
pivot_data = _with_change_columns(
    agesegm_trip_df
    .groupby(['definingtype', 'destination_group', 'groupcategory'])[_count_columns]
    .sum()
    .reset_index()
)

# Per-groupcategory summary, used for the total row and the column labels
total_by_groupcat = _with_change_columns(
    agesegm_trip_df.groupby('groupcategory')[_count_columns].sum()
)
total_by_groupcat['conversion_rate'] = (
    total_by_groupcat['confirmed_2024'].div(total_by_groupcat['res_2024']).mul(100)
)

# Matrices for the heatmap: rows are (definingtype, destination_group), columns are groupcategory.
# pivot_data is already aggregated to one row per combination, so 'first' just picks that value.
_row_keys = ['definingtype', 'destination_group']
heatmap_pct, heatmap_abs = (
    pivot_data.pivot_table(
        values=metric,
        index=_row_keys,
        columns='groupcategory',
        aggfunc='first'
    )
    for metric in ('pct_change', 'abs_change')
)

# Row order: definingtypes alphabetically, destinations by descending total volume within each.
# definingtype_boundaries records the row offsets at which a new definingtype starts.
dest_order = []
definingtype_boundaries = []
for dtype in sorted(pivot_data['definingtype'].unique()):
    rows_of_type = pivot_data.loc[pivot_data['definingtype'] == dtype]
    volume = rows_of_type.groupby('destination_group')[['res_2019', 'res_2024']].sum()
    volume['total'] = volume['res_2019'] + volume['res_2024']
    ranked_dests = volume.sort_values('total', ascending=False).index.tolist()

    # No boundary before the very first definingtype
    if len(dest_order) > 0:
        definingtype_boundaries.append(len(dest_order))

    for dest in ranked_dests:
        dest_order.append((dtype, dest))

# Apply the row order to both matrices
heatmap_pct = heatmap_pct.reindex(dest_order)
heatmap_abs = heatmap_abs.reindex(dest_order)

# Append an overall total row built from the per-groupcategory summary
_total_label = [('TOTAL', 'All Trips')]
total_row_pct = pd.DataFrame(
    [total_by_groupcat['pct_change']],
    index=_total_label,
    columns=heatmap_pct.columns,
)
total_row_abs = pd.DataFrame(
    [total_by_groupcat['abs_change']],
    index=_total_label,
    columns=heatmap_abs.columns,
)

heatmap_pct = pd.concat([heatmap_pct, total_row_pct])
heatmap_abs = pd.concat([heatmap_abs, total_row_abs])

# The total row is the last one; its position is needed for the separator line and bold text
total_row_idx = len(heatmap_pct) - 1

# Create the visualization; half an inch of figure height per heatmap row
fig, ax = plt.subplots(figsize=(16, len(heatmap_pct) * 0.5))

# Colour scale centred on the global change and limited to at most 30 percentage points
# on either side of it
vmin = max(heatmap_pct.min().min(), global_pct_change - 30)
vmax = min(heatmap_pct.max().max(), global_pct_change + 30)
cmap = sns.diverging_palette(10, 130, as_cmap=True)  # Red to Green

# Percentage change drives the cell colours, absolute change is written into the cells
sns.heatmap(
    heatmap_pct,
    annot=heatmap_abs,
    fmt='.0f',
    cmap=cmap,
    center=global_pct_change,
    vmin=vmin,
    vmax=vmax,
    cbar_kws={'label': 'Change in Reservations (%)'},
    linewidths=0.5,
    # Force one tick per row/column: the custom label lists set below must match the tick count
    xticklabels=True,
    yticklabels=True,
    ax=ax
)

# Make total row text bold.
# seaborn writes no annotation for NaN (masked) cells, so the position of a text in ax.texts
# cannot be converted to a row number. Annotations are centred at (col + 0.5, row + 0.5) in
# data coordinates, so the row is read from the text position instead.
for text in ax.texts:
    _, y_center = text.get_position()
    if int(y_center) == total_row_idx:
        text.set_weight('bold')
        text.set_fontsize(10)

# Draw thick separator lines between definingtype groups
for boundary_idx in definingtype_boundaries:
    ax.hlines(boundary_idx, *ax.get_xlim(), colors='black', linewidth=2)

# Draw thick separator line before total row
ax.hlines(total_row_idx, *ax.get_xlim(), colors='black', linewidth=2.5)

# Customize y-axis labels to show definingtype grouping: the first destination of each
# definingtype carries the type name as a heading line.
# NOTE: matplotlib does not interpret Markdown, so the asterisks are drawn literally and only
# act as a plain-text marker for the heading.
yticklabels = []
current_type = None
for dtype, dest in heatmap_pct.index[:-1]:  # Exclude total row
    if dtype != current_type:
        yticklabels.append(f"**{dtype}**\n  {dest}")
        current_type = dtype
    else:
        yticklabels.append(f"  {dest}")

# Add total row label
yticklabels.append("TOTAL")

ytick_texts = ax.set_yticklabels(yticklabels, rotation=0, ha='right')
# Bold is applied through the Text object of the last tick label (the total row)
ytick_texts[-1].set_fontweight('bold')

# Customize x-axis labels to include 2024 totals and conversion rates
xticklabels = []
for col in heatmap_pct.columns:
    res_2024 = total_by_groupcat.loc[col, 'res_2024']
    conv_rate = total_by_groupcat.loc[col, 'conversion_rate']
    xticklabels.append(f"{col}\n(n_2024={res_2024:.0f}, conv_2024={conv_rate:.1f}%)")

ax.set_xticklabels(xticklabels, rotation=45, ha='right')
ax.set_xlabel('Group Category (2024 Total, Conv Rate)', fontweight='bold')
ax.set_ylabel('')

# \u2192 is a right arrow, written as an escape so the title does not depend on file encoding
plt.title(
    f'Reservation Start Change by Segment (2019\u21922024)\nGlobal Average: {global_pct_change:.0f}%\nCells show absolute change, colors show percentage change',
    fontweight='bold',
    pad=20
)

plt.tight_layout()
plt.show()

print(f"\nGlobal percentage change: {global_pct_change:.1f}%")
print(f"\nTop destinations: {top_destinations}")