# IMT.fi Funnel Analysis 2019-2024

Olli Salli, 2025

**Note:** A typical notebook would include commentary and more informative headings; these are deliberately omitted here to avoid leaking business details to readers without access to the source data.

In [None]:
# Departure-date window for the whole analysis. The SQL cells below receive
# these values as the :start_date / :end_date named parameters and compare
# them with BETWEEN, so both bounds are inclusive.
start_date = '2019-01-01'
end_date = '2024-12-31'

In [None]:
# Standard data packages
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For connecting to your warehouse (adjust as needed)
from sqlalchemy import create_engine
%load_ext sql

# Display options
# Show every column when a DataFrame is rendered instead of truncating wide frames.
pd.set_option("display.max_columns", None)
# Use seaborn's "whitegrid" theme for the matplotlib figures drawn below.
sns.set_theme(style="whitegrid")


In [None]:
import yaml
from pathlib import Path

# Avoid error from stock psycopg2 trying to set "standard_conforming_strings" setting
from sqlalchemy.dialects import registry
from sqlalchemy.dialects.postgresql.psycopg2 import PGDialect_psycopg2
class RedshiftPostgresDialect(PGDialect_psycopg2):
    """psycopg2 PostgreSQL dialect whose backslash-escape setup is a fixed assignment.

    Registered below under the name ``redshift_custom`` so it can be selected
    through the scheme part of a SQLAlchemy database URL.
    """
    def _set_backslash_escapes(self, connection):
        # Assign the flag directly and leave `connection` unused, so this step
        # sends nothing to the server.
        # NOTE(review): presumably the stock implementation issues a
        # "standard_conforming_strings" statement that the warehouse rejects — confirm.
        self._backslash_escapes = "off"
# Make the class resolvable as dialect "redshift_custom": the module is this
# notebook's own namespace (__name__), the attribute is the class name.
registry.register("redshift_custom", __name__, "RedshiftPostgresDialect")

# Load connection settings from the dbt profiles.yml in the user's home
# directory, so credentials never appear in the notebook itself.
from urllib.parse import quote

with open(Path.home() / ".dbt" / "profiles.yml", encoding="utf-8") as f:
    profiles = yaml.safe_load(f)

p = profiles["imt_aws_redshift"]["outputs"]["dev"]

# Percent-encode user and password: characters such as "@", ":", "/" or "%"
# in a credential would otherwise corrupt the URL or be parsed as delimiters.
# safe="" also encodes "/", and spaces become %20 rather than "+".
url_user = quote(str(p["user"]), safe="")
url_password = quote(str(p["password"]), safe="")

# Honour the port configured in the profile; 5439 remains the fallback so
# profiles without a "port" key behave exactly as before.
url_port = p.get("port", 5439)

db_url = f"redshift_custom://{url_user}:{url_password}@{p['host']}:{url_port}/{p['dbname']}"
engine = create_engine(db_url, connect_args={"sslmode": "require"})

# Hand the engine created above to the sql magic as its connection.
%sql engine
# Return query results as pandas DataFrames; the plotting cells call
# DataFrame methods (pivot, column access) directly on them.
%config SqlMagic.autopandas = True
# Enable :name placeholders in queries, as used for :start_date / :end_date
# (presumably resolved from notebook variables of the same name — confirm).
%config SqlMagic.named_parameters = "enabled"

## Overall reservation completion comparison

In [None]:
%%sql --save res_by_year
-- Total and confirmed reservation counts per departure year
SELECT
    departureyear,
    COUNT(*) AS total_reservations,
    COUNT(CASE WHEN finalstage = 'Confirmed' THEN 1 END) AS confirmed
FROM dbt_osalli.fct_reservation_funnel
WHERE departuredate >= :start_date
  AND departuredate <= :end_date
GROUP BY departureyear
ORDER BY departureyear;

In [None]:
res_by_year_df = %sql SELECT * from res_by_year

# Visualization
fig, ax1 = plt.subplots(figsize=(12, 6))

x = np.arange(len(res_by_year_df['departureyear']))
width = 0.35

# Left y-axis for total_reservations
ax1.set_xlabel('Departure Year')
ax1.set_ylabel('Total Reservations', color='tab:blue')
bars1 = ax1.bar(x - width/2, res_by_year_df['total_reservations'], width, label='Total Reservations', color='tab:blue', alpha=0.7)
ax1.tick_params(axis='y', labelcolor='tab:blue')
ax1.set_xticks(x)
ax1.set_xticklabels(res_by_year_df['departureyear'])

# Set left y-axis range and ticks
ax1.set_ylim(0, 1_100_000)
ax1.set_yticks(np.arange(0, 1_000_001, 200_000))

# Format left y-axis with humanized numbers
from matplotlib.ticker import FuncFormatter
def human_format(num, pos):
    """Format a tick value compactly, e.g. 1500000 -> '1.5M', 200000 -> '200k'.

    The (value, position) signature is the callback shape expected by
    matplotlib's ``FuncFormatter``.

    Args:
        num: Numeric tick value; may be negative.
        pos: Tick position supplied by the formatter machinery; unused.

    Returns:
        str: The value with one decimal and an 'M' suffix for millions, no
        decimals and a 'k' suffix for thousands, or no decimals and no suffix
        for smaller magnitudes.
    """
    # Choose the unit by magnitude so negative values are abbreviated too.
    magnitude = abs(num)
    # 999_500 is the point where the 'k' form would round up to '1000k';
    # switch to millions from there so the label reads '1.0M' instead.
    if magnitude >= 999_500:
        return f'{num/1_000_000:.1f}M'
    if magnitude >= 1_000:
        return f'{num/1_000:.0f}k'
    return f'{num:.0f}'
# Compact k/M tick labels on the left axis
ax1.yaxis.set_major_formatter(FuncFormatter(human_format))

# Second y-axis sharing the same x positions, used for the confirmed counts
ax2 = ax1.twinx()
ax2.set_ylabel('Confirmed', color='tab:green')
ax2.tick_params(axis='y', labelcolor='tab:green')
bars2 = ax2.bar(
    x + width/2,
    res_by_year_df['confirmed'],
    width,
    label='Confirmed',
    color='tab:green',
    alpha=0.7,
)

# Fixed range and ticks for the right axis, with the same compact tick labels
ax2.set_ylim(0, 55_000)
ax2.set_yticks(np.arange(0, 50_001, 10_000))
ax2.yaxis.set_major_formatter(FuncFormatter(human_format))

# Write the confirmed share of the year's total just above each confirmed bar
confirmed_bar_centres = x + width/2
for i in range(len(confirmed_bar_centres)):
    total = res_by_year_df['total_reservations'][i]
    confirmed = res_by_year_df['confirmed'][i]
    if total > 0:
        percentage = confirmed / total * 100
    else:
        percentage = 0
    ax2.text(
        confirmed_bar_centres[i], confirmed, f'{percentage:.1f}%',
        ha='center', va='bottom', fontsize=9, color='darkgreen',
    )

# Title, one combined legend for both axes, then render
plt.title('Reservations by Departure Year: Total vs Confirmed')
fig.legend(loc='upper right', bbox_to_anchor=(0.9, 0.9))
fig.tight_layout()
plt.show()

## Abandonment stages 2019 vs 2024
### Whole pipeline

In [None]:
%%sql --save abandonment_stages
-- Unconfirmed reservations counted per final stage for the two compared years
SELECT
    departureyear,
    finalstage,
    COUNT(*) AS count
FROM dbt_osalli.fct_reservation_funnel
WHERE departureyear IN (2019, 2024)
  AND finalstage <> 'Confirmed'
  AND departuredate >= :start_date
  AND departuredate <= :end_date
GROUP BY departureyear, finalstage
ORDER BY departureyear, count DESC;

In [None]:
abandonment_df = %sql SELECT * from abandonment_stages

# Pivot data and calculate percentages
pivot_df = abandonment_df.pivot(index='finalstage', columns='departureyear', values='count').fillna(0)
pivot_pct = pivot_df.div(pivot_df.sum(axis=0), axis=1) * 100

# Define stage order
stage_order = ['ProductSelection', 'PassengerInfo', 'ReserverInfo', 'AdditionalServices', 'Confirmation']
# Reindex to ensure correct order
pivot_pct = pivot_pct.reindex(stage_order, fill_value=0)

# Visualization
fig, ax = plt.subplots(figsize=(12, 6))

years = [2019, 2024]
y_pos = np.arange(len(years))

# Create stacked horizontal bars
left_2019 = 0
left_2024 = 0

colors = plt.cm.Set3(np.linspace(0, 1, len(pivot_pct.index)))

for i, stage in enumerate(pivot_pct.index):
    # 2019 bar (position 0 - top)
    bar_2019 = ax.barh(0, pivot_pct.loc[stage, 2019], left=left_2019,
                       height=0.5, label=stage, color=colors[i], alpha=0.8)

    # Add percentage text for ProductSelection and PassengerInfo
    if stage in ['ProductSelection', 'PassengerInfo']:
        pct_val = pivot_pct.loc[stage, 2019]
        ax.text(left_2019 + pct_val/2, 0, f'{pct_val:.1f}%',
                ha='center', va='center', fontsize=9, fontweight='bold')

    left_2019 += pivot_pct.loc[stage, 2019]

    # 2024 bar (position 1 - bottom)
    bar_2024 = ax.barh(1, pivot_pct.loc[stage, 2024], left=left_2024,
                       height=0.5, color=colors[i], alpha=0.8)

    # Add percentage text for ProductSelection and PassengerInfo
    if stage in ['ProductSelection', 'PassengerInfo']:
        pct_val = pivot_pct.loc[stage, 2024]
        ax.text(left_2024 + pct_val/2, 1, f'{pct_val:.1f}%',
                ha='center', va='center', fontsize=9, fontweight='bold')

    left_2024 += pivot_pct.loc[stage, 2024]

ax.invert_yaxis()
ax.set_yticks(y_pos)
ax.set_yticklabels(['2019', '2024'])
ax.set_xlabel('Percentage of Abandoned Reservations')
ax.set_title('Abandonment Stage Distribution: 2019 vs 2024')
ax.set_xlim(0, 100)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=len(stage_order), frameon=False)

fig.tight_layout()
plt.show()

### Within ProductSelection

In [None]:
%%sql --save product_selection_abandonment
-- ProductSelection abandonments counted per product step for the two compared years
SELECT
    departureyear,
    abandonproductsteptype,
    COUNT(*) AS count
FROM dbt_osalli.fct_reservation_funnel
WHERE departuredate BETWEEN :start_date AND :end_date
    AND finalstage = 'ProductSelection'
    AND departureyear IN (2019, 2024)
    -- Row-level filter, so it belongs in WHERE and rows without a step are discarded before grouping
    AND abandonproductsteptype IS NOT NULL
GROUP BY departureyear, abandonproductsteptype
ORDER BY departureyear, count DESC;

In [None]:
prod_select_df = %sql SELECT * from product_selection_abandonment

# Pivot data and calculate percentages
pivot_prod_df = prod_select_df.pivot(index='abandonproductsteptype', columns='departureyear', values='count').fillna(0)
pivot_prod_pct = pivot_prod_df.div(pivot_prod_df.sum(axis=0), axis=1) * 100

# Define step order
step_order = ['departure', 'hotel', 'room', 'flight', 'ship', 'cabin']
# Reindex to ensure correct order
pivot_prod_pct = pivot_prod_pct.reindex(step_order, fill_value=0)

# Visualization
fig, ax = plt.subplots(figsize=(12, 6))

years = [2019, 2024]
y_pos = np.arange(len(years))

# Create stacked horizontal bars
left_2019 = 0
left_2024 = 0

colors = plt.cm.Pastel1(np.linspace(0, 1, len(pivot_prod_pct.index)))

for i, step in enumerate(pivot_prod_pct.index):
    # 2019 bar (position 0 - top)
    bar_2019 = ax.barh(0, pivot_prod_pct.loc[step, 2019], left=left_2019,
                       height=0.5, label=step, color=colors[i], alpha=0.8)

    # Add percentage text for larger segments
    pct_val_2019 = pivot_prod_pct.loc[step, 2019]
    if pct_val_2019 > 5:  # Only show if segment is large enough
        ax.text(left_2019 + pct_val_2019/2, 0, f'{pct_val_2019:.1f}%',
                ha='center', va='center', fontsize=9, fontweight='bold')

    left_2019 += pivot_prod_pct.loc[step, 2019]

    # 2024 bar (position 1 - bottom)
    bar_2024 = ax.barh(1, pivot_prod_pct.loc[step, 2024], left=left_2024,
                       height=0.5, color=colors[i], alpha=0.8)

    # Add percentage text for larger segments
    pct_val_2024 = pivot_prod_pct.loc[step, 2024]
    if pct_val_2024 > 5:  # Only show if segment is large enough
        ax.text(left_2024 + pct_val_2024/2, 1, f'{pct_val_2024:.1f}%',
                ha='center', va='center', fontsize=9, fontweight='bold')

    left_2024 += pivot_prod_pct.loc[step, 2024]

ax.invert_yaxis()
ax.set_yticks(y_pos)
ax.set_yticklabels(['2019', '2024'])
ax.set_xlabel('Percentage of ProductSelection Abandonments')
ax.set_title('ProductSelection Abandonment by Step: 2019 vs 2024')
ax.set_xlim(0, 100)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=len(step_order), frameon=False)

fig.tight_layout()
plt.show()

#### Interpretation

The above difference in the relative importance of ship-related and hotel-related steps could be explained by changes in trip distribution.