# 1) Attendance Plots & Segmentation Plots

In [None]:
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the primary dataset from CSV into a Polars DataFrame.
primary_df = pl.read_csv('primary_data.csv')

In [None]:
# Load the secondary dataset from CSV into a Polars DataFrame.
secondary_df = pl.read_csv('secondary_data.csv')

In [None]:
# Load the reference file for the primary dataset into a Polars DataFrame.
reference_df = pl.read_csv('primary_data_reference.csv')

In [None]:
# Credential flags: a non-null associate/fellow date sets ACAS/FCAS.
has_associate_date = pl.col("associate_date").is_not_null()
has_fellow_date = pl.col("Fellow_Date").is_not_null()
primary_df = primary_df.with_columns(
    has_associate_date.alias("ACAS"),
    has_fellow_date.alias("FCAS"),
)

# Collapse the two flags into a single label; FCAS takes precedence when
# both flags are set, and rows with neither flag are "non-member".
membership_label = (
    pl.when(pl.col("FCAS")).then(pl.lit("FCAS"))
    .when(pl.col("ACAS")).then(pl.lit("ACAS"))
    .otherwise(pl.lit("non-member"))
)
primary_df = primary_df.with_columns(membership_label.alias("membership_type"))

In [None]:
# pandas copies of the primary data, one per Delivery code, plus the full set.
pdf_pie_IP, pdf_pie_LS, pdf_pie_V = (
    primary_df.filter(pl.col('Delivery') == delivery_code).to_pandas()
    for delivery_code in ('IP', 'LS', 'V')
)
pdf_pie_all = primary_df.to_pandas()

In [None]:
import matplotlib.pyplot as plt

def plot_pies(df, group_col, event_col="Event_Code"):
    """Draw one pie chart per event showing the share of each `group_col` value.

    Args:
        df: pandas DataFrame with one row per attendance record.
        group_col: Column whose value counts form the pie slices.
        event_col: Column identifying the event; one subplot is drawn for
            each distinct non-null value.

    Returns:
        None. The figure is displayed with ``plt.show()``; nothing is drawn
        when `df` contains no non-null event codes.
    """
    # Missing event codes can never match the ``==`` filter below, so drop
    # them instead of drawing an empty pie titled "nan".
    event_codes = df[event_col].dropna().unique()
    n = len(event_codes)

    # ``plt.subplots`` rejects a zero-column grid, and there is nothing to plot.
    if n == 0:
        return

    # squeeze=False always returns a 2-D array of axes, so the single-event
    # case needs no special handling.
    fig, axes = plt.subplots(1, n, figsize=(4*n, 4), squeeze=False)

    for ax, event in zip(axes[0], event_codes):
        subset = df[df[event_col] == event]
        counts = subset[group_col].value_counts()

        ax.pie(counts, labels=counts.index, autopct="%1.1f%%")
        ax.set_title(f"{event} - {group_col}")

    plt.show()


In-person (IP) events

In [None]:
# One row of pies per segmentation column for the IP records.
for segment_col in ("registrant_type_code", "membership_type", "State", "Country"):
    plot_pies(pdf_pie_IP, segment_col)

Live-stream (LS) events

In [None]:
# One row of pies per segmentation column for the LS records.
for segment_col in ("registrant_type_code", "membership_type", "State", "Country"):
    plot_pies(pdf_pie_LS, segment_col)

Virtual (V) events

In [None]:
# One row of pies per segmentation column for the V records.
for segment_col in ("registrant_type_code", "membership_type", "State", "Country"):
    plot_pies(pdf_pie_V, segment_col)

All events (IP, LS, and V combined)

In [None]:
# One row of pies per segmentation column for every record.
for segment_col in ("registrant_type_code", "membership_type", "State", "Country"):
    plot_pies(pdf_pie_all, segment_col)

In [None]:
# Parent-event code for each row of the per-title summary (primary_df_grouped).
# The list is attached positionally after that summary is sorted by
# Start_Date, so its length and order must match the sorted summary exactly.
primary_df_parent_events = [
    "2015RPM", "15Spring", "2015REINS", "15CLRS", "15CLRS", "15Annual", "2016RPM", "16Spring", "16Spring", "2016REINS",
    "16CLRS", "16CLRS", "16Annual", "17RPM", "17Spring", "17Spring", "2017REINS", "17CLRS", "17CLRS", "17Annual",
    "17Annual", "18RPM", "18Spring", "18Spring", "2018REINS", "18CLRS", "18CLRS", "18Annual", "18Annual", "19RPM",
    "19RPM", "19Spring", "19Spring", "2019REINS", "2019REINS", "19CLRS", "19CLRS", "19Annual", "19Annual", "20VirSprng",
    "20VREINS", "20VRPM", "20VRCLRS", "20VirAnn", "21VRPM", "21VirSprng", "21VREINS", "21VRCLRS", "21Annual", "21Annual",
    "RPMLive22", "22Spring", "22Spring", "22VREINS", "22CLRS", "22CLRS", "22Annual", "22Annual", "23RPM", "23RPM",
    "23Spring", "23Spring", "23REI", "23REI", "23CLRS", "23CLRS", "23Annual", "23Annual", "24RPM", "24RPM",
    "24Spring", "24Spring", "24Reins", "24Reins", "24CLRS", "24CLRS", "24Annual", "24Annual",
]


In [None]:
def add_event_type_column(df: pl.DataFrame) -> pl.DataFrame:
    """Return `df` with an ``event_type`` column derived from ``parent_event``.

    The lower-cased ``parent_event`` value is tested against a fixed list of
    substrings; the first match decides the label, and rows matching none of
    them get ``"type not found"``.
    """
    parent_code = pl.col("parent_event").str.to_lowercase()

    # (substring, label) pairs in priority order -- the first match wins.
    rules = [
        ("ann", "Annual"),
        ("clrs", "CLRS"),
        ("rpm", "RPM"),
        ("rei", "REINS"),
        ("spr", "Spring"),
    ]

    (first_key, first_label), *remaining = rules
    branch = pl.when(parent_code.str.contains(first_key)).then(pl.lit(first_label))
    for key, label in remaining:
        branch = branch.when(parent_code.str.contains(key)).then(pl.lit(label))

    event_type = branch.otherwise(pl.lit("type not found")).alias("event_type")
    return df.with_columns(event_type)

In [None]:
# One summary row per event title: descriptive attributes come from the first
# record of each group and attendance is the number of records in the group.
event_attribute_cols = ["Event_Code", "Start_Date", "Delivery", "Location_City", "Location_State"]
primary_df_grouped = primary_df.group_by("Event_Title").agg(
    [pl.col(col_name).first().alias(col_name) for col_name in event_attribute_cols]
    + [pl.len().alias("attendance")]
)

# Parse the date strings (unparseable values become null), order the events
# chronologically, then attach the hand-maintained parent-event codes by position.
primary_df_grouped = primary_df_grouped.with_columns(
    pl.col("Start_Date").str.strptime(pl.Date, "%m/%d/%Y", strict=False).alias("Start_Date")
)
primary_df_grouped = primary_df_grouped.sort(by='Start_Date')
primary_df_grouped = primary_df_grouped.with_columns(
    pl.Series("parent_event", primary_df_parent_events)
)

In [None]:
# Label each event with its type and a "City, State" display string.
primary_df_grouped = add_event_type_column(primary_df_grouped)
city_state_label = pl.concat_str(
    [pl.col("Location_City"), pl.col("Location_State")],
    separator=", ",
).alias("city_state")
primary_df_grouped = primary_df_grouped.with_columns(city_state_label)

# Combined IP + LS attendance per parent event. Descriptive columns are taken
# from the first row of each group; title and code are left null because a
# combined row does not correspond to a single event record.
carried_cols = ["Location_City", "Location_State", "city_state", "event_type", "Start_Date"]
in_person_or_stream = primary_df_grouped.filter(pl.col("Delivery").is_in(["IP", "LS"]))
totals_ip_ls = in_person_or_stream.group_by("parent_event").agg(
    [
        pl.col("attendance").sum().alias("attendance"),
        pl.lit("IP+LS").alias("Delivery"),
    ]
    + [pl.col(col_name).first().alias(col_name) for col_name in carried_cols]
    + [
        pl.lit(None, dtype=pl.Utf8).alias("Event_Title"),
        pl.lit(None, dtype=pl.Utf8).alias("Event_Code"),
    ]
)

# Stack the per-event rows and the combined rows, aligning columns by name.
primary_df_grouped_combined = pl.concat([primary_df_grouped, totals_ip_ls], how="diagonal")

In [None]:
# pandas copies of the combined summary, one per event type.
(pdf_line_CLRS, pdf_line_Annual, pdf_line_RPM, pdf_line_REINS, pdf_line_Spring) = (
    primary_df_grouped_combined.filter(pl.col('event_type') == type_label).to_pandas()
    for type_label in ('CLRS', 'Annual', 'RPM', 'REINS', 'Spring')
)

In [None]:
def plot_line_by_delivery(df, event_type):
    """Plot attendance over time for one event type, one line per Delivery value.

    Args:
        df: pandas DataFrame with ``Start_Date``, ``attendance``, ``Delivery``
            and ``city_state`` columns.
        event_type: Label used in the chart title.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(
        data=df,
        x="Start_Date",
        y="attendance",
        hue="Delivery",
        marker="o",
        ax=ax,
    )

    # Label every point with its location, slightly above the marker.
    label_style = dict(
        xytext=(0, 6),
        textcoords="offset points",
        ha="center",
        va="bottom",
        fontsize=8,
    )
    for when, count, place in zip(df["Start_Date"], df["attendance"], df['city_state']):
        ax.annotate(place, xy=(when, count), **label_style)

    ax.set_title(f"{event_type} Event Attendance Over Time", fontsize=16)
    ax.set_xlabel("Event Date")
    ax.set_ylabel("Attendance")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# One attendance-over-time chart per event type.
for type_frame, type_label in (
    (pdf_line_CLRS, "CLRS"),
    (pdf_line_Annual, "Annual"),
    (pdf_line_RPM, "RPM"),
    (pdf_line_REINS, "REINS"),
    (pdf_line_Spring, "Spring"),
):
    plot_line_by_delivery(type_frame, type_label)

In [None]:
# Fixed ordering and colour per event type so every chart uses the same legend.
event_type_order = ["Annual", "CLRS", "REINS", "RPM", "Spring"]
colors = sns.color_palette("tab10", n_colors=len(event_type_order))
EVENT_TYPE_TO_COLOR = {type_name: colour for type_name, colour in zip(event_type_order, colors)}

# pandas copies of the combined summary, one per delivery label.
pdf_line_IP, pdf_line_LS, pdf_line_IPLS = (
    primary_df_grouped_combined.filter(pl.col('Delivery') == delivery_label).to_pandas()
    for delivery_label in ('IP', 'LS', 'IP+LS')
)

# Make event_type categorical with the shared category order.
for frame in (pdf_line_IP, pdf_line_LS, pdf_line_IPLS):
    frame["event_type"] = pd.Categorical(frame["event_type"], categories=event_type_order)

In [None]:
import polars as pl
import numpy as np
from sklearn.linear_model import LinearRegression
from datetime import date

# Project combined IP+LS attendance for 2026 and 2027 with one linear trend
# (attendance ~ year) per event type, then append the projections to the
# plotted frame.
df = pl.from_pandas(pdf_line_IPLS)
Y_COL = "attendance"

df = (
    df
    .with_columns(
        pl.col("Start_Date").dt.year().alias("year"),
        pl.col(Y_COL).cast(pl.Float64)
    )
)

target_years = np.array([2026, 2027], dtype=np.int32)
target_dates = [date(2026, 1, 1), date(2027, 1, 1)]

# Rows whose event_type fell outside the categorical categories are null; a
# null label cannot be matched by the equality filter below, so leave it out.
event_types = df.get_column("event_type").drop_nulls().unique().to_list()
pred_rows = []

for et in event_types:
    sub = df.filter(pl.col("event_type") == et).drop_nulls(subset=["year", Y_COL])

    # LinearRegression.fit raises on an empty sample; skip such groups.
    if sub.height == 0:
        continue

    X = sub.select("year").to_numpy()
    y = sub.select(Y_COL).to_numpy().ravel()

    lr = LinearRegression()
    lr.fit(X, y)

    y_pred = lr.predict(target_years.reshape(-1, 1))
    # Attendance is a head count: round to whole people and floor at zero.
    y_pred_int = np.clip(np.round(y_pred).astype(int), 0, None)

    for i, yr in enumerate(target_years):
        pred_rows.append({
            "event_type": et,
            "Start_Date": target_dates[i],
            "year": int(yr),
            Y_COL: int(y_pred_int[i]),
            "Event_Title": None,
            "Event_Code": None,
            "Location_City": "",
            "Location_State": "",
            "city_state": "",
            "Delivery": "IP+LS",
            "parent_event": ""
        })

preds_df = pl.from_records(pred_rows)
cols = df.columns
# Every projected row has an empty Location_City, so no filter is needed here.
display(preds_df.sort(by='event_type').select(['event_type', 'year', 'attendance']))
pdf_line_IPLS = pl.concat([df, preds_df[cols]], how="vertical_relaxed").sort(["event_type", "year"]).to_pandas()

In [None]:
def plot_line_by_event_type(df, event_type, projected=False):
    """Plot attendance over time with one line per event type.

    Args:
        df: pandas DataFrame with ``Start_Date``, ``attendance``,
            ``event_type`` and ``city_state`` columns.
        event_type: Label used in the chart title.
        projected: When true, the title notes that the 2026 and 2027 points
            are projections.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(
        data=df,
        x="Start_Date",
        y="attendance",
        hue="event_type",
        hue_order=event_type_order,
        palette=EVENT_TYPE_TO_COLOR,
        marker="o",
        ax=ax,
    )

    # Label every point with its location, slightly above the marker.
    label_style = dict(
        xytext=(0, 6),
        textcoords="offset points",
        ha="center",
        va="bottom",
        fontsize=8,
    )
    for when, count, place in zip(df["Start_Date"], df["attendance"], df['city_state']):
        ax.annotate(place, xy=(when, count), **label_style)

    chart_title = (
        f"{event_type} Event Attendance Over Time (2026 and 2027 points projected)"
        if projected
        else f"{event_type} Event Attendance Over Time"
    )
    ax.set_title(chart_title, fontsize=16)
    ax.set_xlabel("Event Date")
    ax.set_ylabel("Attendance")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# One chart per delivery label; only the combined chart contains projections.
for delivery_frame, delivery_label, has_projection in (
    (pdf_line_IP, "IP", False),
    (pdf_line_LS, "LS", False),
    (pdf_line_IPLS, "IP+LS", True),
):
    plot_line_by_event_type(delivery_frame, delivery_label, projected=has_projection)

In [None]:
# Event-type code for each row of the per-meeting summary of the secondary
# data. The list is attached positionally after that summary is sorted by
# Begin_Date, so its length and order must match the sorted summary exactly.
secondary_event_types = [
    'CLRS', 'CLRS', 'ANN', 'ANN', 'RPM', 'RPM', 'SPR', 'SPR', 'SPR', 'REI', 'REI', 'CLRS',
    'CLRS', 'CLRS', 'CSAF', 'ANN', 'ANN', 'ANN', 'ANN', 'RPM', 'RPM', 'SPR', 'SPR', 'REI',
    'REI', 'CLRS', 'CLRS', 'ANN', 'ANN', 'ANN', 'RPM', 'RPM', 'SPR', 'SPR', 'REI', 'REI',
    'CLRS', 'CLRS', 'ANN', 'ANN', 'ANN', 'RPM', 'RPM', 'RPM', 'SPR', 'SPR', 'REI', 'REI',
    'CLRS', 'CLRS', 'ANN', 'ANN', 'RPM', 'RPM', 'RPM', 'RPM', 'SPR', 'SPR', 'SPR', 'REI',
    'REI', 'CARE', 'CLRS', 'CLRS', 'ANN', 'ANN', 'ANN', 'RPM', 'SPR', 'SPR', 'SPR', 'REI',
    'REI', 'REI', 'REI', 'RPM', 'CLRS', 'CLRS', 'CLRS', 'CLRS', 'CSAF', 'ANN', 'ANN', 'RPM',
    'RPM', 'RPM', 'RPM', 'SPR', 'SPR', 'SPR', 'REI', 'REI', 'REI', 'REI', 'CLRS', 'CLRS',
    'CLRS', 'CLRS', 'ANN', 'ANN', 'RPM', 'RPM', 'RPM', 'RPM', 'SPR', 'SPR', 'REI', 'REI',
    'CLRS', 'CLRS', 'CLRS', 'CLRS', 'ANN', 'RPM', 'RPM', 'RPM', 'SPR', 'REI', 'REI', 'REI',
    'CLRS', 'CLRS', 'CLRS', 'ANN', 'RPM', 'RPM', 'SPR', 'REI', 'REI', 'REI', 'CLRS', 'CLRS',
    'CLRS', 'ANN', 'RPM', 'RPM', 'RPM', 'SPR', 'REI', 'REI', 'REI'
]


In [None]:
# One row per meeting: first Begin_Date in the group and the record count.
secondary_df_grouped = secondary_df.group_by('Meeting').agg(
    [pl.col('Begin_Date').first(), pl.len().alias('attendance')]
)

# Parse the date strings (unparseable values become null) and order
# chronologically so the positional event-type list lines up.
secondary_df_grouped = secondary_df_grouped.with_columns(
    pl.col("Begin_Date").str.strptime(pl.Date, "%m/%d/%Y", strict=False).alias("Begin_Date")
).sort(by='Begin_Date')

# Attach the event type, take the first four characters of Meeting as the
# year, and combine both into a per-year, per-type key.
secondary_df_grouped = secondary_df_grouped.with_columns(
    pl.Series("event_type", secondary_event_types),
    pl.col("Meeting").str.slice(0, 4).alias("Year"),
)
secondary_df_grouped = secondary_df_grouped.with_columns(
    pl.concat_str([pl.col("Year"), pl.col("event_type")], separator="").alias("year_and_event")
)

# Total attendance per year-and-type key, in chronological order.
secondary_event_totals = (
    secondary_df_grouped
    .group_by('year_and_event')
    .agg([
        pl.col('attendance').sum(),
        pl.col('Begin_Date').first(),
        pl.col('event_type').first(),
    ])
    .sort(by='Begin_Date')
)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# The 2014 Annual Meeting (CAS Centennial Celebration) is an outlier that
# distorts the chart, so it is excluded.
without_centennial = secondary_event_totals.filter(pl.col('year_and_event')!='2014ANN')
pdf = without_centennial.to_pandas()

fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=pdf, x="Begin_Date", y="attendance", hue="event_type", marker="o", ax=ax)

ax.set(
    title="Attendance over time by event type (Secondary Data Validation)",
    xlabel="Date",
    ylabel="Attendance",
)

# One tick per calendar year, labelled with the four-digit year.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
ax.grid(True, alpha=0.3)

# Place the legend outside the plotting area, to the right.
ax.legend(title="Event type", frameon=False, bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()


# 2) Add Location and Distance

In [None]:
import json
from pathlib import Path
from typing import Iterable

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Location of the on-disk geocoding cache (relative to the working directory).
CACHE_PATH = Path("geocode_cache.json")

def load_cache() -> dict:
    """Return the cache parsed from CACHE_PATH, or an empty dict if the file is absent."""
    if not CACHE_PATH.exists():
        return {}
    with CACHE_PATH.open(encoding="utf-8") as cache_file:
        return json.load(cache_file)

def save_cache(cache: dict) -> None:
    """Write `cache` to CACHE_PATH as indented UTF-8 JSON.

    The JSON is written to a sibling ``.tmp`` file first and then moved over
    CACHE_PATH, so the existing cache is only replaced by a fully written file.
    """
    scratch_path = CACHE_PATH.with_suffix(".tmp")
    serialized = json.dumps(cache, ensure_ascii=False, indent=2)
    scratch_path.write_text(serialized, encoding="utf-8")
    scratch_path.replace(CACHE_PATH)


def build_unique_place_keys(primary_df) -> list[str]:
    """Return the distinct, non-empty place keys found in `primary_df`.

    Two kinds of key are collected into one list: attendee keys built from
    City/State/Country and meeting keys built from Location_City/Location_State.
    Parts are joined with ", " and null parts are skipped.
    """
    def _place_key(columns):
        # Join the given columns into one "a, b, c" string, ignoring nulls.
        return pl.concat_str(
            [pl.col(name) for name in columns],
            separator=", ",
            ignore_nulls=True,
        ).alias("place_key")

    attendee_keys = primary_df.select(_place_key(["City", "State", "Country"]))
    venue_keys = primary_df.select(_place_key(["Location_City", "Location_State"]))

    distinct_keys = pl.concat([attendee_keys, venue_keys]).unique()
    non_empty_keys = distinct_keys.filter(pl.col("place_key").str.len_chars() > 0)
    return non_empty_keys.get_column("place_key").to_list()

def geocode_all(unique_place_keys: Iterable[str], save_every: int = 50) -> None:
    """Geocode place keys with Nominatim and store the results in the JSON cache.

    Keys already in the cache with coordinates are skipped. Keys that are in
    the cache without coordinates (earlier misses) are tried again, whether or
    not they appear in `unique_place_keys`.

    Args:
        unique_place_keys: Place strings to look up. Any iterable is accepted;
            falsy entries and duplicates are ignored.
        save_every: Write the cache to disk after this many lookups, so an
            interrupted run keeps most of its progress.
    """
    cache = load_cache()

    geolocator = Nominatim(user_agent="meetings-distance")
    geocode = RateLimiter(
        geolocator.geocode,
        min_delay_seconds=1,
        max_retries=2,
        error_wait_seconds=5,
        swallow_exceptions=True,
    )

    # Materialize the argument once (it may be a one-shot iterable), dropping
    # falsy keys and duplicates while keeping first-seen order.
    requested = list(dict.fromkeys(k for k in unique_place_keys if k))

    calls_since_save = 0
    retry = [k for k, v in cache.items() if (v.get('lat') is None or v.get('lon') is None)]
    retry_keys = set(retry)
    todo = [k for k in requested if k not in cache] + retry
    # Only requested keys that already have coordinates are really skipped;
    # cached misses are retried and therefore counted in `todo` instead.
    skipped = sum(1 for k in requested if k in cache and k not in retry_keys)
    print(f"{len(todo)} places to geocode (skipping {skipped} already cached).")

    for idx, key in enumerate(todo, start=1):
        # With swallow_exceptions=True a failed lookup yields a falsy result
        # instead of raising, so it is recorded as a miss below.
        loc = geocode(key)
        if loc:
            cache[key] = {
                "lat": loc.latitude,
                "lon": loc.longitude,
                "display_name": getattr(loc, "address", None),
            }
            print(f"[{idx}/{len(todo)}] OK  - {key} -> ({loc.latitude:.6f}, {loc.longitude:.6f})")
        else:
            cache[key] = {"lat": None, "lon": None, "display_name": None}
            print(f"[{idx}/{len(todo)}] MISS- {key}")

        calls_since_save += 1
        if calls_since_save >= save_every:
            save_cache(cache)
            print(f"Saved cache after {calls_since_save} new calls.")
            calls_since_save = 0

    save_cache(cache)
    print("All done. Cache saved to", str(CACHE_PATH))

In [None]:
# unique_place_keys = build_unique_place_keys(primary_df)
# geocode_all(unique_place_keys, save_every=50)

In [None]:
# Read the geocode cache; the context manager closes the file handle, which
# the previous bare open() call never did.
with open("geocode_cache.json", encoding="utf-8") as cache_file:
    cache = json.load(cache_file)

# One row per cached place key with its latitude and longitude (null for
# lookups that found nothing).
cache_df = pl.DataFrame({
    "place_key": list(cache.keys()),
    "lat": [v["lat"] for v in cache.values()],
    "lon": [v["lon"] for v in cache.values()],
})

In [None]:
# Share of attendance records whose home location has cached coordinates.
people_locations = primary_df.select(
        pl.concat_str(
            [pl.col("City"), pl.col("State"), pl.col("Country")],
            separator=", ",
            ignore_nulls=True,
        ).alias("place_key"))

# Build the set of successfully geocoded keys once; each record then costs a
# single set lookup instead of a filter over the whole cache frame.
geocoded_keys = set(
    cache_df.filter(pl.col('lat').is_not_null()).get_column('place_key').to_list()
)
n = sum(1 for p in people_locations['place_key'] if p in geocoded_keys)
print(n/len(people_locations))

99% of participants' locations returned valid coordinates

In [None]:
# Share of attendance records whose meeting location has cached coordinates
# (records with an empty meeting key are left out of the denominator).
meeting_locations = primary_df.select(
        pl.concat_str(
            [pl.col("Location_City"), pl.col("Location_State")],
            separator=", ",
            ignore_nulls=True,
        ).alias("place_key")
    ).filter(pl.col('place_key')!="")

# Build the set of successfully geocoded keys once; each record then costs a
# single set lookup instead of a filter over the whole cache frame.
geocoded_keys = set(
    cache_df.filter(pl.col('lat').is_not_null()).get_column('place_key').to_list()
)
n2 = sum(1 for m in meeting_locations['place_key'] if m in geocoded_keys)
print(n2/len(meeting_locations))

100% of meeting locations have coordinates. This is important: a meeting without coordinates would leave every one of its attendees without a distance.

In [None]:
# Lookup from (Event_Title, Event_Code) to the parent event. The three lists
# below are parallel: entry i of each list describes the same event, so they
# must stay the same length and in the same order.

# Event titles, one per event.
event_title = [
    '2015 CAS Annual Meeting', '2015 CAS Interactive Live Streaming: CLRS', '2015 CAS Spring Meeting',
    '2015 CLRS & Workshops', '2015 Ratemaking and Product Management Seminar & Workshops', '2015 Seminar on Reinsurance',
    '2016 CAS Annual Meeting', '2016 CAS Interactive Live Streaming: CLRS', '2016 CAS Interactive Live Streaming: Spring',
    '2016 CAS Seminar on Reinsurance', '2016 CAS Spring Meeting', '2016 CLRS & Workshops',
    '2016 Ratemaking and Product Management Seminar & Workshops', '2017 CAS Annual Meeting',
    '2017 CAS Interactive Live Stream: Casualty Loss Reserve Seminar', '2017 CAS Interactive Live Streaming: Annual',
    '2017 CAS Interactive Live Streaming: Spring', '2017 CAS Seminar on Reinsurance', '2017 CAS Spring Meeting',
    '2017 CLRS & Workshops', '2017 Ratemaking and Product Management Seminar & Workshops', '2018 CAS Annual Meeting',
    '2018 CAS Interactive Live Stream: Casualty Loss Reserve Seminar', '2018 CAS Interactive Live Streaming: Annual',
    '2018 CAS Interactive Live Streaming: Spring', '2018 CAS Reinsurance Seminar', '2018 CAS Spring Meeting',
    '2018 CLRS & Workshops', '2018 RPM Seminar & Workshops', '2019 CAS Annual Meeting',
    '2019 CAS Interactive Live Stream: Casualty Loss Reserve Seminar', '2019 CAS Interactive Live Streaming: Annual',
    '2019 CAS Interactive Live Streaming: RPM', '2019 CAS Interactive Live Streaming: Reinsurance Seminar',
    '2019 CAS Interactive Live Streaming: Spring', '2019 CAS Reinsurance Seminar', '2019 CAS Spring Meeting',
    '2019 CLRS & Workshops', '2019 RPM Seminar & Workshops', '2020 Virtual CAS Annual Meeting',
    '2020 Virtual CAS Ratemaking, Product, and Modeling Seminar', '2020 Virtual CAS Seminar on Reinsurance',
    '2020 Virtual CAS Spring Meeting', '2020 Virtual CLRS', '2021 CAS Annual Meeting',
    '2021 CAS Interactive Live Streaming: Annual', '2021 Virtual CAS Ratemaking, Product, and Modeling Seminar',
    '2021 Virtual CAS Seminar on Reinsurance', '2021 Virtual CAS Spring Meeting', '2021 Virtual CLRS',
    '2022 CAS Annual Meeting', '2022 CAS Interactive Live Streaming: Annual', '2022 CAS Interactive Live Streaming: CLRS',
    '2022 CAS Interactive Live Streaming: Spring', '2022 CAS Spring Meeting', '2022 CLRS & Workshops',
    '2022 RPM Virtual Seminar', '2022 Virtual CAS Seminar on Reinsurance', '2023 CAS Annual Meeting',
    '2023 CAS Interactive Live Streaming: Annual', '2023 CAS Interactive Live Streaming: CLRS',
    '2023 CAS Interactive Live Streaming: Spring', '2023 CAS Interactive Livestream RPM',
    '2023 CAS Interactive Livestream Reinsurance', '2023 CAS Seminar on Reinsurance', '2023 CAS Spring Meeting',
    '2023 CLRS & Workshops', '2023 Ratemaking, Product, and Modeling Seminar', '2024 CAS Annual Meeting',
    '2024 CAS Interactive Live Streaming: Annual', '2024 CAS Interactive Live Streaming: CLRS',
    '2024 CAS Interactive Live Streaming: Spring', '2024 CAS Interactive Livestream RPM',
    '2024 CAS Interactive Livestream Reinsurance', '2024 CAS Ratemaking, Product, and Modeling Seminar',
    '2024 CAS Seminar on Reinsurance', '2024 CAS Spring Meeting', '2024 CLRS & Workshops'
]

# Event codes, aligned with event_title. Some codes repeat across years
# (e.g. 'LIVESPR'), so a code alone does not identify an event.
event_code = [
    '15Annual', '15LIVECLRS', '15Spring', '15CLRS', '2015RPM', '2015REINS', '16Annual', '16LIVECLRS', '16LIVESPR',
    '2016REINS', '16Spring', '16CLRS', '2016RPM', '17Annual', '17CLRSLS', '17LIVEANN', 'LIVESPR', '2017REINS',
    '17Spring', '17CLRS', '17RPM', '18Annual', '18CLRSLS', '18LIVEANN', 'LIVESPR', '2018REINS', '18Spring', '18CLRS',
    '18RPM', '19Annual', '19CLRSLS', '19LIVEANN', '19LIVERPM', 'REILS', 'LIVESPR', '2019REINS', '19Spring', '19CLRS',
    '19RPM', '20VirAnn', '20VRPM', '20VREINS', '20VirSprng', '20VRCLRS', '21Annual', '21LiveAnn', '21VRPM',
    '21VREINS', '21VirSprng', '21VRCLRS', '22Annual', '22LiveANN', '22LiveCLRS', '22LiveSpr', '22Spring', '22CLRS',
    'RPMLive22', '22VREINS', '23Annual', '23Annual Livestream', '23CLRS Livestream', '23LiveSpr', '23LIVERPM',
    '23LIVEREI', '23REI', '23Spring', '23CLRS', '23RPM', '24Annual', '24Annual Livestream', '24CLRS Livestream',
    '24Spring Livestream', '24RPM Livestream', '24Reins Livestream', '24RPM', '24Reins', '24Spring', '24CLRS'
]

# Parent-event code for each entry; a live-stream entry carries the same
# parent code as the matching non-live-stream entry.
parent_event = [
    '15Annual', '15CLRS', '15Spring', '15CLRS', '2015RPM', '2015REINS', '16Annual', '16CLRS', '16Spring',
    '2016REINS', '16Spring', '16CLRS', '2016RPM', '17Annual', '17CLRS', '17Annual', '17Spring', '2017REINS',
    '17Spring', '17CLRS', '17RPM', '18Annual', '18CLRS', '18Annual', '18Spring', '2018REINS', '18Spring',
    '18CLRS', '18RPM', '19Annual', '19CLRS', '19Annual', '19RPM', '2019REINS', '19Spring', '2019REINS', '19Spring',
    '19CLRS', '19RPM', '20VirAnn', '20VRPM', '20VREINS', '20VirSprng', '20VRCLRS', '21Annual', '21Annual',
    '21VRPM', '21VREINS', '21VirSprng', '21VRCLRS', '22Annual', '22Annual', '22CLRS', '22Spring', '22Spring',
    '22CLRS', 'RPMLive22', '22VREINS', '23Annual', '23Annual', '23CLRS', '23Spring', '23RPM', '23REI', '23REI',
    '23Spring', '23CLRS', '23RPM', '24Annual', '24Annual', '24CLRS', '24Spring', '24RPM', '24Reins', '24RPM',
    '24Reins', '24Spring', '24CLRS'
]

# Combine the parallel lists into one lookup table.
parent_event_map = pl.DataFrame({
    'Event_Title': event_title,
    'Event_Code': event_code,
    'parent_event': parent_event
})


In [None]:
# Build the lookup keys used by the geocode cache: attendee home and meeting
# location, each joined with ", " and skipping null parts.
home_key_expr = pl.concat_str(
    [pl.col("City"), pl.col("State"), pl.col("Country")],
    separator=", ", ignore_nulls=True
)
meet_key_expr = pl.concat_str(
    [pl.col("Location_City"), pl.col("Location_State")],
    separator=", ", ignore_nulls=True
)
df = primary_df.with_columns(home_key=home_key_expr, meet_key=meet_key_expr)

# Attach cached coordinates twice -- once for the home key and once for the
# meeting key. Left joins keep records whose key is not in the cache.
home_coords = cache_df.rename({"lat":"home_lat","lon":"home_lon"})
meeting_coords = cache_df.rename({"lat":"mtg_lat","lon":"mtg_lon"})
df = df.join(home_coords, left_on="home_key", right_on="place_key", how="left")
df = df.join(meeting_coords, left_on="meet_key", right_on="place_key", how="left")

In [None]:
import math

R_KM = 6371.0088
MI_PER_KM = 0.621371

def haversine_km(s: dict) -> float | None:
    lat1, lon1, lat2, lon2 = s["home_lat"], s["home_lon"], s["mtg_lat"], s["mtg_lon"]
    if None in (lat1, lon1, lat2, lon2):
        return None
    φ1, λ1, φ2, λ2 = map(math.radians, (lat1, lon1, lat2, lon2))
    dφ = φ2 - φ1
    dλ = λ2 - λ1
    a = math.sin(dφ/2)**2 + math.cos(φ1)*math.cos(φ2)*math.sin(dλ/2)**2
    c = 2 * math.asin(math.sqrt(a))
    return R_KM * c

# Compute each attendee's home-to-meeting distance in km and miles.
df = (
    df
    .with_columns(
        pl.struct(["home_lat","home_lon","mtg_lat","mtg_lon"])
          # State the output dtype explicitly so polars does not have to infer
          # it from the returned values, some of which are None.
          .map_elements(haversine_km, return_dtype=pl.Float64)
          .alias("distance_km")
    )
    .with_columns(
        (pl.col("distance_km") * MI_PER_KM).alias("distance_mi")
    )
)

# Attach the parent event. This is an inner join, so a record whose
# (Event_Title, Event_Code) pair is missing from parent_event_map is dropped.
df = df.join(parent_event_map,on=['Event_Title','Event_Code'],how='inner')

In [None]:
# df.write_csv('primary_data_extra_columns.csv')

# 3) Supporting Exhibits

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('primary_data_extra_columns.csv')

# Hybrid events are parent events that have at least one LS record; keep only
# their IP and LS records.
hybrid_parent_events = df.loc[df['Delivery'] == 'LS', 'parent_event'].unique()

df_hybrid = df[df['parent_event'].isin(hybrid_parent_events) & df['Delivery'].isin(['IP', 'LS'])].copy()

distance_bins = [0, 50, 200, 500, 1000, 2000, np.inf]
bin_labels = ["0-50", "50-200", "200-500", "500-1000", "1000-2000", ">2000"]
# include_lowest=True keeps a distance of exactly 0 (home and meeting share
# the same place key) in the first bin; by default pd.cut's bins are open on
# the left, so those rows would fall outside every bin.
df_hybrid['distance_bin'] = pd.cut(
    df_hybrid['distance_mi'],
    bins=distance_bins,
    labels=bin_labels,
    include_lowest=True,
)

membership_types = ['ACAS', 'FCAS', 'non-member']

# observed=False keeps every distance bin in the result, even an empty one.
# Reindexing guarantees both delivery columns exist.
grouped = (
    df_hybrid
    .groupby(['distance_bin', 'membership_type', 'Delivery'], observed=False)
    .size()
    .unstack('Delivery', fill_value=0)
    .reindex(columns=['IP', 'LS'], fill_value=0)
)
# Expressed in percent so the values match the y-axis label.
grouped['in_person_rate'] = grouped['IP'] / (grouped['IP'] + grouped['LS']) * 100
# Reindex so every bin and membership type is present (NaN where no data),
# which keeps the plotting loop below from raising KeyError.
rate_by_bin = (
    grouped['in_person_rate']
    .unstack('membership_type')
    .reindex(index=bin_labels, columns=membership_types)
)

x = np.arange(len(bin_labels))
plt.figure(figsize=(6,4))
for mtype in membership_types:
    plt.plot(x, rate_by_bin[mtype], marker='o', label=mtype)
plt.xticks(x, bin_labels)
plt.xlabel('Distance from Event (miles)')
plt.ylabel('Percentage Attending In-Person')
plt.title('In-Person Attendance Rate by Distance and Membership Status')
plt.legend(title='Membership')
plt.grid(alpha=0.3)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Number of attendance records per customer.
attend_counts = df['Customer_ID'].value_counts()

# Customers with exactly one record vs. more than one.
one_time_ids = attend_counts.index[attend_counts == 1]
repeat_ids   = attend_counts.index[attend_counts > 1]
is_repeat = df['Customer_ID'].isin(repeat_ids)
df['attendee_type'] = np.where(is_repeat, 'Repeat', 'One-time')

# Distances for each group, leaving out records with no distance.
has_distance = df['distance_mi'].notna()
one_time_dists = df.loc[(df['attendee_type'] == 'One-time') & has_distance, 'distance_mi']
repeat_dists   = df.loc[(df['attendee_type'] == 'Repeat') & has_distance, 'distance_mi']

data = [one_time_dists, repeat_dists]
labels = ['One-time', 'Repeat']

plt.figure(figsize=(6,4))
plt.boxplot(data, labels=labels, showmeans=True)
plt.ylabel('Distance Traveled to Event (miles)')
plt.title('Travel Distance Distribution: One-Time vs Repeat Attendees')

# Write the summary statistics just to the right of each box.
for box_position, sample in enumerate(data, start=1):
    annotations = [
        ("Mean", np.mean(sample), {"color": "green"}),
        ("Median", np.median(sample), {"color": "orange"}),
        ("25%", np.percentile(sample, 25), {}),
        ("75%", np.percentile(sample, 75), {}),
    ]
    for stat_name, stat_value, text_style in annotations:
        plt.text(
            box_position+0.15,
            stat_value,
            f"{stat_name}: {stat_value:.1f}",
            fontsize=8,
            va="center",
            **text_style,
        )

plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Per customer: number of distinct event codes and the first membership type
# recorded for that customer.
by_customer = df.groupby("Customer_ID")
events_per_person = by_customer["Event_Code"].nunique()
membership_map = by_customer["membership_type"].first()

member_flags = events_per_person.to_frame("events_attended").join(membership_map)

# Mean number of distinct events per person within each membership type.
avg_events = member_flags.groupby("membership_type")["events_attended"].mean()

categories = ["ACAS", "FCAS", "non-member"]
avg_values = [avg_events.get(category, 0) for category in categories]

plt.figure(figsize=(5,4))
plt.bar(categories, avg_values, color=['#1f77b4','#ff7f0e','#2ca02c'])
plt.ylabel('Average Events Attended per Person')
plt.title('Engagement by Membership Status')

# Print each bar's value just above it.
for bar_index, bar_value in enumerate(avg_values):
    plt.text(bar_index, bar_value+0.05, f"{bar_value:.2f}", ha='center', fontsize=9)

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Attendance records per membership type, in a fixed display order
# (a type with no records counts as 0).
counts = df['membership_type'].value_counts()

categories = ['ACAS','FCAS','non-member']
values = [counts.get(category, 0) for category in categories]

colors = ['#1f77b4','#ff7f0e','#2ca02c']

pie_options = dict(
    labels=categories,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
    wedgeprops={'edgecolor':'white'},
)
plt.figure(figsize=(5,5))
plt.pie(values, **pie_options)
plt.title('Membership Type Distribution')
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Flag records whose registrant type mentions "Speaker". na=False makes a
# missing registrant type count as "not a speaker" rather than leaving NaN
# in the flag column, which would silently drop those rows from the mean.
df['is_speaker'] = df['registrant_type_code'].str.contains('Speaker', na=False)

# Percentage of attendance records flagged as speaker, per membership type.
speaker_pct = df.groupby('membership_type')['is_speaker'].mean() * 100

categories = ['ACAS','FCAS','non-member']
# .get avoids a KeyError when a membership type has no records, matching the
# other membership charts in this notebook.
perc_values = [speaker_pct.get(c, 0) for c in categories]

plt.figure(figsize=(5,4))
bars = plt.bar(categories, perc_values, color=['#1f77b4','#ff7f0e','#2ca02c'])
plt.ylabel('Percentage of Attendees who are Speakers')
plt.title('Speaker vs Non-Speaker Breakdown by Credential Status')

# Write each percentage in the middle of its bar.
for bar, val in zip(bars, perc_values):
    plt.text(
        bar.get_x() + bar.get_width()/2,
        val/2,
        f"{val:.1f}%",
        ha='center', va='center',
        fontsize=9, fontweight='bold',
        color='white'
    )

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

def categorize_event(code, title):
    """Map an event code/title pair to its event series name.

    Matching is case-insensitive and checked in a fixed order: RPM, Spring,
    Annual, CLRS, then Reinsurance; the first match wins. Reinsurance is
    recognised from the code (``REI``) or from the title (``REINSURANCE``).

    Args:
        code: Event code; may be None or NaN.
        title: Event title; may be None or NaN.

    Returns:
        One of 'RPM Seminar', 'Spring Meeting', 'Annual Meeting', 'CLRS',
        'Reinsurance Seminar', or 'Other' when nothing matches.
    """
    def _normalize(value):
        # A missing value must not be stringified: str(nan).upper() is 'NAN',
        # which contains 'ANN' and would be classified as an Annual Meeting.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value).upper()

    code = _normalize(code)
    title = _normalize(title)
    if 'RPM' in code: return 'RPM Seminar'
    # 'SPR' also covers 'SPRING'; 'ANN' also covers 'ANNUAL'.
    if 'SPR' in code: return 'Spring Meeting'
    if 'ANN' in code: return 'Annual Meeting'
    if 'CLRS' in code: return 'CLRS'
    # 'REI' covers 'REINS' as well as shorter codes such as '23REI', so the
    # code alone is enough even when the title is missing.
    if 'REI' in code or 'REINSURANCE' in title: return 'Reinsurance Seminar'
    return 'Other'
# Classify every record into an event series from its code and title.
df['Event_Series'] = df.apply(
    lambda record: categorize_event(record['Event_Code'], record['Event_Title']),
    axis=1,
)

# Percentage of records in each series whose membership type is non-member.
is_non_member = df['membership_type'] == 'non-member'
series_nonmem_pct = is_non_member.groupby(df['Event_Series']).mean() * 100

# Keep the five main series and order them from highest to lowest share.
main_series = ['RPM Seminar','Spring Meeting','Annual Meeting','CLRS','Reinsurance Seminar']
in_main_series = series_nonmem_pct.index.isin(main_series)
series_nonmem_pct = series_nonmem_pct[in_main_series].sort_values(ascending=False)

plt.figure(figsize=(6,4))
bars = plt.bar(series_nonmem_pct.index, series_nonmem_pct.values, color='gray')
plt.xticks(rotation=45, ha='right')
plt.ylabel('% of Attendees who are CAS Non-Members')
plt.title('CAS Non-Member Participation by Event Series')

# Write each percentage in the middle of its bar.
inside_bar_style = dict(ha='center', va='center', fontsize=9, fontweight='bold', color='white')
for bar, pct in zip(bars, series_nonmem_pct.values):
    bar_centre = bar.get_x() + bar.get_width()/2
    plt.text(bar_centre, pct/2, f"{pct:.1f}%", **inside_bar_style)

plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Share of each delivery mode within every membership type, leaving out
# records whose Delivery is 'V'.
non_virtual = df[df['Delivery'] != 'V']
mode_share = non_virtual.groupby('membership_type')['Delivery'].value_counts(normalize=True)
mode_counts = mode_share.unstack(fill_value=0)
mode_counts = mode_counts.reindex(['ACAS','FCAS','non-member'])
mode_pct = mode_counts * 100

modes = ['IP','LS']
colors = ['#1f77b4','#ff7f0e']
categories = ['ACAS','FCAS','non-member']

plt.figure(figsize=(6,4))
# Running height of the stack for each membership type.
bottom = np.zeros(len(categories))

segment_label_style = dict(ha="center", va="center", fontsize=8, color="white", fontweight="bold")
for mode, mode_colour in zip(modes, colors):
    values = mode_pct[mode].values
    bars = plt.bar(categories, values, bottom=bottom, color=mode_colour, label=mode)

    # Label each non-empty segment at its vertical midpoint.
    for bar, val, base in zip(bars, values, bottom):
        if val <= 0:
            continue
        plt.text(bar.get_x() + bar.get_width()/2, base + val/2, f"{val:.1f}%", **segment_label_style)
    bottom += values

plt.ylabel('Percentage of Attendance Records')
plt.title('Attendance Option by Membership Status (In-Person or Live-Stream)')
plt.legend(title='Mode', loc='upper right')
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Records per customer, then how many customers share each record count.
attend_counts = df['Customer_ID'].value_counts()
dist = attend_counts.value_counts().sort_index()

# Fold everything from 10 events upward into a single "10+" bucket.
max_events = dist.index.max()
if max_events > 10:
    ten_or_more = dist.loc[10:].sum()
    dist = dist.loc[:9]
    dist[10] = ten_or_more
x_labels = ["10+" if events >= 10 else str(events) for events in dist.index]

plt.figure(figsize=(6,4))
plt.bar(x_labels, dist.values, color='gray')
plt.xlabel('Number of Events Attended')
plt.ylabel('Number of Individuals')
plt.title('Histogram of Event Attendance Count per Person')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Customers with exactly one attendance record vs. more than one.
one_time_count = attend_counts.eq(1).sum()
repeat_count   = attend_counts.gt(1).sum()

sizes = [one_time_count, repeat_count]
labels = ['One-Time Attendees', 'Repeat Attendees']

pie_options = dict(
    labels=labels,
    autopct='%1.0f%%',
    startangle=140,
    colors=['#8c564b','#9467bd'],
)
plt.figure(figsize=(4,4))
plt.pie(sizes, **pie_options)
plt.title('Overall Attendee Retention')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Start_Date must be datetime so calendar years can be pulled out below.
df['Start_Date'] = pd.to_datetime(df['Start_Date'])

# Per customer: year of the earliest event, and the sorted distinct years
# in which they attended at least one event.
dates_by_customer = df.groupby('Customer_ID')['Start_Date']
first_year = dates_by_customer.min().dt.year
years_attended = dates_by_customer.apply(lambda dates: sorted(set(dates.dt.year)))

# Cohort: customers whose first recorded event fell in 2017 or 2018.
cohort_ids = first_year[first_year.isin([2017, 2018])].index

# Years between the first and the second distinct attendance year.
# 99 is a sentinel for customers with no second year; it is larger than any
# horizon plotted below, so they never count as returned.
gaps = pd.Series([
    attended[1] - attended[0] if len(attended) >= 2 else 99
    for attended in (years_attended[cid] for cid in cohort_ids)
])
cohort_size = len(cohort_ids)

# Cumulative share of the cohort that has come back within n years.
# Year 0 is pinned to 0% because a "return" requires a later calendar year.
years = [0,1,2,3,4,5]
return_rates = [
    0.0 if horizon == 0 else (gaps <= horizon).mean() * 100
    for horizon in years
]

plt.figure(figsize=(6,4))
ax = plt.gca()
ax.plot(years, return_rates, marker='o')
ax.set_xticks(years)
ax.set_xlabel('Years After Initial Event')
ax.set_ylabel('Percentage of Attendees Returned')
ax.set_title('Cumulative Retention of First-Time Attendee Cohorts (2017 & 2018)')
ax.set_ylim(0, 100)
ax.grid(alpha=0.3)

# Print each value just above its marker.
for horizon, pct in zip(years, return_rates):
    ax.text(
        horizon, pct + 2,
        f"{pct:.1f}%",
        ha='center', va='bottom',
        fontsize=9, fontweight='bold'
    )

plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt


def _first_return_gap(attended_years):
    """Return years between the first two attendance years, or 99 if there is no second year."""
    if len(attended_years) < 2:
        return 99
    return attended_years[1] - attended_years[0]


df['Start_Date'] = pd.to_datetime(df['Start_Date'])

# Earliest attendance year and sorted distinct attendance years per customer.
dates_by_customer = df.groupby('Customer_ID')['Start_Date']
first_year = dates_by_customer.min().dt.year
years_attended = dates_by_customer.apply(lambda dates: sorted(set(dates.dt.year)))

# Cohort: customers whose first recorded event fell in 2017 or 2018.
cohort_ids = first_year[first_year.isin([2017, 2018])].index
gaps = pd.Series([_first_return_gap(years_attended[cid]) for cid in cohort_ids])

# Distribution of the return gap among those who did come back (sentinel 99
# excluded), expressed as a percentage of the *whole* cohort, so the bars
# do not sum to 100% when some customers never returned.
returned = gaps[gaps < 99]
return_dist = returned.value_counts().sort_index()
return_probs = return_dist / len(gaps) * 100

plt.figure(figsize=(6,4))
ax = plt.gca()
bars = ax.bar(return_probs.index.astype(str), return_probs.values, color='steelblue')
ax.set_xlabel('Years After Initial Event')
ax.set_ylabel('Percent of Cohort Returning in That Year')
ax.set_title('Return Probability by Year (2017–2018 First-Time Attendees)')

# Write each percentage in the middle of its bar.
for bar, pct in zip(bars, return_probs.values):
    ax.text(
        bar.get_x() + bar.get_width()/2,
        pct/2,
        f"{pct:.1f}%",
        ha='center', va='center',
        color='white', fontsize=9, fontweight='bold'
    )
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

TOP_N = 10


def _ranked_counts(values, label):
    """Return a two-column frame (label, 'Count') of value frequencies, most common first."""
    ranked = values.value_counts().reset_index()
    ranked.columns = [label, 'Count']
    return ranked


def _top_ascending(ranked):
    """Keep the TOP_N most common rows, ordered smallest-first for a horizontal bar chart."""
    return ranked.head(TOP_N).sort_values('Count', ascending=True)


def _count_bar(frame, label_col):
    """Build a horizontal bar trace of 'Count' per label with the count written inside each bar."""
    return go.Bar(
        x=frame['Count'], y=frame[label_col],
        orientation='h',
        text=frame['Count'],
        textposition='inside',
        insidetextanchor='middle',
        textfont=dict(color='white'),
        hovertemplate='%{y}: %{x}<extra></extra>'
    )


df = pd.read_csv('primary_data_extra_columns.csv')

# One row per ACAS/FCAS customer, then only those with usable coordinates.
is_member = df['membership_type'].isin(['ACAS', 'FCAS'])
members = df[is_member].drop_duplicates(subset='Customer_ID')
members_with_coords = members.dropna(subset=['home_lat', 'home_lon'])

# Point map of member home locations.
fig_points = px.scatter_geo(
    members_with_coords,
    lat='home_lat', lon='home_lon',
    scope='usa',
    title='CAS Member Locations (ACAS & FCAS)'
)
fig_points.show()

# Member counts per state.
state_counts = _ranked_counts(members_with_coords['State'].dropna(), 'State')
top_states = _top_ascending(state_counts)

# Member counts per "City, State"; names are whitespace-trimmed first so
# padded variants of the same place are counted together.
city_state = (
    members_with_coords
    .dropna(subset=['City', 'State'])[['City', 'State']]
    .copy()
)
for column in ('City', 'State'):
    city_state[column] = city_state[column].astype(str).str.strip()
city_state['CityState'] = city_state['City'] + ', ' + city_state['State']

city_counts = _ranked_counts(city_state['CityState'], 'CityState')
top_cities = _top_ascending(city_counts)

# Side-by-side ranking charts: states on the left, cities on the right.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=(f"Top {TOP_N} States by CAS Member Count",
                    f"Top {TOP_N} Cities by CAS Member Count")
)

panels = [(top_states, 'State'), (top_cities, 'CityState')]
for col, (frame, label_col) in enumerate(panels, start=1):
    fig.add_trace(_count_bar(frame, label_col), row=1, col=col)

fig.update_layout(
    height=600,
    showlegend=False,
    margin=dict(l=80, r=40, t=80, b=60)
)
for col in (1, 2):
    fig.update_xaxes(title_text='Members', row=1, col=col)
for col in (1, 2):
    fig.update_yaxes(title_text='', row=1, col=col)

fig.show()

## Important Note - These charts do not render when the notebook is printed
## to PDF, but they are included in the final presentation.