In [None]:
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Primary registration data, read straight from CSV into a Polars DataFrame.
primary_df = pl.read_csv(source="primary_data.csv")

In [None]:
# Secondary attendance data, read straight from CSV into a Polars DataFrame.
secondary_df = pl.read_csv(source="secondary_data.csv")

In [None]:
# Reference table accompanying the primary data.
# NOTE(review): not used by any cell visible in this notebook section — confirm it is still needed.
reference_df = pl.read_csv(source="primary_data_reference.csv")

In [None]:
# Step 1: boolean flags — a non-null date in the matching column sets the flag.
# Step 2: collapse the flags into one label; FCAS is tested first, so it wins
# when both flags are set.
primary_df = (
    primary_df
    .with_columns(
        ACAS=pl.col("associate_date").is_not_null(),
        FCAS=pl.col("Fellow_Date").is_not_null(),
    )
    .with_columns(
        membership_type=(
            pl.when(pl.col("FCAS")).then(pl.lit("FCAS"))
            .when(pl.col("ACAS")).then(pl.lit("ACAS"))
            .otherwise(pl.lit("non-member"))
        )
    )
)

In [None]:
# pandas copies of the registration rows, one per Delivery code, for the pie charts.
pdf_pie_IP, pdf_pie_LS, pdf_pie_V = (
    primary_df.filter(pl.col("Delivery") == delivery_code).to_pandas()
    for delivery_code in ("IP", "LS", "V")
)
# Unfiltered copy covering every row regardless of Delivery.
pdf_pie_all = primary_df.to_pandas()

In [None]:
import matplotlib.pyplot as plt

def plot_pies(df, group_col, event_col="Event_Code"):
    """Draw one pie chart per event showing the share of each ``group_col`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to summarise; must contain the ``group_col`` and ``event_col`` columns.
    group_col : str
        Column whose value counts become the pie slices.
    event_col : str, default "Event_Code"
        Column identifying the event; every distinct non-missing value gets its own pie.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. When ``df`` contains no
        non-missing event codes a short notice is printed and nothing is drawn.
    """
    # Drop missing codes: an equality test against NaN/None never matches, so a
    # missing code would only ever produce an empty subset (and an empty pie).
    event_codes = df[event_col].dropna().unique()
    n = len(event_codes)

    # plt.subplots rejects zero columns, so bail out before creating a figure.
    if n == 0:
        print(f"plot_pies: no events to plot for {group_col}")
        return

    # squeeze=False always returns a 2-D array of axes, so a single event needs
    # no special-casing.
    fig, axes = plt.subplots(1, n, figsize=(4 * n, 4), squeeze=False)

    for ax, event in zip(axes[0], event_codes):
        subset = df[df[event_col] == event]
        counts = subset[group_col].value_counts()

        ax.pie(counts, labels=counts.index, autopct="%1.1f%%")
        ax.set_title(f"{event} - {group_col}")

    plt.show()


Pie charts for registrations with Delivery = IP

In [None]:
# One row of pies per grouping column, restricted to Delivery == 'IP' rows.
for grouping in ("registrant_type_code", "membership_type", "State", "Country"):
    plot_pies(pdf_pie_IP, grouping)

Pie charts for registrations with Delivery = LS

In [None]:
# One row of pies per grouping column, restricted to Delivery == 'LS' rows.
for grouping in ("registrant_type_code", "membership_type", "State", "Country"):
    plot_pies(pdf_pie_LS, grouping)

Pie charts for registrations with Delivery = V

In [None]:
# One row of pies per grouping column, restricted to Delivery == 'V' rows.
for grouping in ("registrant_type_code", "membership_type", "State", "Country"):
    plot_pies(pdf_pie_V, grouping)

Pie charts for all events (every Delivery value)

In [None]:
# One row of pies per grouping column, using every row of the primary data.
for grouping in ("registrant_type_code", "membership_type", "State", "Country"):
    plot_pies(pdf_pie_all, grouping)

In [None]:
# Hand-entered parent-event label for each row of `primary_df_grouped`.
# The labels are attached POSITIONALLY after that frame is sorted by Start_Date,
# so this list must have exactly one entry per Event_Title group, in date order.
# Adjacent duplicates mark separate Event_Title rows that share one parent event.
# NOTE(review): any change to the input data (new event, unparseable date) shifts
# the alignment silently — re-check this list whenever primary_data.csv changes.
primary_df_parent_events = [
    "2015RPM", "15Spring", "2015REINS", "15CLRS", "15CLRS", "15Annual", "2016RPM", "16Spring", "16Spring", "2016REINS",
    "16CLRS", "16CLRS", "16Annual", "17RPM", "17Spring", "17Spring", "2017REINS", "17CLRS", "17CLRS", "17Annual",
    "17Annual", "18RPM", "18Spring", "18Spring", "2018REINS", "18CLRS", "18CLRS", "18Annual", "18Annual", "19RPM",
    "19RPM", "19Spring", "19Spring", "2019REINS", "2019REINS", "19CLRS", "19CLRS", "19Annual", "19Annual", "20VirSprng",
    "20VREINS", "20VRPM", "20VRCLRS", "20VirAnn", "21VRPM", "21VirSprng", "21VREINS", "21VRCLRS", "21Annual", "21Annual",
    "RPMLive22", "22Spring", "22Spring", "22VREINS", "22CLRS", "22CLRS", "22Annual", "22Annual", "23RPM", "23RPM",
    "23Spring", "23Spring", "23REI", "23REI", "23CLRS", "23CLRS", "23Annual", "23Annual", "24RPM", "24RPM",
    "24Spring", "24Spring", "24Reins", "24Reins", "24CLRS", "24CLRS", "24Annual", "24Annual",
]


In [None]:
def add_event_type_column(df: pl.DataFrame) -> pl.DataFrame:
    """Return ``df`` with an ``event_type`` column derived from ``parent_event``.

    The lower-cased ``parent_event`` text is checked against a list of
    fragments in priority order; the first fragment found decides the label.
    Rows matching no fragment are labelled ``"type not found"``.
    """
    parent_lower = pl.col("parent_event").str.to_lowercase()

    # (fragment, label) pairs, tested top to bottom — the first hit wins.
    rules = [
        ("ann", "Annual"),
        ("clrs", "CLRS"),
        ("rpm", "RPM"),
        ("rei", "REINS"),
        ("spr", "Spring"),
    ]

    (head_fragment, head_label), *remaining = rules
    chain = pl.when(parent_lower.str.contains(head_fragment)).then(pl.lit(head_label))
    for fragment, label in remaining:
        chain = chain.when(parent_lower.str.contains(fragment)).then(pl.lit(label))

    event_type = chain.otherwise(pl.lit("type not found")).alias("event_type")
    return df.with_columns(event_type)

In [None]:
# One row per Event_Title: the first-seen event details plus a registrant count.
# Start_Date is then parsed (unparseable values become null because strict=False),
# the rows are ordered by date, and the hand-entered parent-event labels are
# attached by position.
# NOTE(review): the positional labelling depends on this exact sort order and on
# the list length matching the number of Event_Title groups.
primary_df_grouped = (
    primary_df
    .group_by("Event_Title")
    .agg([
        pl.col(
            ["Event_Code", "Start_Date", "Delivery", "Location_City", "Location_State"]
        ).first(),
        pl.len().alias("attendance"),
    ])
    .with_columns(
        pl.col("Start_Date").str.strptime(pl.Date, "%m/%d/%Y", strict=False)
    )
    .sort("Start_Date")
    .with_columns(pl.Series("parent_event", primary_df_parent_events))
)

In [None]:
# Add the event_type label and a "City, State" display string to every event row.
primary_df_grouped = add_event_type_column(primary_df_grouped).with_columns(
    pl.concat_str(
        [pl.col("Location_City"), pl.col("Location_State")],
        separator=", ",
    ).alias("city_state")
)

# Combined IP + LS attendance per parent event. Descriptive columns take the
# first value in each group; the title and code are left null because a
# combined row does not correspond to a single event.
totals_ip_ls = (
    primary_df_grouped
    .filter(pl.col("Delivery").is_in(["IP", "LS"]))
    .group_by("parent_event")
    .agg([
        pl.col("attendance").sum(),
        pl.lit("IP+LS").alias("Delivery"),
        pl.col(
            ["Location_City", "Location_State", "city_state", "event_type", "Start_Date"]
        ).first(),
        pl.lit(None, dtype=pl.Utf8).alias("Event_Title"),
        pl.lit(None, dtype=pl.Utf8).alias("Event_Code"),
    ])
)

# Diagonal concat aligns the two frames by column name rather than by position.
primary_df_grouped_combined = pl.concat(
    [primary_df_grouped, totals_ip_ls],
    how="diagonal",
)

In [None]:
# pandas copies of the combined attendance table, one per event type.
pdf_line_CLRS, pdf_line_Annual, pdf_line_RPM, pdf_line_REINS, pdf_line_Spring = (
    primary_df_grouped_combined.filter(pl.col("event_type") == type_label).to_pandas()
    for type_label in ("CLRS", "Annual", "RPM", "REINS", "Spring")
)

In [None]:
def plot_line_by_delivery(df, event_type):
    """Plot attendance over time with one line per ``Delivery`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``Start_Date``, ``attendance``, ``Delivery`` and
        ``city_state`` columns.
    event_type : str
        Text inserted at the start of the chart title.
    """
    _, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(
        data=df,
        x="Start_Date",
        y="attendance",
        hue="Delivery",
        marker="o",
        ax=ax,
    )

    # Label every point with its city/state, nudged 6 points above the marker.
    label_style = dict(
        xytext=(0, 6),
        textcoords="offset points",
        ha="center",
        va="bottom",
        fontsize=8,
    )
    for when, count, place in zip(df["Start_Date"], df["attendance"], df["city_state"]):
        ax.annotate(place, xy=(when, count), **label_style)

    ax.set_title(f"{event_type} Event Attendance Over Time", fontsize=16)
    ax.set_xlabel("Event Date")
    ax.set_ylabel("Attendance")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# One attendance chart per event type, each split by Delivery.
for frame, title_prefix in (
    (pdf_line_CLRS, "CLRS"),
    (pdf_line_Annual, "Annual"),
    (pdf_line_RPM, "RPM"),
    (pdf_line_REINS, "REINS"),
    (pdf_line_Spring, "Spring"),
):
    plot_line_by_delivery(frame, title_prefix)

In [None]:
# Fixed ordering so every chart assigns the same color to the same event type.
event_type_order = ["Annual", "CLRS", "REINS", "RPM", "Spring"]

colors = sns.color_palette("tab10", n_colors=len(event_type_order))
EVENT_TYPE_TO_COLOR = {
    type_name: shade for type_name, shade in zip(event_type_order, colors)
}

# pandas copies of the combined attendance table, one per Delivery value.
pdf_line_IP, pdf_line_LS, pdf_line_IPLS = (
    primary_df_grouped_combined.filter(pl.col("Delivery") == delivery_code).to_pandas()
    for delivery_code in ("IP", "LS", "IP+LS")
)

# Store event_type as a categorical restricted to the fixed ordering above.
for df in (pdf_line_IP, pdf_line_LS, pdf_line_IPLS):
    df["event_type"] = pd.Categorical(
        df["event_type"],
        categories=event_type_order,
    )

In [None]:
def plot_line_by_event_type(df, event_type):
    """Plot attendance over time with one line per ``event_type`` value.

    Line colors and legend order come from the notebook-level
    ``EVENT_TYPE_TO_COLOR`` and ``event_type_order``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``Start_Date``, ``attendance``, ``event_type`` and
        ``city_state`` columns.
    event_type : str
        Text inserted at the start of the chart title (the callers in this
        notebook pass a delivery label such as ``"IP"``).
    """
    _, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(
        data=df,
        x="Start_Date",
        y="attendance",
        hue="event_type",
        hue_order=event_type_order,
        palette=EVENT_TYPE_TO_COLOR,
        marker="o",
        ax=ax,
    )

    # Label every point with its city/state, nudged 6 points above the marker.
    label_style = dict(
        xytext=(0, 6),
        textcoords="offset points",
        ha="center",
        va="bottom",
        fontsize=8,
    )
    for when, count, place in zip(df["Start_Date"], df["attendance"], df["city_state"]):
        ax.annotate(place, xy=(when, count), **label_style)

    ax.set_title(f"{event_type} Event Attendance Over Time", fontsize=16)
    ax.set_xlabel("Event Date")
    ax.set_ylabel("Attendance")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# One attendance chart per Delivery value, each split by event type.
for frame, title_prefix in (
    (pdf_line_IP, "IP"),
    (pdf_line_LS, "LS"),
    (pdf_line_IPLS, "IP+LS"),
):
    plot_line_by_event_type(frame, title_prefix)

In [None]:
# Hand-entered event-type code for each row of `secondary_df_grouped`.
# The codes are attached POSITIONALLY after that frame is grouped by Meeting and
# sorted by Begin_Date, so this list must have exactly one entry per Meeting
# group, in date order.
# NOTE(review): any change to secondary_data.csv (new meeting, unparseable date)
# shifts the alignment silently — re-check this list whenever the data changes.
secondary_event_types = [
    'CLRS', 'CLRS', 'ANN', 'ANN', 'RPM', 'RPM', 'SPR', 'SPR', 'SPR', 'REI', 'REI', 'CLRS',
    'CLRS', 'CLRS', 'CSAF', 'ANN', 'ANN', 'ANN', 'ANN', 'RPM', 'RPM', 'SPR', 'SPR', 'REI',
    'REI', 'CLRS', 'CLRS', 'ANN', 'ANN', 'ANN', 'RPM', 'RPM', 'SPR', 'SPR', 'REI', 'REI',
    'CLRS', 'CLRS', 'ANN', 'ANN', 'ANN', 'RPM', 'RPM', 'RPM', 'SPR', 'SPR', 'REI', 'REI',
    'CLRS', 'CLRS', 'ANN', 'ANN', 'RPM', 'RPM', 'RPM', 'RPM', 'SPR', 'SPR', 'SPR', 'REI',
    'REI', 'CARE', 'CLRS', 'CLRS', 'ANN', 'ANN', 'ANN', 'RPM', 'SPR', 'SPR', 'SPR', 'REI',
    'REI', 'REI', 'REI', 'RPM', 'CLRS', 'CLRS', 'CLRS', 'CLRS', 'CSAF', 'ANN', 'ANN', 'RPM',
    'RPM', 'RPM', 'RPM', 'SPR', 'SPR', 'SPR', 'REI', 'REI', 'REI', 'REI', 'CLRS', 'CLRS',
    'CLRS', 'CLRS', 'ANN', 'ANN', 'RPM', 'RPM', 'RPM', 'RPM', 'SPR', 'SPR', 'REI', 'REI',
    'CLRS', 'CLRS', 'CLRS', 'CLRS', 'ANN', 'RPM', 'RPM', 'RPM', 'SPR', 'REI', 'REI', 'REI',
    'CLRS', 'CLRS', 'CLRS', 'ANN', 'RPM', 'RPM', 'SPR', 'REI', 'REI', 'REI', 'CLRS', 'CLRS',
    'CLRS', 'ANN', 'RPM', 'RPM', 'RPM', 'SPR', 'REI', 'REI', 'REI'
]


In [None]:
# One row per Meeting with its first Begin_Date and a registrant count. The date
# is parsed (unparseable values become null because strict=False), rows are
# ordered by date, and the hand-entered event-type codes are attached by position.
# "Year" is the first four characters of Meeting; year_and_event joins it to the code.
secondary_df_grouped = (
    secondary_df
    .group_by("Meeting")
    .agg([
        pl.col("Begin_Date").first(),
        pl.len().alias("attendance"),
    ])
    .with_columns(
        pl.col("Begin_Date").str.strptime(pl.Date, "%m/%d/%Y", strict=False)
    )
    .sort("Begin_Date")
    .with_columns(
        pl.Series("event_type", secondary_event_types),
        pl.col("Meeting").str.slice(0, 4).alias("Year"),
    )
    .with_columns(
        pl.concat_str(
            [pl.col("Year"), pl.col("event_type")],
            separator="",
        ).alias("year_and_event")
    )
)

# Total attendance per year/event-type combination, in date order.
secondary_event_totals = (
    secondary_df_grouped
    .group_by("year_and_event")
    .agg([
        pl.col("attendance").sum(),
        pl.col(["Begin_Date", "event_type"]).first(),
    ])
    .sort("Begin_Date")
)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Exclude the 2014 Annual meeting: it was the CAS Centennial Celebration, an
# outlier that distorts the chart.
pdf = secondary_event_totals.filter(pl.col("year_and_event") != "2014ANN").to_pandas()

fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=pdf, x="Begin_Date", y="attendance", hue="event_type", marker="o", ax=ax)

ax.set(title="Attendance over time by event type", xlabel="Date", ylabel="Attendance")

# One major tick per calendar year, labelled with the four-digit year.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
ax.grid(True, alpha=0.3)

# Anchor the legend just outside the right edge so it does not cover the lines.
ax.legend(
    title="Event type",
    frameon=False,
    loc="upper left",
    bbox_to_anchor=(1.02, 1),
)
plt.tight_layout()
plt.show()
