In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode

# Initialize Plotly for notebook mode
init_notebook_mode(connected=True)

In [2]:
df = pd.read_json("../data/customer_churn_mini.json", lines=True)
df_logged_in = df[df["auth"] == "Logged In"].copy()
df_logged_in["ts"] = pd.to_datetime(df_logged_in["ts"], unit="ms")

print("--- Verifying Lack of Explicit Churn Event ---")
page_events = df_logged_in["page"].unique()
cancellation_events = [event for event in page_events if "cancel" in event.lower()]

if not cancellation_events:
    print("Confirmed: No page events containing the word 'cancel' were found.")
else:
    print(f"Found potential cancellation events: {cancellation_events}")

--- Verifying Lack of Explicit Churn Event ---
Found potential cancellation events: ['Cancel']


# ---
###  Defining Inferred Churn
# 
# Since no explicit event exists, we must infer churn from user behavior. A user who is merely inactive is not churned; they could be on vacation. A robust definition requires two components:
# 
# 1.  **A Trigger Event:** An action indicating dissatisfaction or intent to leave (e.g., `'Submit Downgrade'`, `'Thumbs Down'`).
# 2.  **Subsequent Inactivity:** A significant period of no activity *after* the trigger event, suggesting the user has truly left.
# 
# Let's implement this.
# ---

In [3]:
# 1. Define the observation window and inactivity threshold (30 days)
max_date = df_logged_in["ts"].max()
INACTIVITY_THRESHOLD = pd.Timedelta(days=30)
cutoff_date = max_date - INACTIVITY_THRESHOLD

# 2. Identify users who performed a trigger action
downgrade_users = df_logged_in[df_logged_in["page"] == "Submit Downgrade"][
    "userId"
].unique()
thumbs_down_users = df_logged_in[df_logged_in["page"] == "Thumbs Down"][
    "userId"
].unique()
potential_churners = np.union1d(downgrade_users, thumbs_down_users)

# 3. Get the last interaction for each of these potential churners
last_interaction = (
    df_logged_in[df_logged_in["userId"].isin(potential_churners)]
    .groupby("userId")["ts"]
    .max()
)

# 4. Final churn definition
churned_user_ids = last_interaction[last_interaction < cutoff_date].index

# 5. Create our final user-level DataFrame for analysis
user_df = (
    df_logged_in.groupby("userId")
    .agg(
        tenure_days=(
            "registration",
            lambda x: (max_date - pd.to_datetime(x.min(), unit="ms")).days,
        ),
        total_songs_played=("song", "count"),
        total_listen_time=("length", "sum"),
        num_thumbs_down=("page", lambda x: (x == "Thumbs Down").sum()),
    )
    .reset_index()
)

user_df["churn"] = user_df["userId"].isin(churned_user_ids).astype(int)

print(
    f"Successfully identified {len(churned_user_ids)} churned users using our inferred definition."
)

Successfully identified 37 churned users using our inferred definition.


In [4]:
# ---
# ## Visualizing the Churn Cohorts
#
# Now that we have successfully labeled our users, let's analyze the results.
# ---

churn_counts = user_df["churn"].value_counts()
fig = px.pie(
    values=churn_counts.values,
    names=churn_counts.index.map({0: "Active", 1: "Churned"}),
    title="<b>Churn Distribution Based on Inferred Definition</b>",
    hole=0.4,
    color_discrete_map={"Active": "royalblue", "Churned": "crimson"},
)
fig.update_layout(title_x=0.5, legend_title_text="User Status")
fig.show()

# You can also generate this chart using the create_chart tool:
# chart_id = create_chart(...)
# display(chart_id)

In [5]:
# ---
# Let's compare the behavior of our two cohorts across key engagement metrics.
# ---

# Tenure Distribution
fig = px.histogram(
    user_df,
    x="tenure_days",
    color="churn",
    barmode="overlay",
    title="<b>User Tenure: Churned users tend to have been on the platform longer</b>",
    labels={"tenure_days": "Tenure (Days)", "churn": "User Status"},
    color_discrete_map={0: "royalblue", 1: "crimson"},
)
fig.update_layout(title_x=0.5)
fig.show()

# You can also generate this chart using the create_chart tool:
# chart_id = create_chart(...)
# display(chart_id)

In [6]:
# Total Songs Played
fig = px.box(
    user_df,
    x="churn",
    y="total_songs_played",
    color="churn",
    title="<b>Engagement: Churned users played significantly fewer songs</b>",
    labels={"total_songs_played": "Total Songs Played", "churn": "User Status"},
    color_discrete_map={0: "royalblue", 1: "crimson"},
)
fig.update_layout(title_x=0.5)
fig.show()

In [7]:
# ---
# ## Our Strategy for Handling Class Imbalance
#
# The EDA confirms we have a classic imbalanced classification problem. Relying on accuracy would be misleading. Our model training strategy directly addresses this:
#
# 1.  **Appropriate Metrics:** We will use the **AUC-ROC score** as our primary evaluation metric, as it is insensitive to class imbalance. We will also monitor the **F1-Score** and the **Precision-Recall Curve**, which provide a more nuanced view of performance on the positive (churn) class.
#
# 2.  **Model-Based Handling:** Instead of altering the dataset with techniques like SMOTE or downsampling (which can introduce noise or discard information), we will use a model-native solution. The LightGBM classifier has a `scale_pos_weight` parameter.
#
#     - We calculate this value as `(count of negative class / count of positive class)`.
#     - This parameter increases the penalty for misclassifying the minority (churn) class during model training, forcing the model to pay more attention to it.
#
# This is a clean, effective, and standard industry practice for handling imbalance in tree-based models.
# ---

# Example calculation as performed in our training script:
y_train_counts = user_df["churn"].value_counts()
scale_pos_weight_value = y_train_counts[0] / y_train_counts[1]
print(
    f"The calculated 'scale_pos_weight' for our model is: {scale_pos_weight_value:.2f}"
)

The calculated 'scale_pos_weight' for our model is: 5.08
