### Project Overview & Problem Statement

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode
from imblearn.over_sampling import SMOTE
import warnings

# Install a blanket "ignore" filter: with no category or module given, this
# applies to every warning raised for the rest of the kernel session.
# NOTE(review): presumably meant to keep cell output clean, but it also hides
# warnings that matter (e.g. pandas / imblearn behaviour changes) — consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
# Set up Plotly rendering inside the notebook.
# NOTE(review): connected=True is documented as loading plotly.js from a CDN
# rather than embedding it, so figures would need network access to render —
# confirm that is acceptable for readers of this notebook.
init_notebook_mode(connected=True)

# ---
# # Advanced EDA: Inferred Churn and Imbalance Handling
#
#
# This notebook addresses two core challenges:
# 1.  **Defining Churn:** We develop a robust, inferred churn definition based on user behavior triggers and subsequent inactivity, as no explicit churn event exists in the data.
# 2.  **Handling Class Imbalance:** We not only identify the class imbalance but also demonstrate our strategy for mitigating it using SMOTE for visualization and `scale_pos_weight` for modeling.
# ---

In [2]:
# %pip install imbalanced-learn

### Data Loading & Inferred Churn Definition

In [3]:
# Read the event log (one JSON object per line).
df = pd.read_json("../data/customer_churn_mini.json", lines=True)

# Work on a copy restricted to "Logged In" rows, with `ts` parsed from epoch
# milliseconds into datetimes. `max_date` marks the end of the observed window.
logged_in_mask = df["auth"] == "Logged In"
df_logged_in = df.loc[logged_in_mask].copy()
df_logged_in["ts"] = pd.to_datetime(df_logged_in["ts"], unit="ms")
max_date = df_logged_in["ts"].max()

# --- Inferred churn ---
# A user is labelled churned when they (a) visited a trigger page at least once
# ("Submit Downgrade" or "Thumbs Down") and (b) their most recent event of any
# kind is strictly earlier than `cutoff_date`.
INACTIVITY_THRESHOLD = pd.Timedelta(days=30)
cutoff_date = max_date - INACTIVITY_THRESHOLD

pages = df_logged_in["page"]
downgrade_users = df_logged_in.loc[pages == "Submit Downgrade", "userId"].unique()
thumbs_down_users = df_logged_in.loc[pages == "Thumbs Down", "userId"].unique()
potential_churners = np.union1d(downgrade_users, thumbs_down_users)

# Latest timestamp per candidate user, taken over all of that user's rows
# (not only the trigger rows).
candidate_events = df_logged_in.loc[df_logged_in["userId"].isin(potential_churners)]
last_interaction = candidate_events.groupby("userId")["ts"].max()
is_inactive = last_interaction < cutoff_date
churned_user_ids = last_interaction.loc[is_inactive].index


def _tenure_days(registration):
    """Whole days from a user's earliest `registration` value (epoch ms) to `max_date`."""
    registered_at = pd.to_datetime(registration.min(), unit="ms")
    return (max_date - registered_at).days


# One row per user: tenure, number of non-null `song` entries, and churn flag.
per_user = df_logged_in.groupby("userId")
user_df = per_user.agg(
    tenure_days=("registration", _tenure_days),
    total_songs_played=("song", "count"),
).reset_index()
user_df["churn"] = user_df["userId"].isin(churned_user_ids).astype(int)

print(
    f"Identified {len(churned_user_ids)} churned users. Churn Rate: {user_df['churn'].mean():.2%}"
)

Identified 37 churned users. Churn Rate: 16.44%


### Visualizing the Original Imbalanced Data

In [4]:
# ---
# ## Visualizing the Original Data
#
# As we can see, churned users are a small minority. Any model trained on this raw data will be heavily biased towards predicting 'Active'.
# ---

# Churn Distribution Pie Chart
# Share of users in each class, with the 0/1 flag shown as a readable label.
churn_counts = user_df["churn"].value_counts()
fig = px.pie(
    values=churn_counts.values,
    names=churn_counts.index.map({0: "Active", 1: "Churned"}),
    title="<b>Initial State: Severe Class Imbalance</b>",
    hole=0.4,
)
fig.show()

# Scatter plot showing the imbalanced distribution
# Plotly Express treats a numeric `color` column as continuous: it draws a
# color bar and ignores `color_discrete_map`. Mapping the 0/1 flag to string
# labels makes the color discrete so the blue/red mapping is actually applied.
# `assign` builds a temporary frame, so `user_df["churn"]` stays an integer
# column for any later cell that needs it as a numeric target.
churn_labels = user_df["churn"].map({0: "Active", 1: "Churned"})
fig = px.scatter(
    user_df.assign(churn=churn_labels),
    x="tenure_days",
    y="total_songs_played",
    color="churn",
    title="<b>Original Data: Churners are Hard to Distinguish</b>",
    color_discrete_map={"Active": "blue", "Churned": "red"},
    # Fix the legend order regardless of which class appears first in the data.
    category_orders={"churn": ["Active", "Churned"]},
)
fig.show()

### Demonstrating Imbalance Handling with SMOTE

In [5]:
# ---
# ## Strategy: Handling Imbalance with SMOTE for Analysis
#
# To better understand the feature space of our churned users, we can use SMOTE to create a balanced dataset *for visualization purposes*. This technique generates synthetic minority samples, allowing us to see the underlying patterns more clearly.
#
# **Note:** For our actual model, we will use the `scale_pos_weight` parameter in LightGBM, as it's more computationally efficient and avoids introducing artificial data into the training process itself. This visualization is purely for EDA.
# ---

# Feature matrix and binary target taken from the user-level frame.
X, y = user_df[["tenure_days", "total_songs_played"]], user_df["churn"]

# Oversample with SMOTE; the fixed seed makes the synthetic points reproducible
# across runs.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Recombine the resampled features and labels into a single frame for plotting.
df_resampled = pd.DataFrame(X_resampled, columns=X.columns).assign(
    churn=y_resampled
)

for label, frame in (("original", X), ("resampled", X_resampled)):
    print(f"Shape of {label} data:", frame.shape)

Shape of original data: (225, 2)
Shape of resampled data: (376, 2)


### Visualizing the Balanced Data

In [6]:
# ---
# ## Visualizing the Balanced Data
#
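# After applying SMOTE, both classes are equally represented, which makes the region of feature space occupied by churners easier to see. Note that the added churn points are interpolations between real churners, so they densify existing patterns rather than reveal new separation between the classes.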
# ---

# Churn Distribution on Resampled Data
# After resampling, class 1 contains the original churners plus the
# SMOTE-generated points, so it is labelled as a mix rather than as purely
# synthetic.
class_labels = {0: "Active", 1: "Churned (real + synthetic)"}
resampled_counts = df_resampled["churn"].value_counts()
fig = px.pie(
    values=resampled_counts.values,
    names=resampled_counts.index.map(class_labels),
    title="<b>After SMOTE: A Perfectly Balanced Dataset for Analysis</b>",
    hole=0.4,
)
fig.show()

# Scatter plot on Resampled Data
# Plotly Express treats a numeric `color` column as continuous: it draws a
# color bar and ignores `color_discrete_map`. Mapping the 0/1 flag to string
# labels makes the color discrete so the blue/red mapping is actually applied.
# `assign` builds a temporary frame, leaving `df_resampled["churn"]` numeric.
fig = px.scatter(
    df_resampled.assign(churn=df_resampled["churn"].map(class_labels)),
    x="tenure_days",
    y="total_songs_played",
    color="churn",
    title="<b>Resampled Data: Clearer Separation Between Classes</b>",
    color_discrete_map={
        class_labels[0]: "blue",
        class_labels[1]: "red",
    },
    # Fix the legend order regardless of which class appears first in the data.
    category_orders={"churn": [class_labels[0], class_labels[1]]},
)
fig.show()