In [1]:
import pandas as pd
import numpy as np

snap = pd.read_csv("../data/processed/account_snapshots_weekly.csv", parse_dates=["snapshot_date"])

snap.shape, snap.head()

((13096, 19),
   account_id signup_date snapshot_date first_churn_date  tenure_days  \
 0   A-2e4581  2024-10-16    2024-10-21       2024-11-23            5   
 1   A-2e4581  2024-10-16    2024-10-28       2024-11-23           12   
 2   A-2e4581  2024-10-16    2024-11-04       2024-11-23           19   
 3   A-2e4581  2024-10-16    2024-11-11       2024-11-23           26   
 4   A-2e4581  2024-10-16    2024-11-18       2024-11-23           33   
 
    churn_next_30d current_plan_tier  current_seats current_is_trial  \
 0               0               Pro            9.0            False   
 1               1               Pro           13.0            False   
 2               1               Pro           66.0             True   
 3               1               Pro           66.0             True   
 4               1               Pro            9.0             True   
 
   billing_frequency auto_renew_flag  current_mrr  current_arr  \
 0           monthly            True        44

### Feature hypotheses
- Usage decline precedes churn
- Support friction increases churn risk
- Recent subscription instability increases churn
- Very new or unsubscribed accounts churn more

In [2]:
# Ensure sorted for rolling operations
snap = snap.sort_values(["account_id", "snapshot_date"]).copy()

# Weekly usage change (difference)
snap["usage_diff_week"] = (
    snap.groupby("account_id")["usage_events_30d"].diff()
)

# Approximate 30d trend as mean of last 4 weekly diffs
snap["usage_trend_30d"] = (
    snap.groupby("account_id")["usage_diff_week"]
        .rolling(window=4, min_periods=2)
        .mean()
        .reset_index(level=0, drop=True)
)

snap[["usage_events_30d", "usage_trend_30d"]].head()

Unnamed: 0,usage_events_30d,usage_trend_30d
8079,40,
8080,35,
8081,23,-8.5
8082,43,1.0
8083,27,-3.25


In [3]:
snap["usage_per_seat_30d"] = (
    snap["usage_events_30d"] / snap["current_seats"].replace(0, np.nan)
)

snap["usage_per_seat_30d"] = snap["usage_per_seat_30d"].fillna(0)

snap[["usage_events_30d", "current_seats", "usage_per_seat_30d"]].head()

Unnamed: 0,usage_events_30d,current_seats,usage_per_seat_30d
8079,40,61.0,0.655738
8080,35,61.0,0.57377
8081,23,61.0,0.377049
8082,43,61.0,0.704918
8083,27,61.0,0.442623


In [4]:
# Tenure in weeks (avoid divide-by-zero)
snap["tenure_weeks"] = (snap["tenure_days"] / 7).clip(lower=1)

snap["ticket_rate_30d"] = snap["tickets_30d"] / snap["tenure_weeks"]

snap[["tickets_30d", "tenure_weeks", "ticket_rate_30d"]].head()

Unnamed: 0,tickets_30d,tenure_weeks,ticket_rate_30d
8079,0,1.0,0.0
8080,0,1.857143,0.0
8081,0,2.857143,0.0
8082,0,3.857143,0.0
8083,1,4.857143,0.205882


In [9]:
# Recent subscription change flags
# NaN means "no subscription record for that snapshot" -> treat as 0
snap["recent_upgrade_flag"] = (
    snap["current_upgrade_flag"]
    .fillna(False)
    .astype(bool)
    .astype(int)
)

snap["recent_downgrade_flag"] = (
    snap["current_downgrade_flag"]
    .fillna(False)
    .astype(bool)
    .astype(int)
)

snap[["current_upgrade_flag", "current_downgrade_flag", "recent_upgrade_flag", "recent_downgrade_flag"]].head(10)

  .fillna(False)
  .fillna(False)


Unnamed: 0,current_upgrade_flag,current_downgrade_flag,recent_upgrade_flag,recent_downgrade_flag
8079,True,False,1,0
8080,True,False,1,0
8081,True,False,1,0
8082,True,False,1,0
8083,True,False,1,0
8084,True,False,1,0
8085,True,False,1,0
659,False,False,0,0
660,False,False,0,0
661,False,False,0,0


In [6]:
snap["seat_change_30d"] = (
    snap.groupby("account_id")["current_seats"]
        .diff(periods=4)
        .abs()
)

snap["seat_change_30d"] = snap["seat_change_30d"].fillna(0)

snap[["current_seats", "seat_change_30d"]].head()

Unnamed: 0,current_seats,seat_change_30d
8079,61.0,0.0
8080,61.0,0.0
8081,61.0,0.0
8082,61.0,0.0
8083,61.0,0.0


In [10]:
snap["no_active_subscription_flag"] = snap["current_plan_tier"].isna().astype(int)

snap[["current_plan_tier", "no_active_subscription_flag"]].head()

Unnamed: 0,current_plan_tier,no_active_subscription_flag
8079,Basic,0
8080,Basic,0
8081,Basic,0
8082,Basic,0
8083,Basic,0


In [11]:
FEATURE_COLS = [
    # Usage
    "usage_events_30d",
    "usage_events_90d",
    "usage_trend_30d",
    "usage_per_seat_30d",
    
    # Support
    "tickets_30d",
    "escalations_90d",
    "ticket_rate_30d",
    
    # Subscription dynamics
    "recent_upgrade_flag",
    "recent_downgrade_flag",
    "seat_change_30d",
    
    # Account maturity / state
    "tenure_days",
    "no_active_subscription_flag",
]

TARGET = "churn_next_30d"

cols = ["account_id", "snapshot_date"] + FEATURE_COLS + [TARGET]

features_v1 = snap[cols].copy()

features_v1.shape, features_v1.head()

((13096, 15),
      account_id snapshot_date  usage_events_30d  usage_events_90d  \
 8079   A-00bed1    2023-11-20                40                66   
 8080   A-00bed1    2023-11-27                35                66   
 8081   A-00bed1    2023-12-04                23                53   
 8082   A-00bed1    2023-12-11                43                73   
 8083   A-00bed1    2023-12-18                27                67   
 
       usage_trend_30d  usage_per_seat_30d  tickets_30d  escalations_90d  \
 8079              NaN            0.655738            0                0   
 8080              NaN            0.573770            0                0   
 8081            -8.50            0.377049            0                0   
 8082             1.00            0.704918            0                0   
 8083            -3.25            0.442623            1                0   
 
       ticket_rate_30d  recent_upgrade_flag  recent_downgrade_flag  \
 8079         0.000000              

In [12]:
# Missing values check
features_v1.isna().mean().sort_values(ascending=False)

usage_trend_30d                0.070403
snapshot_date                  0.000000
account_id                     0.000000
usage_events_30d               0.000000
usage_events_90d               0.000000
usage_per_seat_30d             0.000000
tickets_30d                    0.000000
escalations_90d                0.000000
ticket_rate_30d                0.000000
recent_upgrade_flag            0.000000
recent_downgrade_flag          0.000000
seat_change_30d                0.000000
tenure_days                    0.000000
no_active_subscription_flag    0.000000
churn_next_30d                 0.000000
dtype: float64

In [13]:
# Basic distributions
features_v1[FEATURE_COLS].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
usage_events_30d,13096.0,21.005422,16.507526,0.0,9.0,18.0,31.0,108.0
usage_events_90d,13096.0,62.783827,32.581157,0.0,39.0,59.0,83.0,192.0
usage_trend_30d,12174.0,-0.033261,5.301594,-32.5,-3.25,0.0,3.25,21.5
usage_per_seat_30d,13096.0,1.246512,2.501251,0.0,0.0,0.555556,1.388889,69.0
tickets_30d,13096.0,0.162263,0.402189,0.0,0.0,0.0,0.0,3.0
escalations_90d,13096.0,0.023748,0.156229,0.0,0.0,0.0,0.0,2.0
ticket_rate_30d,13096.0,0.023455,0.118982,0.0,0.0,0.0,0.0,3.0
recent_upgrade_flag,13096.0,0.089722,0.285794,0.0,0.0,0.0,0.0,1.0
recent_downgrade_flag,13096.0,0.032758,0.17801,0.0,0.0,0.0,0.0,1.0
seat_change_30d,13096.0,4.304215,12.366893,0.0,0.0,0.0,0.0,181.0


In [14]:
# Correlation quick look (numeric only)
features_v1[FEATURE_COLS + [TARGET]].corr()[TARGET].sort_values(ascending=False)

churn_next_30d                 1.000000
ticket_rate_30d                0.039179
no_active_subscription_flag    0.010510
recent_downgrade_flag          0.010053
usage_events_90d               0.006609
recent_upgrade_flag            0.005110
usage_per_seat_30d             0.002164
usage_events_30d              -0.002880
usage_trend_30d               -0.004706
tickets_30d                   -0.007502
escalations_90d               -0.009240
seat_change_30d               -0.022778
tenure_days                   -0.085921
Name: churn_next_30d, dtype: float64

In [15]:
output_path = "../data/processed/account_features_v1.csv"
features_v1.to_csv(output_path, index=False)

print("Wrote:", output_path)
print("Rows:", len(features_v1), "Cols:", features_v1.shape[1])

Wrote: ../data/processed/account_features_v1.csv
Rows: 13096 Cols: 15


In [16]:
features_v1.shape

(13096, 15)

In [19]:
# Top 5 Correlations with churn_next_30d of each numeric feature with chrun
corr = (
    features_v1
        .drop(columns=["account_id", "snapshot_date"])
        .corr()["churn_next_30d"]
        .sort_values(ascending=False)
)

corr.head(5)


churn_next_30d                 1.000000
ticket_rate_30d                0.039179
no_active_subscription_flag    0.010510
recent_downgrade_flag          0.010053
usage_events_90d               0.006609
Name: churn_next_30d, dtype: float64

In [20]:
# To check unexpected missing values
# Fraction of missing values per column
features_v1.isna().mean().sort_values(ascending=False)

usage_trend_30d                0.070403
snapshot_date                  0.000000
account_id                     0.000000
usage_events_30d               0.000000
usage_events_90d               0.000000
usage_per_seat_30d             0.000000
tickets_30d                    0.000000
escalations_90d                0.000000
ticket_rate_30d                0.000000
recent_upgrade_flag            0.000000
recent_downgrade_flag          0.000000
seat_change_30d                0.000000
tenure_days                    0.000000
no_active_subscription_flag    0.000000
churn_next_30d                 0.000000
dtype: float64