# About

- RFM Segmentation
- RFM and Churn Connection

# Preparation

## Libraries

In [57]:
import pandas as pd

In [58]:
import numpy as np

In [59]:
from dotenv import load_dotenv
import os

In [60]:
import maika_eda_pandas as mk

In [61]:
from scipy import stats

In [62]:
from src.core.transforms import (
    transform_transactions_df,
    transform_customers_df,
    get_customers_screenshot_summary_from_transactions_df,
    rfm_segment,
    add_churn_status,
)

In [63]:
import plotly.express as px
import plotly.graph_objects as go

In [130]:
# Features Processing

from sklearn.model_selection import train_test_split

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif

In [65]:
def check_nan_in_df_cols(df):
    # Get relative percentage of nulls by column
    null_features_proportion = (
        df.isna().sum() / len(df)
    ).sort_values(ascending=False)

    high_proportion = []
    medium_proportion = []
    low_proportion = []

    for feature, proportion in null_features_proportion.items():
        if proportion >= 0.20:
            high_proportion.append(feature)
        elif 0.05 <= proportion < 0.20:
            medium_proportion.append(feature)
        else:
            low_proportion.append(feature)

    # Build features DataFrame
    features_df = null_features_proportion.reset_index()
    features_df.columns = ["feature", "nan_proportion"]

    features_df["NaN group"] = features_df["feature"].apply(
        lambda f: (
            "High" if f in high_proportion
            else "Medium" if f in medium_proportion
            else "Low"
        )
    )

    # Print counts (same behavior as before)
    print("Total features:", len(df.columns))
    print("Information on NaN values")
    print("====================================")
    print("Number of High Proportion Features:", len(high_proportion))
    print("Number of Medium Proportion Features:", len(medium_proportion))
    print("Number of Low Proportion Features:", len(low_proportion))

    return features_df


## Environment

In [66]:
load_dotenv()

True

In [67]:
SEED_CUSTOMERS=os.getenv("SEED_CUSTOMERS")
SEED_TRANSACTIONS=os.getenv("SEED_TRANSACTIONS")

In [68]:
MAX_DATA_DATE = pd.Timestamp('2025-12-31')

In [69]:
TRAIN_SNAPSHOT_DATE = MAX_DATA_DATE - pd.Timedelta(90, 'day')

## Data

### Read all time data

In [70]:
customers_df = pd.read_csv(f"../{SEED_CUSTOMERS}")

In [71]:
transactions_df = pd.read_csv(f"../{SEED_TRANSACTIONS}")

In [72]:
mk.read_data_info(transactions_df)

Number of columns: 3
Column names: ['customer_id', 'transaction_date', 'amount']
Number of rows: 46,704
Data Preview: 

  customer_id transaction_date  amount
0      C00000       2025-09-10  195.78
1      C00000       2025-09-12   50.87
2      C00000       2025-10-01  133.25
3      C00000       2025-10-16   37.44
4      C00000       2025-10-18  101.95


In [73]:
mk.read_data_info(customers_df)

Number of columns: 3
Column names: ['customer_id', 'signup_date', 'true_lifetime_days']
Number of rows: 3,000
Data Preview: 

  customer_id signup_date  true_lifetime_days
0      C00000  2025-08-22                 204
1      C00001  2025-03-07                 365
2      C00002  2025-08-18                  48
3      C00003  2025-09-22                  84
4      C00004  2025-05-28                 113


### Transform all time data

In [74]:
transactions_df = transform_transactions_df(transactions_df)

In [75]:
customers_df = transform_customers_df(customers_df)

### Limit data

In [76]:
transactions_modeling_df = transactions_df[transactions_df['transaction_date'] <= TRAIN_SNAPSHOT_DATE]

In [77]:
customers_modeling_df = pd.merge(
    pd.DataFrame({'customer_id': transactions_modeling_df['customer_id'].unique()}),
    customers_df,
    on='customer_id',
    how='inner'
)

In [78]:
customers_modeling_df

Unnamed: 0,customer_id,signup_date,true_lifetime_days,termination_date
0,C00000,2025-08-22,204,2026-03-14
1,C00001,2025-03-07,365,2026-03-07
2,C00002,2025-08-18,48,2025-10-05
3,C00004,2025-05-28,113,2025-09-18
4,C00006,2025-08-22,117,2025-12-17
...,...,...,...,...
2259,C02990,2025-02-01,307,2025-12-05
2260,C02993,2025-03-01,134,2025-07-13
2261,C02994,2025-01-30,112,2025-05-22
2262,C02996,2025-06-03,308,2026-04-07


### Define churn labels

Logic to create training set:
- MAX_DATA_DATE: cut off of observation time.
- MAX_DATA_DATE - 90: the observation time cutoff for the data used to train our models.

In [79]:
CUTOFF_TRAINING_DATE = MAX_DATA_DATE - pd.Timedelta(90, unit='day')

In [80]:
ndays = [30, 60, 90]
for nday in ndays:
    var_name = f"is_churn_{nday}_days"
    timestamp_date = MAX_DATA_DATE - pd.Timedelta(nday, unit='day')
    customers_modeling_df[var_name] = add_churn_status(transformed_customers_df=customers_df, observed_date=timestamp_date, desired_df=None)

# Feature Engineering

## Transaction Features

Adding more features to transactions data:
- days_since_last_transaction
- days_until_next_transaction
- customer_transaction_order

In [81]:
transactions_modeling_df = transactions_modeling_df.sort_values(['customer_id', 'transaction_date'])

In [82]:
transactions_modeling_df['customer_transaction_order'] = transactions_modeling_df.groupby('customer_id').cumcount()

In [83]:
transactions_modeling_df['prev_transaction_date'] = transactions_modeling_df.groupby('customer_id')['transaction_date'].shift(1)
transactions_modeling_df['next_transaction_date'] = transactions_modeling_df.groupby('customer_id')['transaction_date'].shift(-1)

In [84]:
transactions_modeling_df['days_since_previous_transaction'] = (transactions_modeling_df['transaction_date'] - transactions_modeling_df['prev_transaction_date']).dt.days
transactions_modeling_df['days_until_next_transaction'] = (transactions_modeling_df['next_transaction_date'] - transactions_modeling_df['transaction_date']).dt.days

In [85]:
# Get the first transaction date for each customer
transactions_modeling_df['first_transaction_date'] = transactions_modeling_df.groupby('customer_id')['transaction_date'].transform('min')

# Compute days since first transaction
transactions_modeling_df['days_since_first_transaction'] = (
    transactions_modeling_df['transaction_date'] - transactions_modeling_df['first_transaction_date']
).dt.days

In [86]:
transactions_modeling_df

Unnamed: 0,customer_id,transaction_date,amount,customer_transaction_order,prev_transaction_date,next_transaction_date,days_since_previous_transaction,days_until_next_transaction,first_transaction_date,days_since_first_transaction
0,C00000,2025-09-10,195.78,0,NaT,2025-09-12,,2.0,2025-09-10,0
1,C00000,2025-09-12,50.87,1,2025-09-10,2025-10-01,2.0,19.0,2025-09-10,2
2,C00000,2025-10-01,133.25,2,2025-09-12,NaT,19.0,,2025-09-10,21
12,C00001,2025-03-17,66.11,0,NaT,2025-04-23,,37.0,2025-03-17,0
13,C00001,2025-04-23,38.28,1,2025-03-17,2025-05-22,37.0,29.0,2025-03-17,37
...,...,...,...,...,...,...,...,...,...,...
46670,C02999,2025-09-16,8.02,41,2025-09-14,2025-09-16,2.0,0.0,2025-05-26,113
46671,C02999,2025-09-16,30.10,42,2025-09-16,2025-09-28,0.0,12.0,2025-05-26,113
46672,C02999,2025-09-28,11.59,43,2025-09-16,2025-09-28,12.0,0.0,2025-05-26,125
46673,C02999,2025-09-28,103.22,44,2025-09-28,2025-10-02,0.0,4.0,2025-05-26,125


In [87]:
check_nan_in_df_cols(transactions_modeling_df)

Total features: 10
Information on NaN values
Number of High Proportion Features: 0
Number of Medium Proportion Features: 4
Number of Low Proportion Features: 6


Unnamed: 0,feature,nan_proportion,NaN group
0,prev_transaction_date,0.088327,Medium
1,next_transaction_date,0.088327,Medium
2,days_since_previous_transaction,0.088327,Medium
3,days_until_next_transaction,0.088327,Medium
4,customer_id,0.0,Low
5,transaction_date,0.0,Low
6,amount,0.0,Low
7,customer_transaction_order,0.0,Low
8,first_transaction_date,0.0,Low
9,days_since_first_transaction,0.0,Low


## RFM Features

RFM can be used to show two information:
- lifetime behavior
- behavior trends

So I wrote a loop to create RFM features based on different time windows: All time, within the last 30 days, within the last 60 days and within the last 90 days. I technically can add more.
- I also added tenure: Days between the first purchase and the cutoff observed date. If the time window is 30: It is days between the first purchase and 30 days before the cutoff observed date.
- Reason: I believe tenure is a reflection of a customer's loyalty. Also, the summary table has enough data to create this feature easily.

In [88]:
def get_rfm_window_features(customers_df, transactions_df, observed_date):

    rfm_time_windows = ["all_time", "30d", "60d", "90d"]

    for rfm_time_window in rfm_time_windows:

        if rfm_time_window == "all_time":
            filtered_transactions_df = transactions_df
        else:
            # Limit data to the new cutoff
            days = int(rfm_time_window.strip("d"))
            filtered_transactions_df = transactions_df[
                (transactions_df['transaction_date'] <= observed_date - pd.Timedelta(days=days))
            ]

        # Get a Customers Screenshot Summary DataFrame. It has RFM features and other variables that RFM features depend on.
        summary_modeling_df = get_customers_screenshot_summary_from_transactions_df(
            transactions_df=filtered_transactions_df,
            observed_date=observed_date,
            column_names=["customer_id", "transaction_date", "amount"]
        )

        # Keep only customer_id and the RFM columns we care about
        summary_modeling_df = summary_modeling_df[[
            'customer_id',
            'days_until_observed',
            'period_transaction_count',
            'period_total_amount',
            'period_tenure_days'
        ]]

        # Rename columns in the summary DF, not the main DF
        summary_modeling_df = summary_modeling_df.rename(columns={
            'days_until_observed': f'rfm_recency_{rfm_time_window}',
            'period_transaction_count': f'rfm_frequency_{rfm_time_window}',
            'period_total_amount': f'rfm_monetary_{rfm_time_window}',
            'period_tenure_days': f'tenure_{rfm_time_window}'
        })
        
        # Merge with current data used for modelling.
        customers_df = pd.merge(
            customers_df,
            summary_modeling_df,
            on="customer_id",
            how="left"
        )

    return customers_df

In [89]:
customers_modeling_df = get_rfm_window_features(customers_df=customers_modeling_df, transactions_df=transactions_modeling_df, observed_date=CUTOFF_TRAINING_DATE)

In [90]:
customers_modeling_df

Unnamed: 0,customer_id,signup_date,true_lifetime_days,termination_date,is_churn_30_days,is_churn_60_days,is_churn_90_days,rfm_recency_all_time,rfm_frequency_all_time,rfm_monetary_all_time,...,rfm_monetary_30d,tenure_30d,rfm_recency_60d,rfm_frequency_60d,rfm_monetary_60d,tenure_60d,rfm_recency_90d,rfm_frequency_90d,rfm_monetary_90d,tenure_90d
0,C00000,2025-08-22,204,2026-03-14,0,0,0,1,3,379.90,...,,,,,,,,,,
1,C00001,2025-03-07,365,2026-03-07,0,0,0,21,11,620.79,...,585.34,138.0,61.0,10.0,585.34,138.0,100.0,6.0,226.67,99.0
2,C00002,2025-08-18,48,2025-10-05,1,1,0,6,11,910.64,...,620.80,11.0,,,,,,,,
3,C00004,2025-05-28,113,2025-09-18,0,0,0,18,19,2018.94,...,1866.80,69.0,61.0,13.0,1451.43,55.0,95.0,6.0,663.50,21.0
4,C00006,2025-08-22,117,2025-12-17,1,1,1,28,1,20.20,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2259,C02990,2025-02-01,307,2025-12-05,1,1,1,4,18,2207.01,...,1666.78,209.0,61.0,10.0,1588.01,180.0,117.0,9.0,1479.59,124.0
2260,C02993,2025-03-01,134,2025-07-13,0,0,0,102,8,1090.93,...,1090.93,112.0,102.0,8.0,1090.93,112.0,102.0,8.0,1090.93,112.0
2261,C02994,2025-01-30,112,2025-05-22,0,0,0,140,20,1474.70,...,1474.70,102.0,140.0,20.0,1474.70,102.0,140.0,20.0,1474.70,102.0
2262,C02996,2025-06-03,308,2026-04-07,1,1,1,4,6,235.07,...,206.78,72.0,72.0,3.0,163.75,32.0,104.0,1.0,30.96,0.0


In [91]:
customers_modeling_df.count()

customer_id               2264
signup_date               2264
true_lifetime_days        2264
termination_date          2264
is_churn_30_days          2264
is_churn_60_days          2264
is_churn_90_days          2264
rfm_recency_all_time      2264
rfm_frequency_all_time    2264
rfm_monetary_all_time     2264
tenure_all_time           2264
rfm_recency_30d           1994
rfm_frequency_30d         1994
rfm_monetary_30d          1994
tenure_30d                1994
rfm_recency_60d           1731
rfm_frequency_60d         1731
rfm_monetary_60d          1731
tenure_60d                1731
rfm_recency_90d           1443
rfm_frequency_90d         1443
rfm_monetary_90d          1443
tenure_90d                1443
dtype: int64

In [92]:
customers_modeling_df.columns

Index(['customer_id', 'signup_date', 'true_lifetime_days', 'termination_date',
       'is_churn_30_days', 'is_churn_60_days', 'is_churn_90_days',
       'rfm_recency_all_time', 'rfm_frequency_all_time',
       'rfm_monetary_all_time', 'tenure_all_time', 'rfm_recency_30d',
       'rfm_frequency_30d', 'rfm_monetary_30d', 'tenure_30d',
       'rfm_recency_60d', 'rfm_frequency_60d', 'rfm_monetary_60d',
       'tenure_60d', 'rfm_recency_90d', 'rfm_frequency_90d',
       'rfm_monetary_90d', 'tenure_90d'],
      dtype='object')

In [93]:
check_nan_in_df_cols(customers_modeling_df)

Total features: 23
Information on NaN values
Number of High Proportion Features: 8
Number of Medium Proportion Features: 4
Number of Low Proportion Features: 11


Unnamed: 0,feature,nan_proportion,NaN group
0,tenure_90d,0.362633,High
1,rfm_monetary_90d,0.362633,High
2,rfm_frequency_90d,0.362633,High
3,rfm_recency_90d,0.362633,High
4,tenure_60d,0.235424,High
5,rfm_monetary_60d,0.235424,High
6,rfm_frequency_60d,0.235424,High
7,rfm_recency_60d,0.235424,High
8,rfm_frequency_30d,0.119258,Medium
9,tenure_30d,0.119258,Medium


It is expected that the window RFM features will have lots of NaNs. This is because transactions occur more at the later dates.

## Activity Trend Features

Some possile features:
- Number of actions (activity) -> Unavailable
- Slope of transaction features
    - Say a customer k have n transactions.
    - For each customer, we fit a linear regression line: y = b0 + b1*x1
        - where y is a feature from the transactions dataset
        - x1 is the time index (starts at 0, first signup day of all customers)
- Statistics of transaction features
    - Min
    - Mean
    - Mode
    - Max
    - q1
    - q5
    - q10
    - q20
    - q30
    - ...
    - q90
    - q95
    - q99

### Slope

In [94]:
def get_slope_features(customers_df, transactions_df, observed_date, feature_list):

    time_windows = ["all_time", "30d", "60d", "90d"]

    for time_window in time_windows:

        if time_window == "all_time":
            filtered_transactions_df = transactions_df
        else:
            # Limit data to the new cutoff
            days = int(time_window.strip("d"))
            filtered_transactions_df = transactions_df[
                (transactions_df['transaction_date'] <= observed_date - pd.Timedelta(days=days))
            ]

    customers_list = filtered_transactions_df['customer_id'].unique()

    slopes = {}

    for customer_id in customers_list:

        customer_transactions = filtered_transactions_df[filtered_transactions_df['customer_id'] == customer_id]

        x = np.arange(len(customer_transactions)) #time axis
        slopes[customer_id] = {} #initiate value list

        for feature_name in feature_list:
            y = customer_transactions[feature_name].values
            x_valid = x[~np.isnan(y)]
            y_valid = y[~np.isnan(y)]

            if len(y_valid) < 2:
                slopes[customer_id][feature_name] = np.nan
            else:
                slope = np.polyfit(x_valid, y_valid, 1)[0]
                slopes[customer_id][feature_name] = slope

    # Convert dict of dicts into dataframe
    slope_features_df = pd.DataFrame.from_dict(slopes, orient='index')

    # Rename columns to have slope_ prefix
    slope_features_df = slope_features_df.rename(columns={f: f'slope_{f}' for f in slope_features_df.columns})

    # Reset index to have customer_id as a column
    slope_features_df = slope_features_df.reset_index().rename(columns={'index': 'customer_id'})

    # Merge with current data used for modelling.
    customers_df = pd.merge(
        customers_df,
        slope_features_df,
        on="customer_id",
        how="left"
    )

    return customers_df

In [95]:
customers_modeling_df = get_slope_features(
    customers_df=customers_modeling_df,
    transactions_df=transactions_modeling_df,
    observed_date=CUTOFF_TRAINING_DATE,
    feature_list=[
        'amount',
        'days_since_previous_transaction',
        'days_until_next_transaction',
        'customer_transaction_order',
        'days_since_first_transaction'
    ]
)

In [96]:
customers_modeling_df.count()

customer_id                              2264
signup_date                              2264
true_lifetime_days                       2264
termination_date                         2264
is_churn_30_days                         2264
is_churn_60_days                         2264
is_churn_90_days                         2264
rfm_recency_all_time                     2264
rfm_frequency_all_time                   2264
rfm_monetary_all_time                    2264
tenure_all_time                          2264
rfm_recency_30d                          1994
rfm_frequency_30d                        1994
rfm_monetary_30d                         1994
tenure_30d                               1994
rfm_recency_60d                          1731
rfm_frequency_60d                        1731
rfm_monetary_60d                         1731
tenure_60d                               1731
rfm_recency_90d                          1443
rfm_frequency_90d                        1443
rfm_monetary_90d                  

In [97]:
customers_modeling_df.columns

Index(['customer_id', 'signup_date', 'true_lifetime_days', 'termination_date',
       'is_churn_30_days', 'is_churn_60_days', 'is_churn_90_days',
       'rfm_recency_all_time', 'rfm_frequency_all_time',
       'rfm_monetary_all_time', 'tenure_all_time', 'rfm_recency_30d',
       'rfm_frequency_30d', 'rfm_monetary_30d', 'tenure_30d',
       'rfm_recency_60d', 'rfm_frequency_60d', 'rfm_monetary_60d',
       'tenure_60d', 'rfm_recency_90d', 'rfm_frequency_90d',
       'rfm_monetary_90d', 'tenure_90d', 'slope_amount',
       'slope_days_since_previous_transaction',
       'slope_days_until_next_transaction', 'slope_customer_transaction_order',
       'slope_days_since_first_transaction'],
      dtype='object')

In [98]:
check_nan_in_df_cols(customers_modeling_df)

Total features: 28
Information on NaN values
Number of High Proportion Features: 13
Number of Medium Proportion Features: 4
Number of Low Proportion Features: 11


Unnamed: 0,feature,nan_proportion,NaN group
0,slope_days_since_previous_transaction,0.5,High
1,slope_days_until_next_transaction,0.463781,High
2,slope_days_since_first_transaction,0.435954,High
3,slope_customer_transaction_order,0.435954,High
4,slope_amount,0.435954,High
5,tenure_90d,0.362633,High
6,rfm_monetary_90d,0.362633,High
7,rfm_frequency_90d,0.362633,High
8,rfm_recency_90d,0.362633,High
9,rfm_recency_60d,0.235424,High


### Statistics

In [100]:
def get_transaction_statistics_features(customers_df, transactions_df, observed_date, feature_list):

    time_windows = ["all_time", "30d", "60d", "90d"]

    all_stats_df_list = []

    for time_window in time_windows:

        if time_window == "all_time":
            filtered_transactions_df = transactions_df
        else:
            # Limit data to the new cutoff
            days = int(time_window.strip("d"))
            filtered_transactions_df = transactions_df[
                (transactions_df['transaction_date'] <= observed_date - pd.Timedelta(days=days))
            ]

        customers_list = filtered_transactions_df['customer_id'].unique()
        stats_dict = {}

        for customer_id in customers_list:

            customer_transactions = filtered_transactions_df[
                filtered_transactions_df['customer_id'] == customer_id
            ]

            stats_dict[customer_id] = {}

            for feature_name in feature_list:

                y = customer_transactions[feature_name].dropna().values

                if len(y) < 2:
                    # Less than 2 observations -> return NaN for all stats
                    stats_dict[customer_id][f"min_{feature_name}"] = np.nan
                    stats_dict[customer_id][f"mean_{feature_name}"] = np.nan
                    stats_dict[customer_id][f"mode_{feature_name}"] = np.nan
                    stats_dict[customer_id][f"max_{feature_name}"] = np.nan
                    for q in [1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99]:
                        stats_dict[customer_id][f"q{q}_{feature_name}"] = np.nan
                    continue

                # Compute stats
                stats_dict[customer_id][f"min_{feature_name}"] = np.min(y)
                stats_dict[customer_id][f"mean_{feature_name}"] = np.mean(y)

                # Compute mode safely
                mode_result = stats.mode(y, nan_policy='omit')
                if hasattr(mode_result.mode, "__len__"):
                    # old SciPy: mode is array
                    mode_val = mode_result.mode[0] if len(mode_result.mode) > 0 else np.nan
                else:
                    # new SciPy: mode is scalar
                    mode_val = mode_result.mode if mode_result.count > 0 else np.nan

                stats_dict[customer_id][f"mode_{feature_name}"] = mode_val

                stats_dict[customer_id][f"max_{feature_name}"] = np.max(y)

                # Quantiles
                for q in [1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99]:
                    stats_dict[customer_id][f"q{q}_{feature_name}"] = np.percentile(y, q)

        # Convert to dataframe
        stats_df = pd.DataFrame.from_dict(stats_dict, orient='index').reset_index().rename(columns={'index': 'customer_id'})
        all_stats_df_list.append(stats_df)

    # Merge with customers_df (only keep last time_window stats)
    final_stats_df = all_stats_df_list[-1]  # or merge all windows if needed
    customers_df = pd.merge(customers_df, final_stats_df, on='customer_id', how='left')

    return customers_df


In [101]:
customers_modeling_df = get_transaction_statistics_features(
    customers_df=customers_modeling_df,
    transactions_df=transactions_modeling_df,
    observed_date=CUTOFF_TRAINING_DATE,
    feature_list=[
        'amount',
        'days_since_previous_transaction',
        'days_until_next_transaction',
        'customer_transaction_order',
        'days_since_first_transaction'
    ]
)

In [102]:
check_nan_in_df_cols(customers_modeling_df)

Total features: 113
Information on NaN values
Number of High Proportion Features: 98
Number of Medium Proportion Features: 4
Number of Low Proportion Features: 11


Unnamed: 0,feature,nan_proportion,NaN group
0,q60_days_since_previous_transaction,0.5,High
1,q50_days_since_previous_transaction,0.5,High
2,min_days_since_previous_transaction,0.5,High
3,slope_days_since_previous_transaction,0.5,High
4,mean_days_since_previous_transaction,0.5,High
...,...,...,...
108,rfm_recency_all_time,0.0,Low
109,rfm_frequency_all_time,0.0,Low
110,rfm_monetary_all_time,0.0,Low
111,tenure_all_time,0.0,Low


In [232]:
customers_modeling_df.count()

customer_id                         2264
signup_date                         2264
true_lifetime_days                  2264
termination_date                    2264
is_churn_30_days                    2264
                                    ... 
q70_days_since_first_transaction    1476
q80_days_since_first_transaction    1476
q90_days_since_first_transaction    1476
q95_days_since_first_transaction    1476
q99_days_since_first_transaction    1476
Length: 113, dtype: int64

In [233]:
customers_modeling_df.columns

Index(['customer_id', 'signup_date', 'true_lifetime_days', 'termination_date',
       'is_churn_30_days', 'is_churn_60_days', 'is_churn_90_days',
       'rfm_recency_all_time', 'rfm_frequency_all_time',
       'rfm_monetary_all_time',
       ...
       'q20_days_since_first_transaction', 'q30_days_since_first_transaction',
       'q40_days_since_first_transaction', 'q50_days_since_first_transaction',
       'q60_days_since_first_transaction', 'q70_days_since_first_transaction',
       'q80_days_since_first_transaction', 'q90_days_since_first_transaction',
       'q95_days_since_first_transaction', 'q99_days_since_first_transaction'],
      dtype='object', length=113)

In [234]:
customers_modeling_df

Unnamed: 0,customer_id,signup_date,true_lifetime_days,termination_date,is_churn_30_days,is_churn_60_days,is_churn_90_days,rfm_recency_all_time,rfm_frequency_all_time,rfm_monetary_all_time,...,q20_days_since_first_transaction,q30_days_since_first_transaction,q40_days_since_first_transaction,q50_days_since_first_transaction,q60_days_since_first_transaction,q70_days_since_first_transaction,q80_days_since_first_transaction,q90_days_since_first_transaction,q95_days_since_first_transaction,q99_days_since_first_transaction
0,C00000,2025-08-22,204,2026-03-14,0,0,0,1,3,379.90,...,0.8,1.2,1.6,2.0,5.8,9.6,13.4,17.2,19.1,20.62
1,C00001,2025-03-07,365,2026-03-07,0,0,0,21,11,620.79,...,119.0,122.4,129.2,136.0,136.8,137.6,146.0,162.0,170.0,176.40
2,C00002,2025-08-18,48,2025-10-05,1,1,0,6,11,910.64,...,4.0,5.0,8.0,11.0,15.0,15.0,22.0,27.0,32.0,36.00
3,C00004,2025-05-28,113,2025-09-18,0,0,0,18,19,2018.94,...,36.0,43.2,50.8,55.0,59.2,60.0,65.4,83.4,91.4,96.68
4,C00006,2025-08-22,117,2025-12-17,1,1,1,28,1,20.20,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2259,C02990,2025-02-01,307,2025-12-05,1,1,1,4,18,2207.01,...,199.4,211.0,215.2,220.0,225.6,229.4,231.0,232.2,234.6,236.52
2260,C02993,2025-03-01,134,2025-07-13,0,0,0,102,8,1090.93,...,,,,,,,,,,
2261,C02994,2025-01-30,112,2025-05-22,0,0,0,140,20,1474.70,...,,,,,,,,,,
2262,C02996,2025-06-03,308,2026-04-07,1,1,1,4,6,235.07,...,31.4,37.6,48.8,60.0,64.8,69.6,77.6,88.8,94.4,98.88


In [None]:
#customers_modeling_df.to_csv(f"../data/gold/customers_features_{MAX_DATA_DATE.strftime("%d_%m_%Y")}.csv", index=None)

## Activity Trend Features (% Relative Change)

# Data Split

In [103]:
customers_modeling_df = pd.read_csv('../data/gold/customers_features_31_12_2025.csv')

In [104]:
customers_modeling_df = customers_modeling_df.drop(columns=['signup_date', 'true_lifetime_days', 'termination_date'])

In [105]:
X_df = customers_modeling_df.drop(columns=['is_churn_30_days', 'is_churn_60_days', 'is_churn_90_days'])
X_df = X_df.set_index('customer_id', drop=True)

In [106]:
y_df =customers_modeling_df[['customer_id', 'is_churn_30_days', 'is_churn_60_days', 'is_churn_90_days']]
y_df = y_df.set_index('customer_id', drop=True)

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.33, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.33, random_state=42)

# Feature Processing

Available techniques:
- Filter methods: Evaluate feaftures using statistical properties of the data, not model performance.
- Wrapper methods: Use different combination of features to learn an algorithm.
    - Forward selection
    - Backward elimination
    - Recursive feature elimination
- Embedded methods

## Split to Numeric and Categorical

There isn't a numeric feature, I'm just adding it for clarity.

In [108]:
X_train_numeric_df = X_train.select_dtypes(include="number")
X_train_categorical_df = X_train.select_dtypes(exclude="number")

## Impute

Since there are lots of Nans in my data (the Nans actually have meaning though), and I don't want the lack of values to affect my model performance, so I'm imputing them. I'm using a model so the imputation is as similar to the range of each feature as possible.
I'm using an IterativeImputer from sklearn. It:
- Do a random guess for values of NaN cells.
- Pick a feature with NaN and use that as target
- Split the data into two sets:
    - Rows where target feature is non-null (training data)
    - Rows where target feature is null (prediction input)
- Train the regression model
- Predict missing values
- Move to the next column
- Iterate (use new column values to train a new model)
    - Total models p x k
    - p: number of columns with at least 1 NaN
    - k: max_iter in IterativeImputer

In [121]:
numeric_imputer = IterativeImputer(
    estimator=LinearRegression(),
    max_iter=20,
    random_state=42
)

In [122]:
X_train_numeric_imputed = numeric_imputer.fit_transform(X_train_numeric_df)


[IterativeImputer] Early stopping criterion not reached.



In [128]:
X_train_numeric_imputed_df = pd.DataFrame(
    X_train_numeric_imputed,
    columns=X_train_numeric_df.columns,
    index=X_train_numeric_df.index
)

## Scale

In [135]:
scaler = StandardScaler()

In [136]:
X_train_numeric_imputed_scaled = scaler.fit_transform(X_train_numeric_imputed_df)

X_train_numeric_imputed_scaled_df = pd.DataFrame(
    X_train_numeric_imputed_scaled,
    columns=X_train_numeric_df.columns,
    index=X_train_numeric_df.index
)

## Feature Selection

### Filter Methods

#### Variance

In [137]:
variance_selector = VarianceThreshold(threshold=0.5)

In [140]:
X_train_numeric_imputed_scaled_selected = variance_selector.fit_transform(
    X_train_numeric_imputed_scaled_df
)

selected_columns = X_train_numeric_imputed_scaled_df.columns[
    variance_selector.get_support()
]

X_train_numeric_imputed_scaled_selected_df = pd.DataFrame(
    X_train_numeric_imputed_scaled_selected,
    columns=selected_columns,
    index=X_train_numeric_imputed_scaled_df.index
)

In [141]:
print(f"Before: Total {X_train_numeric_imputed_scaled_df.shape[1]} features")
print(f"After:  Total {X_train_numeric_imputed_scaled_selected_df.shape[1]} features")
print(list(X_train_numeric_imputed_scaled_selected_df.columns))

Before: Total 106 features
After:  Total 105 features
['rfm_recency_all_time', 'rfm_frequency_all_time', 'rfm_monetary_all_time', 'tenure_all_time', 'rfm_recency_30d', 'rfm_frequency_30d', 'rfm_monetary_30d', 'tenure_30d', 'rfm_recency_60d', 'rfm_frequency_60d', 'rfm_monetary_60d', 'tenure_60d', 'rfm_recency_90d', 'rfm_frequency_90d', 'rfm_monetary_90d', 'tenure_90d', 'slope_amount', 'slope_days_since_previous_transaction', 'slope_days_until_next_transaction', 'slope_days_since_first_transaction', 'min_amount', 'mean_amount', 'mode_amount', 'max_amount', 'q1_amount', 'q5_amount', 'q10_amount', 'q20_amount', 'q30_amount', 'q40_amount', 'q50_amount', 'q60_amount', 'q70_amount', 'q80_amount', 'q90_amount', 'q95_amount', 'q99_amount', 'min_days_since_previous_transaction', 'mean_days_since_previous_transaction', 'mode_days_since_previous_transaction', 'max_days_since_previous_transaction', 'q1_days_since_previous_transaction', 'q5_days_since_previous_transaction', 'q10_days_since_previous_

#### Correlation

Remove features that:
- Are weakly correlated with the target
- Are highly correlated with other features

In [None]:
#X_train_numeric_scaled_selected_df

Unnamed: 0_level_0,rfm_recency_all_time,rfm_frequency_all_time,rfm_monetary_all_time,tenure_all_time,rfm_recency_30d,rfm_frequency_30d,rfm_monetary_30d,tenure_30d,rfm_recency_60d,rfm_frequency_60d,...,q20_days_since_first_transaction,q30_days_since_first_transaction,q40_days_since_first_transaction,q50_days_since_first_transaction,q60_days_since_first_transaction,q70_days_since_first_transaction,q80_days_since_first_transaction,q90_days_since_first_transaction,q95_days_since_first_transaction,q99_days_since_first_transaction
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C01552,0.383419,0.605712,0.336296,0.346744,,,,,,,...,0.663671,0.604039,0.544525,0.488992,0.407083,0.326258,0.241753,0.162933,0.125281,0.096497
C02842,0.192025,-0.457277,-0.717631,-0.071996,,,,,,,...,0.161872,0.092614,0.024010,-0.039896,-0.098779,-0.154792,-0.213598,-0.267614,-0.293035,-0.312093
C01642,-0.844691,-0.866119,-0.774076,-1.231585,-0.933121,-0.889452,-0.721803,-1.290124,-0.868479,-0.920277,...,,,,,,,,,,
C00191,0.893802,-0.866119,-0.749507,-1.231585,,,,,,,...,,,,,,,,,,
C02329,-0.557600,-0.130204,0.378645,1.957285,1.274180,-0.889452,-0.567288,-1.290124,0.296112,-0.745712,...,1.595585,1.503441,1.446751,1.410280,1.385085,1.397840,1.484212,1.569390,1.610698,1.643504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C02161,0.287722,1.750469,1.535458,1.796231,,,,,,,...,2.251509,2.136549,2.022787,1.913576,1.820126,1.729599,1.630575,1.561416,1.527824,1.501828
C01449,-0.445954,-0.293741,-0.648541,0.588325,2.132574,-0.593561,-0.748244,-0.990408,0.749009,-0.745712,...,0.520300,0.586404,0.648628,0.710783,0.633035,0.555172,0.472682,0.395748,0.358907,0.330693
C01496,1.771023,-0.866119,-0.784490,-1.231585,,,,,,,...,,,,,,,,,,
C01726,-0.844691,2.322848,0.292184,1.119804,-0.933121,0.294113,-0.299526,0.707983,-0.868479,1.872772,...,0.513131,0.561714,0.534115,0.506052,0.528490,0.535267,0.606034,0.816728,0.819844,0.834464


In [None]:
#X_train_numeric_scaled_selected_df.dtypes

rfm_recency_all_time                float64
rfm_frequency_all_time              float64
rfm_monetary_all_time               float64
tenure_all_time                     float64
rfm_recency_30d                     float64
                                     ...   
q70_days_since_first_transaction    float64
q80_days_since_first_transaction    float64
q90_days_since_first_transaction    float64
q95_days_since_first_transaction    float64
q99_days_since_first_transaction    float64
Length: 105, dtype: object

In [None]:
#y_train.notna().sum()

is_churn_30_days    1516
is_churn_60_days    1516
is_churn_90_days    1516
dtype: int64

In [None]:
#X_train_numeric_scaled_selected_df.notna().sum()

rfm_recency_all_time                1516
rfm_frequency_all_time              1516
rfm_monetary_all_time               1516
tenure_all_time                     1516
rfm_recency_30d                      832
                                    ... 
q70_days_since_first_transaction     978
q80_days_since_first_transaction     978
q90_days_since_first_transaction     978
q95_days_since_first_transaction     978
q99_days_since_first_transaction     978
Length: 105, dtype: int64

In [None]:
#X_train_numeric_scaled_selected_df.corr()

Unnamed: 0,rfm_recency_all_time,rfm_frequency_all_time,rfm_monetary_all_time,tenure_all_time,rfm_recency_30d,rfm_frequency_30d,rfm_monetary_30d,tenure_30d,rfm_recency_60d,rfm_frequency_60d,...,q20_days_since_first_transaction,q30_days_since_first_transaction,q40_days_since_first_transaction,q50_days_since_first_transaction,q60_days_since_first_transaction,q70_days_since_first_transaction,q80_days_since_first_transaction,q90_days_since_first_transaction,q95_days_since_first_transaction,q99_days_since_first_transaction
rfm_recency_all_time,1.000000,-0.199550,-0.163298,-0.253660,1.000000,-0.441609,-0.314436,-0.620291,1.000000,-0.410160,...,0.229897,0.201510,0.174516,0.146824,0.122137,0.099150,0.074794,0.053200,0.042634,0.034722
rfm_frequency_all_time,-0.199550,1.000000,0.753543,0.653652,-0.184511,0.601769,0.418042,0.457522,-0.115987,0.711170,...,0.515164,0.530893,0.546672,0.559945,0.570885,0.582224,0.591361,0.599838,0.605326,0.608359
rfm_monetary_all_time,-0.163298,0.753543,1.000000,0.506585,-0.142685,0.506291,0.682329,0.371102,-0.122423,0.612924,...,0.366671,0.379831,0.395639,0.408555,0.418719,0.429175,0.441167,0.450461,0.455933,0.459197
tenure_all_time,-0.253660,0.653652,0.506585,1.000000,0.012854,0.077775,0.026929,0.152526,0.014108,0.221948,...,0.940789,0.952559,0.963079,0.971623,0.979337,0.985871,0.992190,0.997517,0.999308,0.999971
rfm_recency_30d,1.000000,-0.184511,-0.142685,0.012854,1.000000,-0.441609,-0.314436,-0.620291,1.000000,-0.330658,...,0.126640,0.114339,0.106044,0.092897,0.080207,0.070483,0.060413,0.052980,0.048364,0.044895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
q70_days_since_first_transaction,0.099150,0.582224,0.429175,0.985871,0.070483,0.000085,-0.033960,0.021382,0.075627,0.113424,...,0.968685,0.978062,0.986195,0.992853,0.997750,1.000000,0.997288,0.992492,0.989507,0.986696
q80_days_since_first_transaction,0.074794,0.591361,0.441167,0.992190,0.060413,-0.000027,-0.030643,0.026276,0.058807,0.124737,...,0.959923,0.970075,0.979008,0.986417,0.992754,0.997288,1.000000,0.997408,0.995132,0.992880
q90_days_since_first_transaction,0.053200,0.599838,0.450461,0.997517,0.052980,-0.005548,-0.032869,0.025926,0.044451,0.136068,...,0.951203,0.962267,0.972015,0.979879,0.986901,0.992492,0.997408,1.000000,0.999295,0.997975
q95_days_since_first_transaction,0.042634,0.605326,0.455933,0.999308,0.048364,-0.006491,-0.031895,0.029959,0.037083,0.142139,...,0.946204,0.957652,0.967816,0.976040,0.983408,0.989507,0.995132,0.999295,1.000000,0.999555


In [None]:
#X_train_numeric_scaled_selected_df.corrwith(y_train).notna().sum()

0

In [None]:
'''
targets = ['is_churn_30_days', 'is_churn_60_days', 'is_churn_90_days']
for target in targets:
    print(X_train_numeric_scaled_selected_df.corrwith(y_train[target]))
'''

rfm_recency_all_time                0.039826
rfm_frequency_all_time             -0.000051
rfm_monetary_all_time              -0.006197
tenure_all_time                    -0.026362
rfm_recency_30d                     0.050541
                                      ...   
q70_days_since_first_transaction   -0.001580
q80_days_since_first_transaction   -0.004653
q90_days_since_first_transaction   -0.005200
q95_days_since_first_transaction   -0.005860
q99_days_since_first_transaction   -0.005810
Length: 105, dtype: float64
rfm_recency_all_time                0.039009
rfm_frequency_all_time              0.001933
rfm_monetary_all_time              -0.004930
tenure_all_time                    -0.034964
rfm_recency_30d                     0.055615
                                      ...   
q70_days_since_first_transaction   -0.006855
q80_days_since_first_transaction   -0.008585
q90_days_since_first_transaction   -0.008353
q95_days_since_first_transaction   -0.008980
q99_days_since_first_transa

#### Information Gain

Information Gain: measures how much a feature provides about the target variable.
- Higher information gain -> More useful features

In [129]:
from sklearn.feature_selection import mutual_info_classif

In [132]:
target = 'is_churn_30_days'
mutual_info_classif(X_train_numeric_scaled_selected_df, y_train[target], random_state=42)

ValueError: Input X contains NaN.

### Wrapper methods

## Combine to Pipeline

# Train

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [None]:
targets = ['is_churn_30_days', 'is_churn_60_days', 'is_churn_90_days']
models = {}
predictions = {}
scores = {}

for target in targets:
    y_train_target = y_train[target]
    y_test_target = y_test[target]

    model = XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    )
    
    model.fit(X_train, y_train)

    y_pred_prob = model.predict_proba(X_test)

In [None]:

    model.fit(X_train, y_train)
    
    y_pred_prob = model.predict_proba(X_test)[:,1]
    
    # Store results
    models[target] = model
    predictions[target] = y_pred_prob
    scores[target] = roc_auc_score(y_test, y_pred_prob)

# Optional: show AUC scores
print("ROC AUC scores per target:", scores)


Parameters: { "use_label_encoder" } are not used.



Parameters: { "use_label_encoder" } are not used.




ROC AUC scores per target: {'is_churn_30_days': 0.46480858059970254, 'is_churn_60_days': 0.5000689655172413, 'is_churn_90_days': 0.4820922576881407}



Parameters: { "use_label_encoder" } are not used.


