In [3]:
from sklearn.cluster import KMeans
import pandas as pd

def create_proxy_variable(df, snapshot_date="2025-06-30", n_clusters=3, random_state=42):
    """Build per-customer RFM features and a binary high-risk proxy label.

    Transactions are aggregated per ``CustomerId`` into Recency (days between
    ``snapshot_date`` and the customer's most recent transaction), the summed
    ``Transaction_Count`` and the summed ``Total_Transaction_Amount``. The
    three features are z-score standardized and clustered with K-Means. The
    cluster whose centroid is the least engaged (highest recency, lowest
    frequency and monetary value) is flagged as high-risk.

    The input frame is not modified.

    Args:
        df: DataFrame containing the columns ``CustomerId``,
            ``TransactionStartTime``, ``Transaction_Count`` and
            ``Total_Transaction_Amount``.
        snapshot_date: Reference date for Recency; anything accepted by
            ``pd.to_datetime``. Naive values are interpreted as UTC.
        n_clusters: Number of K-Means clusters.
        random_state: Seed passed to K-Means for reproducible clustering.

    Returns:
        DataFrame with one row per customer and the columns ``CustomerId``,
        ``Recency``, ``Transaction_Count``, ``Total_Transaction_Amount``,
        ``Cluster`` and ``is_high_risk`` (1 for the high-risk cluster, else 0).

    Raises:
        KeyError: If any required column is missing from ``df``.
    """
    # Check if necessary columns are available
    required_columns = ['CustomerId', 'TransactionStartTime', 'Transaction_Count', 'Total_Transaction_Amount']
    for col in required_columns:
        if col not in df.columns:
            raise KeyError(f"The column '{col}' is missing from the DataFrame.")

    feature_columns = ['Recency', 'Transaction_Count', 'Total_Transaction_Amount']

    # Normalize both sides to UTC so tz-aware timestamps (e.g. ISO strings
    # ending in "Z") can be subtracted from the snapshot without a TypeError.
    snapshot = pd.to_datetime(snapshot_date, utc=True)
    timestamps = pd.to_datetime(df['TransactionStartTime'], utc=True)
    recency = (snapshot - timestamps).dt.days

    # Work on a narrow copy so the caller's DataFrame does not gain a
    # 'Recency' column as a side effect.
    working = df[['CustomerId', 'Transaction_Count', 'Total_Transaction_Amount']].assign(Recency=recency)

    # Group by CustomerId and aggregate RFM metrics. 'min' recency is the
    # number of days since the customer's most recent transaction.
    # NOTE(review): summing Transaction_Count assumes it is a per-row count,
    # not a per-customer total repeated on every row - confirm upstream.
    rfm = working.groupby('CustomerId').agg({
        'Recency': 'min',
        'Transaction_Count': 'sum',
        'Total_Transaction_Amount': 'sum'
    }).reset_index()

    # Z-score standardization. A constant (or single-row) feature has a zero
    # or NaN standard deviation; dividing by 1 instead keeps it at 0 rather
    # than producing NaN, which K-Means rejects.
    features = rfm[feature_columns]
    spread = features.std()
    spread = spread.where(spread > 0, 1.0)
    rfm_scaled = (features - features.mean()) / spread

    # K-Means Clustering. n_init is pinned so results do not depend on the
    # installed scikit-learn version's default.
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

    # Define High-Risk Label: pick the least engaged cluster from the
    # centroids (columns follow feature_columns order). Cluster size says
    # nothing about engagement, so it is not used here.
    centers = kmeans.cluster_centers_
    engagement = centers[:, 1] + centers[:, 2] - centers[:, 0]
    high_risk_cluster = int(engagement.argmin())
    rfm['is_high_risk'] = (rfm['Cluster'] == high_risk_cluster).astype(int)

    return rfm

if __name__ == "__main__":
    # Script entry point: read the processed transactions, derive the
    # per-customer proxy label, and write the result next to the input file.
    input_path = '../data/processed/processed_data.csv'
    output_path = '../data/processed/proxy_data.csv'

    transactions = pd.read_csv(input_path)
    create_proxy_variable(transactions).to_csv(output_path, index=False)

KeyError: "The column 'CustomerId' is missing from the DataFrame."