In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


In [4]:
# Step 0: Import libraries and load data
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Read the raw transactions CSV (adjust the relative path if the data lives
# elsewhere) and parse 'TransactionStartTime' into datetimes in the same
# chained expression, so every later cell can do date arithmetic on it.
df_transactions = (
    pd.read_csv('../data/raw/data.csv')
    .assign(
        TransactionStartTime=lambda frame: pd.to_datetime(frame['TransactionStartTime'])
    )
)

# Now you can paste Step 1 and continue from there


In [5]:
import pandas as pd
import numpy as np

# Build the per-customer RFM table.
#
# The datetime conversion must happen BEFORE the snapshot date is derived:
# on a string column `.max()` is a lexical maximum and adding a Timedelta to
# it raises, so this cell would fail unless an earlier cell had already
# converted the column. Converting an already-datetime column is a no-op.
df_transactions['TransactionStartTime'] = pd.to_datetime(df_transactions['TransactionStartTime'])

# Snapshot date: one day after the most recent transaction, so the most
# recently active customer has a Recency of at least 1 day.
snapshot_date = df_transactions['TransactionStartTime'].max() + pd.Timedelta(days=1)

# Named aggregation produces the final column names directly, replacing the
# separate aggregate-then-rename step.
rfm = (
    df_transactions
    .groupby('CustomerId')
    .agg(
        # Recency: whole days between the snapshot and the customer's last transaction
        Recency=('TransactionStartTime', lambda ts: (snapshot_date - ts.max()).days),
        # Frequency: number of transactions recorded for the customer
        Frequency=('TransactionId', 'count'),
        # Monetary: signed sum of 'Amount' (can be negative, as the output below shows)
        Monetary=('Amount', 'sum'),
    )
    .reset_index()
)

rfm.head()


Unnamed: 0,CustomerId,Recency,Frequency,Monetary
0,CustomerId_1,84,1,-10000.0
1,CustomerId_10,84,1,-10000.0
2,CustomerId_1001,90,5,20000.0
3,CustomerId_1002,26,11,4225.0
4,CustomerId_1003,12,6,20000.0


In [6]:
from sklearn.preprocessing import StandardScaler

# Standardise the three RFM columns so that no single metric dominates the
# distance computation in the clustering step. The scaler is fitted first and
# kept in `scaler`; the same columns are then transformed with it.
scaler = StandardScaler().fit(rfm[['Recency', 'Frequency', 'Monetary']])
rfm_scaled = scaler.transform(rfm[['Recency', 'Frequency', 'Monetary']])


In [7]:
from sklearn.cluster import KMeans

# Segment customers into three clusters on the standardised RFM features.
#
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' (a single run for the default k-means++ init) in version 1.4, and
# versions 1.2-1.3 emit a FutureWarning when it is omitted. Fixing it together
# with random_state keeps the segmentation reproducible across library versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Store each customer's cluster label alongside their RFM values.
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)


In [8]:
# Mean Recency / Frequency / Monetary per cluster, printed for inspection.
cluster_summary = rfm.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']].mean()
print(cluster_summary)

# The high-risk (disengaged) cluster is the one that is jointly the least
# recent, least frequent and lowest spending.
#
# A plain multi-key sort is lexicographic: Frequency and Monetary would only
# matter when two clusters have exactly equal mean Recency, which practically
# never happens for floating-point means. Instead, rank the clusters on each
# metric (rank 1 = riskiest on that metric) and sum the ranks, so all three
# metrics contribute. The original Recency/Frequency/Monetary ordering is kept
# as a deterministic tie-break when rank sums are equal.
high_risk_cluster = (
    cluster_summary
    .assign(
        risk_rank=lambda s: (
            s['Recency'].rank(ascending=False)     # higher recency  -> riskier
            + s['Frequency'].rank(ascending=True)  # lower frequency -> riskier
            + s['Monetary'].rank(ascending=True)   # lower monetary  -> riskier
        )
    )
    .sort_values(
        ['risk_rank', 'Recency', 'Frequency', 'Monetary'],
        ascending=[True, False, True, True],
    )
    .index[0]
)

# Binary target column: 1 for customers in the high-risk cluster, 0 otherwise.
rfm['is_high_risk'] = (rfm['Cluster'] == high_risk_cluster).astype(int)


           Recency    Frequency      Monetary
Cluster                                      
0        61.859846     7.726699  8.172379e+04
1        29.000000  4091.000000 -1.049000e+08
2        12.716076    34.807692  2.726546e+05


In [11]:
# Load the customer feature table from disk. Alternatively, regenerate
# customer_features by re-running the Task 3 pipelines.
# NOTE(review): this path is the same raw CSV that was read into
# df_transactions earlier, so the frame loaded here looks transaction-level
# rather than a saved per-customer feature table -- confirm the intended file.
customer_features = pd.read_csv(filepath_or_buffer='../data/raw/data.csv')


In [12]:
# Attach the binary risk label to the feature table. A left join keeps every
# row of customer_features; rows whose CustomerId has no match in rfm would
# get a missing is_high_risk value.
final_df = pd.merge(
    customer_features,
    rfm.loc[:, ['CustomerId', 'is_high_risk']],
    how='left',
    on='CustomerId',
)

# Quick look at the key and the newly attached label.
final_df.loc[:, ['CustomerId', 'is_high_risk']].head()


Unnamed: 0,CustomerId,is_high_risk
0,CustomerId_4406,0
1,CustomerId_4406,0
2,CustomerId_4683,1
3,CustomerId_988,0
4,CustomerId_988,0
