## Import Required Libraries

In [6]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


## Load and Inspect the Dataset

In [7]:
df = pd.read_csv("../data/raw/data.csv")

# Convert transaction date to datetime
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

df.head()


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0


## Define a Snapshot Date (Very Important)

In [8]:
snapshot_date = df['TransactionStartTime'].max() + pd.Timedelta(days=1)
snapshot_date


Timestamp('2019-02-14 10:01:28+0000', tz='UTC')

## Calculate RFM Metrics

In [9]:
rfm = df.groupby('CustomerId').agg({
    'TransactionStartTime': lambda x: (snapshot_date - x.max()).days,
    'CustomerId': 'count',
    'Amount': 'sum'
})

rfm.columns = ['Recency', 'Frequency', 'Monetary']
rfm.reset_index(inplace=True)

rfm


Unnamed: 0,CustomerId,Recency,Frequency,Monetary
0,CustomerId_1,84,1,-10000.0
1,CustomerId_10,84,1,-10000.0
2,CustomerId_1001,90,5,20000.0
3,CustomerId_1002,26,11,4225.0
4,CustomerId_1003,12,6,20000.0
...,...,...,...,...
3737,CustomerId_992,5,6,20000.0
3738,CustomerId_993,26,5,20000.0
3739,CustomerId_994,1,101,543873.0
3740,CustomerId_996,68,17,139000.0


## Pre-process RFM Features

In [10]:
scaler = StandardScaler()

rfm_scaled = scaler.fit_transform(
    rfm[['Recency', 'Frequency', 'Monetary']]
)

rfm_scaled = pd.DataFrame(
    rfm_scaled,
    columns=['Recency_scaled', 'Frequency_scaled', 'Monetary_scaled']
)

rfm_scaled.head()


Unnamed: 0,Recency_scaled,Frequency_scaled,Monetary_scaled
0,1.937605,-0.253459,-0.066891
1,1.937605,-0.253459,-0.066891
2,2.158882,-0.212186,-0.055849
3,-0.201408,-0.150278,-0.061655
4,-0.717722,-0.201868,-0.055849


## Cluster Customers Using K-Means

In [11]:
kmeans = KMeans(
    n_clusters=3,
    random_state=42,
    n_init=10
)

rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)
rfm.head()


[WinError 2] The system cannot find the file specified
  File "c:\Users\HP\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\HP\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\HP\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  

Unnamed: 0,CustomerId,Recency,Frequency,Monetary,Cluster
0,CustomerId_1,84,1,-10000.0,0
1,CustomerId_10,84,1,-10000.0,0
2,CustomerId_1001,90,5,20000.0,0
3,CustomerId_1002,26,11,4225.0,1
4,CustomerId_1003,12,6,20000.0,1


## Analyze Clusters to Identify High-Risk Group

In [12]:
cluster_profile = rfm.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']].mean()
cluster_profile


Unnamed: 0_level_0,Recency,Frequency,Monetary
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,61.877279,7.720196,81720.68
1,12.726566,34.8,272574.1
2,29.0,4091.0,-104900000.0


## Identify High-Risk Cluster Programmatically

In [13]:
high_risk_cluster = cluster_profile['Frequency'].idxmin()
high_risk_cluster


np.int32(0)

## Create the Proxy Target Variable

In [14]:
rfm['is_high_risk'] = np.where(
    rfm['Cluster'] == high_risk_cluster, 1, 0
)

rfm[['CustomerId', 'Cluster', 'is_high_risk']]


Unnamed: 0,CustomerId,Cluster,is_high_risk
0,CustomerId_1,0,1
1,CustomerId_10,0,1
2,CustomerId_1001,0,1
3,CustomerId_1002,1,0
4,CustomerId_1003,1,0
...,...,...,...
3737,CustomerId_992,1,0
3738,CustomerId_993,1,0
3739,CustomerId_994,1,0
3740,CustomerId_996,0,1


## Merge Target Variable Back to Main Dataset

In [15]:
df_final = df.merge(
    rfm[['CustomerId', 'is_high_risk']],
    on='CustomerId',
    how='left'
)

df_final


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,is_high_risk
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0,1
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-1000.0,1000,2019-02-13 09:54:09+00:00,2,0,0
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2019-02-13 09:54:25+00:00,2,0,0
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2019-02-13 09:54:35+00:00,2,0,0
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256,ProviderId_6,ProductId_19,tv,ChannelId_3,3000.0,3000,2019-02-13 10:01:10+00:00,2,0,0


In [16]:
# Re-merge target
df = df.merge(
    rfm[['CustomerId', 'is_high_risk']],
    on='CustomerId',
    how='left'
)


In [17]:
df['is_high_risk'].value_counts()


is_high_risk
0    84653
1    11009
Name: count, dtype: int64

In [18]:
required_cols = ['is_high_risk', 'CustomerId']

for col in required_cols:
    assert col in df.columns, f"{col} is missing from dataset"


In [19]:
X = df.drop(columns=['is_high_risk', 'CustomerId'])
y = df['is_high_risk']


In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [33]:
import mlflow
import mlflow.sklearn

# 1️⃣ Set experiment FIRST
mlflow.set_experiment("Credit_Risk_Modeling")

# 2️⃣ Start run
with mlflow.start_run(run_name="Logistic_Regression"):
    mlflow.log_param("model", "logistic_regression")


In [45]:
X.dtypes


CountryCode          int64
Amount             float64
Value                int64
PricingStrategy      int64
FraudResult          int64
dtype: object

In [46]:
X = X.select_dtypes(include=['int64', 'float64'])


In [47]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [48]:
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression(max_iter=1000, random_state=42)
model_lr.fit(X_train, y_train)


In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)

with mlflow.start_run(run_name="Logistic_Regression"):

    model_lr = LogisticRegression(max_iter=1000, random_state=42)
    model_lr.fit(X_train, y_train)

    y_pred = model_lr.predict(X_test)
    y_prob = model_lr.predict_proba(X_test)[:, 1]

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_prob)
    }

    mlflow.log_params(model_lr.get_params())
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(model_lr, "model")

    print(metrics)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'accuracy': 0.88491088694925, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'roc_auc': np.float64(0.5196080892736029)}


In [50]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Identify columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', LogisticRegression(max_iter=1000, random_state=42))
])

pipeline.fit(X_train, y_train)


In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5]
}

rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_
grid_search.best_params_


{'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}

In [52]:
with mlflow.start_run(run_name="Random_Forest_Tuned"):

    best_rf.fit(X_train, y_train)

    y_pred = best_rf.predict(X_test)
    y_prob = best_rf.predict_proba(X_test)[:, 1]

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_prob)
    }

    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(best_rf, "model")

    print(metrics)




{'accuracy': 0.8853812784194847, 'precision': 0.5412844036697247, 'recall': 0.026793823796548592, 'f1': 0.05106014712245781, 'roc_auc': np.float64(0.6151979737601423)}


In [53]:
import mlflow

mlflow.set_experiment("Credit_Risk_Modeling")

with mlflow.start_run() as run:
    mlflow.log_param("param1", 5)
    mlflow.log_metric("metric1", 0.9)
    print("Run ID:", run.info.run_id)


Run ID: a0437b9a7cff4de08a29472184f0e78f


In [61]:
import mlflow

mlflow.set_experiment("Credit_Risk_Modeling")

with mlflow.start_run() as run:
    mlflow.log_param("param1", 5)
    mlflow.log_metric("metric1", 0.9)
    print("Run ID:", run.info.run_id)


Run ID: 4a5f35ef02ae475795fc4838c1ec5726


In [58]:
mlflow.set_experiment("Credit_Risk_Modeling")


<Experiment: artifact_location='file:c:/absolute/path/to/mlruns/216373809357627911', creation_time=1765788532872, experiment_id='216373809357627911', last_update_time=1765788532872, lifecycle_stage='active', name='Credit_Risk_Modeling', tags={}>

In [60]:
print(mlflow.get_tracking_uri())


file:///absolute/path/to/mlruns


In [62]:
with mlflow.start_run():
    mlflow.log_param("param1", 5)
    mlflow.log_metric("metric1", 0.9)
    print("R4a5f35ef02ae475795fc4838c1ec5726", mlflow.active_run().info.run_id)


R4a5f35ef02ae475795fc4838c1ec5726 1e02c0e50369480195f6abf5673863c1
