## Model


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


#### Preparation

In [2]:
# Read the preprocessed client table from disk.
df = pd.read_csv("data/data_processed/client_data.csv")

# Separate the label column from the feature columns (df itself is left intact).
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Class imbalance: number of label-0 rows per label-1 row in the training labels.
train_class_counts = y_train.value_counts()
scale_pos_weight = train_class_counts[0] / train_class_counts[1]


#### Stage 1 XGBoost

In [6]:
# stage one xgboost model
model_stage1 = XGBClassifier(
    n_estimators=500,
    max_depth=5,           # shallow trees to limit overfitting
    learning_rate=0.05,    # slow learning
    subsample=0.8,         # row subsampling (randomness)
    colsample_bytree=0.8,  # column subsampling per tree
    scale_pos_weight=scale_pos_weight,  # adjust based on fraud ratio
    eval_metric="aucpr",
)
# training
model_stage1.fit(X_train, y_train)

# probability of the positive (fraud) class for every test row
fraud_probs_stage1 = model_stage1.predict_proba(X_test)[:, 1]

# Rows whose stage-1 fraud probability is strictly above this threshold are
# handed to the second-stage model. The value is 0.30 (not 0.50): it is a
# deliberately lower bar than the default predict() cut-off.
STAGE1_SUSPICION_THRESHOLD = 0.30

# Build the mask once so features and labels are guaranteed to be filtered
# with exactly the same rows.
is_suspicious = fraud_probs_stage1 > STAGE1_SUSPICION_THRESHOLD
suspicious_cases = X_test[is_suspicious]
y_suspicious = y_test[is_suspicious]

In [4]:
# Stage 1: per-class precision / recall / f1 on the held-out test split,
# using the model's default predict() labels.
stage1_test_preds = model_stage1.predict(X_test)
print(classification_report(y_test, stage1_test_preds))

              precision    recall  f1-score   support

         0.0       0.97      0.70      0.81     38334
         1.0       0.12      0.66      0.20      2314

    accuracy                           0.70     40648
   macro avg       0.54      0.68      0.51     40648
weighted avg       0.92      0.70      0.78     40648



In [5]:
# Stage 1: confusion matrix on the held-out test split
# (rows = true label, columns = predicted label).
stage1_test_preds = model_stage1.predict(X_test)
print(confusion_matrix(y_test, stage1_test_preds))

[[26813 11521]
 [  784  1530]]


#### Stage 2 Decision tree

In [7]:
# decision tree for second stage
# random_state is fixed so that re-running the cell rebuilds the same tree;
# without it scikit-learn may break ties between equally good splits
# differently from run to run.
dt_model_stage2 = DecisionTreeClassifier(max_depth=10, random_state=42)
dt_model_stage2.fit(suspicious_cases, y_suspicious)

# Positive-class probability for every suspicious row.
# NOTE(review): these are computed on the same rows the tree was just fitted
# on, so they (and any metrics built from them) are in-sample figures, not an
# estimate of performance on unseen data.
fraud_probs_stage2 = dt_model_stage2.predict_proba(suspicious_cases)[:, 1]

# categorize risk
def risk_category(prob, high_threshold=0.7, medium_threshold=0.4):
    """Map a fraud probability to a coarse risk label.

    Parameters
    ----------
    prob : float
        Probability to categorize.
    high_threshold : float, default 0.7
        Values strictly greater than this are labelled "High Risk".
    medium_threshold : float, default 0.4
        Values strictly greater than this (and not above ``high_threshold``)
        are labelled "Medium Risk".

    Returns
    -------
    str
        "High Risk", "Medium Risk" or "Low Risk". Anything that fails both
        comparisons (including NaN) is "Low Risk".

    Raises
    ------
    ValueError
        If ``medium_threshold`` is greater than ``high_threshold``; the
        "Medium Risk" band would be empty and the labels meaningless.
    """
    if medium_threshold > high_threshold:
        raise ValueError(
            "medium_threshold must not exceed high_threshold "
            f"(got medium_threshold={medium_threshold}, high_threshold={high_threshold})"
        )
    # Comparisons are strict: a value exactly on a threshold falls into the
    # lower band.
    if prob > high_threshold:
        return "High Risk"
    if prob > medium_threshold:
        return "Medium Risk"
    return "Low Risk"

# Label every suspicious row with its risk band. assign() returns a new frame,
# so suspicious_cases itself is not modified.
risk_levels = list(map(risk_category, fraud_probs_stage2))
suspicious_cases_output = suspicious_cases.assign(**{"Risk Level": risk_levels})


In [8]:
# Keep only the rows labelled "High Risk" and preview the first few.
is_high_risk = suspicious_cases_output["Risk Level"] == "High Risk"
high_risk_cases = suspicious_cases_output[is_high_risk]
high_risk_cases.head()

Unnamed: 0,disrict,client_catg,region,1transactions_count,consommation_level_1_mean,consommation_level_2_mean,consommation_level_3_mean,consommation_level_4_mean,Risk Level
74207,69,11,103,67,646.014925,147.223881,153.328358,260.134328,High Risk
69190,62,11,304,20,769.35,745.6,243.1,0.0,High Risk
74113,69,51,104,66,374.69697,66.666667,132.348485,1094.560606,High Risk
39184,69,11,103,16,607.5,15.125,25.0,25.25,High Risk
70978,69,11,103,17,1066.882353,2224.411765,345.235294,1054.352941,High Risk


In [9]:
# Stage 2 (decision tree): per-class precision / recall / f1 on the
# suspicious subset.
dt_stage2_preds = dt_model_stage2.predict(suspicious_cases)
print(classification_report(y_suspicious, dt_stage2_preds))

              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96     23173
         1.0       0.84      0.13      0.23      2068

    accuracy                           0.93     25241
   macro avg       0.88      0.56      0.59     25241
weighted avg       0.92      0.93      0.90     25241



In [10]:
# Stage 2 (decision tree): confusion matrix on the suspicious subset
# (rows = true label, columns = predicted label).
dt_stage2_preds = dt_model_stage2.predict(suspicious_cases)
print(confusion_matrix(y_suspicious, dt_stage2_preds))

[[23121    52]
 [ 1799   269]]


#### Stage 2 Logistic regression

In [11]:
# logistic regression for second stage
lr_model_stage2 = LogisticRegression(
    C=0.1,
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
    max_iter=500,
    random_state=42,  # seed the solver so repeated runs give the same fit
)

lr_model_stage2.fit(suspicious_cases, y_suspicious)

# Positive-class probability for every suspicious row. Kept in a variable:
# as a bare mid-cell expression the result was computed and then thrown away.
# NOTE(review): scored on the same rows the model was fitted on, so this and
# the report below are in-sample figures.
fraud_probs_stage2_lr = lr_model_stage2.predict_proba(suspicious_cases)[:, 1]

#classification report for stage 2
print(classification_report(y_suspicious, lr_model_stage2.predict(suspicious_cases)))

              precision    recall  f1-score   support

         0.0       0.94      0.61      0.74     23173
         1.0       0.11      0.55      0.19      2068

    accuracy                           0.61     25241
   macro avg       0.53      0.58      0.46     25241
weighted avg       0.87      0.61      0.70     25241



In [12]:
# Stage 2 (logistic regression): confusion matrix on the suspicious subset
# (rows = true label, columns = predicted label).
lr_stage2_preds = lr_model_stage2.predict(suspicious_cases)
print(confusion_matrix(y_suspicious, lr_stage2_preds))

[[14248  8925]
 [  937  1131]]


### Testing on new data

In [None]:
# Disabled template for scoring a new data file with the two-stage pipeline.
# NOTE(review): it will not run as written if uncommented:
#   - `model_stage2` is not defined in this notebook; the fitted second-stage
#     models are `dt_model_stage2` and `lr_model_stage2`.
#   - `new_suspicious_cases` is a boolean-mask slice of `new_data`; adding the
#     "Risk Level" column to it directly may trigger pandas'
#     SettingWithCopyWarning. Consider taking a `.copy()` first.
# new_data = pd.read_csv("new_test_data.csv")

# # First stage
# new_fraud_probs_stage1 = model_stage1.predict_proba(new_data)[:, 1]
# new_suspicious_cases = new_data[new_fraud_probs_stage1 > 0.3]

# # second stage
# new_fraud_probs_stage2 = model_stage2.predict_proba(new_suspicious_cases)[:, 1]
# new_suspicious_cases["Risk Level"] = [risk_category(p) for p in new_fraud_probs_stage2]

# # printing suspicious cases
# new_suspicious_cases.to_csv("data/output/sunsafe_risk_predictions.csv", index=False)
# print("printed suspicious cases to sunsafe_risk_predictions.csv")