In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, cross_val_predict 
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import cleanlab

import random
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from cleanlab import Datalab
import warnings
warnings.filterwarnings("ignore")

In [25]:
# Single place to edit when the data lives elsewhere; every artifact below is
# read from this directory.
DATA_DIR = "/Users/marianamatos/ppgti/data-centric-ia/weak-supervised-fraud-detection/data"

# NOTE: unpickling can execute arbitrary code, so only point DATA_DIR at files
# produced by this project.
X_encoded = pd.read_pickle(f"{DATA_DIR}/X_encoded.pkl")  # preprocessed feature matrix used for modelling
y = pd.read_pickle(f"{DATA_DIR}/y.pkl")                  # labels
X_raw = pd.read_pickle(f"{DATA_DIR}/df_ini.pkl")         # frame used further down to display flagged rows

# y_test = pd.read_pickle(f"{DATA_DIR}/y_test.pkl")

## Obtain model prediction

We'll train a classifier with cross-validation to obtain out-of-sample predictions.

Cleanlab found 2440 issues in total, roughly 10% of the dataset; 504 of them are label errors.

In [None]:
# Classifier whose cross-validated probabilities are passed to cleanlab.
# The random state is pinned so reruns use the same seed.
clf = LogisticRegression(
    random_state=37,
    max_iter=1000,
)

In [None]:
# Cross-validated class probabilities for every row of the feature matrix.
num_crossval_folds = 5
pred_probs = cross_val_predict(
    estimator=clf,
    X=X_encoded,  # preprocessed feature matrix
    y=y,          # labels
    cv=num_crossval_folds,
    method="predict_proba",
)

print("Shape of predicted probabilities:", pred_probs.shape)

Shape of predicted probabilities: (23633, 2)


In [19]:
# Nearest-neighbour index over the encoded features, using Euclidean distance.
knn = NearestNeighbors(metric="euclidean").fit(X_encoded.values)

# Neighbour graph whose edge values are distances (mode="distance"); it is
# handed to Datalab in the audit cell.
knn_graph = knn.kneighbors_graph(mode="distance")


In [20]:
# Bundle the encoded features and the labels; "y" is the key Datalab treats as
# the label column.
data = {"X": X_encoded.values, "y": y}
lab = Datalab(data, label_name="y")

# Run the audit using the cross-validated probabilities and the precomputed
# nearest-neighbour graph.
lab.find_issues(pred_probs=pred_probs, knn_graph=knn_graph)

Finding label issues ...
Finding outlier issues ...
Finding near_duplicate issues ...
Finding non_iid issues ...
Finding class_imbalance issues ...
Finding underperforming_group issues ...

Audit complete. 2440 issues found in the dataset.


In [21]:
# Print Datalab's summary of the issues found by the audit above.
lab.report()

Dataset Information: num_examples: 23633, num_classes: 2

Here is a summary of various issues found in your data:

    issue_type  num_issues
near_duplicate        1873
         label         504
       outlier          62
       non_iid           1

Learn about each issue: https://docs.cleanlab.ai/stable/cleanlab/datalab/guide/issue_type_description.html
See which examples in your dataset exhibit each issue via: `datalab.get_issues(<ISSUE_NAME>)`

Data indices corresponding to top examples of each issue are shown below.


------------------ near_duplicate issues -------------------

About this issue:
	A (near) duplicate issue refers to two or more examples in
    a dataset that are extremely similar to each other, relative
    to the rest of the dataset.  The examples flagged with this issue
    may be exactly duplicated, or lie atypically close together when
    represented as vectors (i.e. feature embeddings).
    

Number of examples with this issue: 1873
Overall dataset quality in

In the cases that were labeled as fraud (given_label = 1) but that the cleanlab model believes should be non-fraud (predicted_label = 0), we observed:


* Old accounts (is_new_account = 0) in all cases
* Low transactional values (cat_transaction_amount = 0), which can be typical of legitimate transactions, as frauds often vary more or target high values
* Several examples with more common devices (e.g., device_used = 0), which reduces suspicion
* Non-repeated addresses and unique IPs, which also tend to indicate legitimate transactions, given that our initial analysis found a few cases where the addresses were duplicated.

According to the dataset's pattern, these transactions seem to fit well within the profile of normal transactions.  This suggests that the label is_fraudulent = 1 may have been applied erroneously.

In [None]:
# Per-example label diagnostics produced by the Datalab audit
label_issues = lab.get_issues("label")

# Keep only the rows whose is_label_issue flag is True, then preview them
label_issues_filtered = label_issues.loc[label_issues["is_label_issue"].eq(True)]
label_issues_filtered.head()

     is_label_issue  label_score  given_label  predicted_label
36             True     0.059143            1                0
115            True     0.036079            1                0
169            True     0.058270            1                0
204            True     0.042590            1                0
317            True     0.014400            1                0


In [None]:
# Order the examples by ascending label_score (most likely label errors first).
# NOTE(review): these index labels are used as row positions with .iloc below,
# which assumes label_issues carries a default 0..n-1 index — confirm.
sorted_issues = label_issues.sort_values("label_score").index

# View the most likely label errors.
# The label columns are attached as plain arrays: assign() aligns Series on
# index labels, and the index of label_issues need not match X_raw's, which
# would silently yield NaN or mismatched labels in the displayed rows.
X_raw.iloc[sorted_issues].assign(
    given_label=np.ravel(y.iloc[sorted_issues]),
    predicted_label=label_issues["predicted_label"].iloc[sorted_issues].to_numpy(),
).head()

Unnamed: 0,payment_method,product_category,quantity,device_used,is_fraudulent,common_customer_location,address_ip_relationship,is_ip_address_double,shipping_address_repeated,billing_address_repeated,day_of_week,month,cat_customer_age,cat_transaction_hour,is_new_account,cat_transaction_amount,given_label,predicted_label
6330,1,3,0.704005,0,1,0,0,0,False,False,3,2,2,0,0,0,1,0.0
9341,0,4,1.408397,2,1,0,0,0,False,False,2,0,2,2,0,0,1,0.0
16162,0,2,-1.409172,2,1,1,0,0,False,False,4,1,1,1,0,0,1,0.0
2077,0,0,1.408397,0,1,1,0,0,False,False,5,3,0,0,0,0,1,0.0
7373,0,0,-1.409172,0,1,0,0,0,False,False,6,0,2,1,0,0,1,0.0


outlier

These transactions were classified as very different from the rest of the data:

* New accounts (is_new_account = 1) in all cases — typical of fraudulent behavior
* Some with atypical amounts (very low or very high)
* One or more with a medium-high transactional value (e.g., cat_transaction_amount = 2), indicating possible sophisticated frauds


In [27]:
# Per-example outlier diagnostics produced by the Datalab audit
outlier_results = lab.get_issues("outlier")

# Index of the examples ordered by ascending outlier_score
sorted_outliers = outlier_results.sort_values(by="outlier_score").index

# Show the five lowest-scoring rows from the raw frame
X_raw.iloc[sorted_outliers[:5]]

Unnamed: 0,payment_method,product_category,quantity,device_used,is_fraudulent,common_customer_location,address_ip_relationship,is_ip_address_double,shipping_address_repeated,billing_address_repeated,day_of_week,month,cat_customer_age,cat_transaction_hour,is_new_account,cat_transaction_amount
17581,0,3,-1.409172,0,0,0,1,0,False,False,5,2,4,1,1,0
13952,1,3,1.408397,1,0,0,0,0,False,False,6,1,1,2,1,2
16507,2,1,-0.000387,0,0,1,1,0,False,False,6,0,4,0,1,1
12508,1,3,-0.70478,1,0,0,1,0,False,False,0,1,3,1,1,0
5171,3,1,-1.409172,1,0,1,1,0,False,False,4,2,1,1,1,2


In [28]:
# Per-example near-duplicate diagnostics, previewed from the lowest score up
duplicate_results = lab.get_issues("near_duplicate")
duplicate_results.sort_values(by="near_duplicate_score", ascending=True).head(5)


Unnamed: 0,is_near_duplicate_issue,near_duplicate_score,near_duplicate_sets,distance_to_nearest_neighbor
0,True,0.0,[4067],0.0
17845,True,0.0,[636],0.0
17839,True,0.0,[10991],0.0
4101,True,0.0,[1667],0.0
4103,True,0.0,[173],0.0


In [29]:
# Row holding the minimum near_duplicate_score
lowest_scoring_duplicate = duplicate_results["near_duplicate_score"].idxmin()

# That row first, followed by every member of its near-duplicate set
indices_to_display = [
    lowest_scoring_duplicate,
    *duplicate_results.loc[lowest_scoring_duplicate, "near_duplicate_sets"].tolist(),
]

# Show the matching rows of the original dataset side by side
X_raw.iloc[indices_to_display]

Unnamed: 0,payment_method,product_category,quantity,device_used,is_fraudulent,common_customer_location,address_ip_relationship,is_ip_address_double,shipping_address_repeated,billing_address_repeated,day_of_week,month,cat_customer_age,cat_transaction_hour,is_new_account,cat_transaction_amount
0,2,1,-1.409172,0,0,1,0,0,False,False,6,2,3,2,0,0
4067,2,1,-1.409172,0,0,1,0,0,False,False,6,2,3,2,0,0


In [None]:
# Minimum-score row once the rows already displayed have been excluded
second_lowest_scoring_duplicate = (
    duplicate_results["near_duplicate_score"]
    .drop(index=indices_to_display)
    .idxmin()
)

# That row first, followed by every member of its near-duplicate set
next_indices_to_display = [
    second_lowest_scoring_duplicate,
    *duplicate_results.loc[second_lowest_scoring_duplicate, "near_duplicate_sets"].tolist(),
]
X_raw.iloc[next_indices_to_display]


Unnamed: 0,payment_method,product_category,quantity,device_used,is_fraudulent,common_customer_location,address_ip_relationship,is_ip_address_double,shipping_address_repeated,billing_address_repeated,day_of_week,month,cat_customer_age,cat_transaction_hour,is_new_account,cat_transaction_amount
21,0,1,-1.409172,0,0,1,0,0,False,False,4,1,2,2,1,1
408,0,1,-1.409172,0,0,1,0,0,False,False,4,1,2,2,1,1


Conclusion

We were able to identify problematic labels inside this artificially created fraud detection dataset, revealing inconsistencies that could undermine model performance if left unaddressed. Specifically, Cleanlab flagged a substantial number of near-duplicate entries, likely stemming from synthetic data generation, and identified label issues where transactions labeled as fraudulent closely resembled legitimate behavior — such as low transaction amounts, non-repeated addresses, and established user accounts. Additionally, outlier transactions associated with new accounts and atypical patterns may reflect complex fraud strategies that traditional labeling methods missed. These findings highlight the presence of weak supervision and the risk of training models on noisy data. 



Next Step:

Retrain models using the corrected labels suggested by Cleanlab, remove duplicate records, and incorporate anomaly detection techniques to better capture underrepresented or sophisticated fraud patterns. This combined approach is essential to building a more reliable and robust fraud detection system.