In [50]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

cust = pd.read_csv("customers.csv")
prod = pd.read_csv("products.csv")
orders = pd.read_csv("orders.csv")

In [51]:
orders

Unnamed: 0,customer_id,product_id,is_fraud
0,1,108220,0
1,1,105636,0
2,2,104923,0
3,3,101955,0
4,3,108117,0
...,...,...,...
102073,10000,102897,0
102074,10000,102637,0
102075,10000,103859,0
102076,10000,108829,0


In [52]:
cust

Unnamed: 0,customer_id,is_consumer,is_bad_actor
0,1,1,0
1,2,1,0
2,3,1,0
3,4,1,0
4,5,1,0
...,...,...,...
9995,9996,0,0
9996,9997,1,0
9997,9998,1,0
9998,9999,1,0


In the way we constructed the example, we don't have a lot of direct features for XGBoost to exploit. Products don't have any feature, and the only feature available for consumers is whether they are consumers or not. Without any feature engineering, we're doomed.

In [55]:
orders_ext = orders.merge(cust, on=['customer_id'], how="inner", suffixes=[None, "_2"])
orders_ext = orders_ext[["customer_id", "is_consumer", "product_id", "is_fraud"]]


orders_ext

Unnamed: 0,customer_id,is_consumer,product_id,is_fraud
0,1,1,108220,0
1,1,1,105636,0
2,2,1,104923,0
3,3,1,101955,0
4,3,1,108117,0
...,...,...,...,...
102073,10000,0,102897,0
102074,10000,0,102637,0
102075,10000,0,103859,0
102076,10000,0,108829,0


In [56]:
X = orders_ext[['is_consumer']]
y = orders_ext[['is_fraud']]

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [58]:
xgb_c = xgb.XGBClassifier()

In [59]:
xgb_c.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [60]:
preds = xgb_c.predict(X_test)

In [61]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92     17450
           1       0.00      0.00      0.00      2966

    accuracy                           0.85     20416
   macro avg       0.43      0.50      0.46     20416
weighted avg       0.73      0.85      0.79     20416



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


If we knew that bad actors are disguising as consumers and buying as business customers, it would be very easy to identify fraud by observing the ordering behavior. For this, we're going to create a variable that aggregates how many purchases we have for each customer.

In [62]:
orders_by_cust = orders.groupby('customer_id').count().reset_index()[['customer_id', 'product_id']]
orders_by_cust.rename(columns={'product_id': 'purchases'}, inplace=True)
orders_by_cust

Unnamed: 0,customer_id,purchases
0,1,2
1,2,1
2,3,4
3,4,3
4,5,2
...,...,...
9792,9996,41
9793,9997,8
9794,9998,3
9795,9999,4


In [64]:
orders_ext2 = orders_ext.merge(orders_by_cust, on=['customer_id'], how="inner", suffixes=[None, "_2"])
orders_ext2 = orders_ext2[["customer_id", "is_consumer", "product_id", "is_fraud", "purchases"]]
orders_ext2

Unnamed: 0,customer_id,is_consumer,product_id,is_fraud,purchases
0,1,1,108220,0,2
1,1,1,105636,0,2
2,2,1,104923,0,1
3,3,1,101955,0,4
4,3,1,108117,0,4
...,...,...,...,...,...
102073,10000,0,102897,0,44
102074,10000,0,102637,0,44
102075,10000,0,103859,0,44
102076,10000,0,108829,0,44


In [66]:
X = orders_ext2[['is_consumer', 'purchases']]
y = orders_ext2[['is_fraud']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
xgb_c.fit(X_train, y_train)
preds = xgb_c.predict(X_test)
print(classification_report(y_test, preds))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


              precision    recall  f1-score   support

           0       0.99      1.00      1.00     17275
           1       0.99      0.97      0.98      3141

    accuracy                           0.99     20416
   macro avg       0.99      0.98      0.99     20416
weighted avg       0.99      0.99      0.99     20416

