In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier

def load_data(file_path):
    return pd.read_csv(file_path)

In [12]:
def preprocess_data(df):
    df['transaction_amount'].fillna(df['transaction_amount'].median(), inplace=True)
    df.drop(columns=['payer_mobile_anonymous'], inplace=True, errors='ignore')
    df['transaction_date'] = pd.to_datetime(df['transaction_date'], errors='coerce')
    df['transaction_hour'] = df['transaction_date'].dt.hour
    df['transaction_day'] = df['transaction_date'].dt.day
    df['transaction_month'] = df['transaction_date'].dt.month
    df['transaction_day_of_week'] = df['transaction_date'].dt.dayofweek
    df.drop(columns=['transaction_date'], inplace=True)
    return df

# Encode Categorical Features
def encode_categorical(df, categorical_cols):
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le
    return df, label_encoders

# Train XGBoost Model
def train_xgboost(X_train, y_train):
    scale_pos_weight = len(y_train[y_train == 0]) / max(1, len(y_train[y_train == 1]))
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight)
    model.fit(X_train, y_train)
    return model


In [13]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    report = classification_report(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    return report, roc_auc

In [14]:
file_path = "transactions_train.csv" 
df = load_data(file_path)
df = preprocess_data(df)
categorical_cols = ['transaction_channel', 'payer_email_anonymous', 'payee_ip_anonymous', 'transaction_id_anonymous', 'payee_id_anonymous']
df, _ = encode_categorical(df, categorical_cols)

X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
model = train_xgboost(X_train, y_train)
report, roc_auc = evaluate_model(model, X_test, y_test)

print("Classification Report:\n", report)
print("ROC-AUC Score:", roc_auc)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['transaction_amount'].fillna(df['transaction_amount'].median(), inplace=True)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     34584
           1       0.00      0.00      0.00         2

    accuracy                           1.00     34586
   macro avg       0.50      0.50      0.50     34586
weighted avg       1.00      1.00      1.00     34586

ROC-AUC Score: 0.49975422160536664
