XGBoost

In [1]:
pip install xgboost imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: xgboost, imbalanced-learn
Successfully installed imbalanced-learn-0.12.4 xgboost-2.1.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart t

In [3]:
import pandas as pd
import numpy as np

In [4]:
data=pd.read_csv('Base.csv')

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

In [12]:
# Step 1: Clean data
data = data.replace(-1, np.nan)  # replace -1 as missing
data['fraud_bool'] = data['fraud_bool'].astype(int)

In [13]:
# Step 2: Define target and features
Y = data['fraud_bool']
X = data.drop(columns=['fraud_bool'])

In [14]:
# Step 3: Identify column types
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

In [15]:
# Step 4: Define transformers
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [16]:
# Step 5: Column Transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

In [10]:
# ✅ Calculate scale_pos_weight for class imbalanceThe first part calculates the scale_pos_weight to address the class imbalance 
#(fraud is much less frequent than non-fraud). The second part sets up the model pipeline, 
#which includes preprocessing and training the model in one step. This ensures the model can better detect fraud despite the class imbalance.
# scale_pos_weight = (number of negative samples / number of positive samples)
neg, pos = np.bincount(Y)
scale_pos_weight = neg / pos
print(f"Class 0: {neg}, Class 1: {pos}, scale_pos_weight: {scale_pos_weight:.2f}")

# Build model pipeline
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=scale_pos_weight,
        random_state=42
    ))
])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train
clf.fit(X_train, y_train)

# Predict & report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

Class 0: 988971, Class 1: 11029, scale_pos_weight: 89.67


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       1.00      0.88      0.93    197891
           1       0.06      0.70      0.11      2109

    accuracy                           0.88    200000
   macro avg       0.53      0.79      0.52    200000
weighted avg       0.99      0.88      0.93    200000

