In [23]:
!pip install xgboost
from google.colab import drive
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline  # Use imbalanced-learn's Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Step 1: Mount Google Drive so the CSV files under MyDrive are reachable
drive.mount('/content/drive')

# Step 2: Locations of the training and test CSV files on the mounted drive
train_path = "/content/drive/MyDrive/fraudTrain.csv"
test_path = "/content/drive/MyDrive/fraudTest.csv"

# Step 3: Read each file into its own DataFrame
# Training data
df_train = pd.read_csv(filepath_or_buffer=train_path)
# Test data
df_test = pd.read_csv(filepath_or_buffer=test_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
# Drop unnecessary columns
# The list is defined once so the train and test frames are always reduced in
# exactly the same way.
columns_to_drop = ['Unnamed: 0', 'trans_num', 'trans_date_trans_time', 'dob', 'state']

# Reassign instead of mutating in place, and skip columns that are already gone,
# so re-running this cell does not raise a KeyError.
df_train = df_train.drop(columns=columns_to_drop, errors='ignore')
df_test = df_test.drop(columns=columns_to_drop, errors='ignore')

In [25]:
# Split the 'is_fraud' label off each frame before any transformation.
# pop() removes the column from the frame and returns it as a Series.
y_train = df_train.pop(item='is_fraud')
y_test = df_test.pop(item='is_fraud')

In [26]:
# Feature columns, grouped by how they are encoded
categorical_columns = ['merchant', 'category', 'gender', 'city', 'job']
numerical_columns = ['amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long']

# Numerical features: fill missing values with the column median, then standardize.
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: one-hot encode into a sparse matrix; categories that were
# not seen during fit are encoded as all zeros instead of raising an error.
categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)

# Preprocessing pipeline
# Transformer order fixes the output column order: categorical block first, then
# numerical. Columns not listed in either group are dropped (ColumnTransformer's
# default remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoder, categorical_columns),
        ('num', numeric_steps, numerical_columns),
    ]
)

# Hold out 20% of the training file, stratified on the label so both parts keep
# the original class ratio.
# NOTE(review): this rebinds y_train and y_test, so the y_test popped from df_test
# is replaced by the held-out labels from df_train, and re-running this cell
# without reloading the data passes the already-split y_train back in.
X_train, X_test, y_train, y_test = train_test_split(
    df_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)

# Fit the preprocessor on the training part only ...
X_train_preprocessed = preprocessor.fit_transform(X_train)
# ... then reuse the fitted transformations on the held-out part.
X_test_preprocessed = preprocessor.transform(X_test)

In [27]:
# Oversample with SMOTE on the preprocessed training features only; the held-out
# features are not resampled. This runs after preprocessing and before any model
# is trained.
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_train_resampled, y_train_resampled = smote.fit_resample(
    X=X_train_preprocessed,
    y=y_train,
)

In [28]:
# Initialize classifiers
# Keys are the display names printed during training; every model uses the same
# seed so runs are repeatable.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` was dropped here: the installed XGBoost ignores it and
    # only emits a 'Parameters: { "use_label_encoder" } are not used.' warning.
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

In [29]:
# Train and evaluate each model
# Decision thresholds are defined once, outside the loop, and interpolated into
# the report headers so the printed value can never drift from the value used.
default_threshold = 0.5
custom_threshold = 0.3

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    model.fit(X_train_resampled, y_train_resampled)

    # Get probabilities for the positive class (fraud)
    y_prob = model.predict_proba(X_test_preprocessed)[:, 1]

    # Report at the default threshold
    y_pred_default = (y_prob >= default_threshold).astype(int)
    print(f"Performance of {model_name} at default threshold ({default_threshold}):")
    print(classification_report(y_test, y_pred_default))

    # Report at the lower, custom threshold (flags more transactions as fraud)
    y_pred_custom = (y_prob >= custom_threshold).astype(int)
    print(f"Performance of {model_name} at custom threshold ({custom_threshold}):")
    print(classification_report(y_test, y_pred_custom))


Training Random Forest...
Performance of Random Forest at default threshold (0.5):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257834
           1       0.83      0.65      0.73      1501

    accuracy                           1.00    259335
   macro avg       0.91      0.82      0.86    259335
weighted avg       1.00      1.00      1.00    259335

Performance of Random Forest at custom threshold (0.3):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257834
           1       0.68      0.77      0.73      1501

    accuracy                           1.00    259335
   macro avg       0.84      0.89      0.86    259335
weighted avg       1.00      1.00      1.00    259335


Training Gradient Boosting...
Performance of Gradient Boosting at default threshold (0.5):
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    257834
     

Parameters: { "use_label_encoder" } are not used.



Performance of XGBoost at default threshold (0.5):
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    257834
           1       0.43      0.85      0.57      1501

    accuracy                           0.99    259335
   macro avg       0.72      0.92      0.79    259335
weighted avg       1.00      0.99      0.99    259335

Performance of XGBoost at custom threshold (0.3):
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    257834
           1       0.25      0.91      0.39      1501

    accuracy                           0.98    259335
   macro avg       0.62      0.94      0.69    259335
weighted avg       1.00      0.98      0.99    259335

