In [1]:
# from google.colab import files
# uploaded = files.upload()  # Upload kaggle.json

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
!kaggle datasets download -d mlg-ulb/creditcardfraud

Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
License(s): DbCL-1.0


In [4]:
!unzip creditcardfraud.zip

Archive:  creditcardfraud.zip
  inflating: creditcard.csv          


In [5]:
import pandas as pd
df = pd.read_csv('creditcard.csv')
print(df.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [6]:


# Explore the data
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:\n", df.head())
print("\nData Info:\n", df.info())
print("\nClass Distribution:\n", df['Class'].value_counts())
print("\nMissing Values:\n", df.isnull().sum())

Dataset Shape: (284807, 31)

First 5 rows:
    Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


# Split features and target
X = df.drop('Class', axis=1)
y = df['Class']

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine resampled data into a DataFrame
processed_df = pd.DataFrame(X_resampled, columns=X.columns)
processed_df['Class'] = y_resampled

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Save the processed data as CSV files
train_df = pd.DataFrame(X_train, columns=X.columns)
train_df['Class'] = y_train
test_df = pd.DataFrame(X_test, columns=X.columns)
test_df['Class'] = y_test

train_df.to_csv('train_processed.csv', index=False)
test_df.to_csv('test_processed.csv', index=False)

print("Processed data saved as train_processed.csv and test_processed.csv")
print("Training set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

Processed data saved as train_processed.csv and test_processed.csv
Training set shape: (454904, 31)
Test set shape: (113726, 31)


In [8]:
import sqlite3

# Create a SQLite database
conn = sqlite3.connect('creditcard_data.db')
cursor = conn.cursor()

# Create tables for train and test data
train_df.to_sql('train_data', conn, if_exists='replace', index=False)
test_df.to_sql('test_data', conn, if_exists='replace', index=False)

# Verify the data
print("Training data in SQLite:")
print(pd.read_sql_query("SELECT * FROM train_data LIMIT 5", conn))
print("Test data in SQLite:")
print(pd.read_sql_query("SELECT * FROM test_data LIMIT 5", conn))

# Close the connection
conn.close()

Training data in SQLite:
            Time        V1        V2        V3        V4        V5        V6  \
0  135001.676696  1.282330  0.875020 -2.507626  2.071007  0.505984 -0.679542   
1  102650.727183 -4.193704  5.093261 -8.889153  7.062972 -3.263271 -1.922530   
2  140109.783604  0.782775  2.503060 -5.236965  3.786795  1.762973 -0.940165   
3   62934.000000 -1.119211 -0.076681  2.532709  0.480750 -0.939219  0.296271   
4  122324.000000  2.048996 -0.366883 -2.543852 -0.728626  2.387568  3.318339   

         V7        V8        V9  ...       V21       V22       V23       V24  \
0 -0.327953 -0.982698 -0.632150  ... -0.341666 -0.018783 -0.050106  0.186553   
1 -6.041619  2.745936 -3.466607  ...  1.215907  0.579972  0.041185 -0.462953   
2 -0.563909 -0.801622 -1.738874  ...  1.257866 -0.182190  0.134185 -1.445014   
3 -0.722636  0.130148  1.505440  ...  0.016087  0.576419 -0.172156  0.554262   
4 -0.477944  0.791325  0.403268  ... -0.292818 -0.835277  0.380947  0.698226   

        V25  

In [9]:
# Install MLflow and XGBoost
!pip install mlflow xgboost

import mlflow
import mlflow.xgboost
import pandas as pd
from sklearn.metrics import f1_score, roc_auc_score
from xgboost import XGBClassifier
import joblib

# Load the processed data (use CSV or SQLite)
# Option 1: Load from CSV
train_df = pd.read_csv('train_processed.csv')
test_df = pd.read_csv('test_processed.csv')

# Option 2: Load from SQLite (if you used it)
# conn = sqlite3.connect('creditcard_data.db')
# train_df = pd.read_sql_query("SELECT * FROM train_data", conn)
# test_df = pd.read_sql_query("SELECT * FROM test_data", conn)
# conn.close()

# Split features and target
X_train = train_df.drop('Class', axis=1)
y_train = train_df['Class']
X_test = test_df.drop('Class', axis=1)
y_test = test_df['Class']

# Train with MLflow
with mlflow.start_run():
    model = XGBClassifier(n_estimators=100, max_depth=5, random_state=42, eval_metric='logloss')
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)

    # Log metrics and parameters
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", 5)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("auc", auc)

    # Log model
    mlflow.xgboost.log_model(model, "model")

    print(f"F1-Score: {f1}, AUC: {auc}")

    # Save the model locally
    joblib.dump(model, 'fraud_model_xgb.pkl')

Collecting mlflow
  Downloading mlflow-2.21.2-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.21.2 (from mlflow)
  Downloading mlflow_skinny-2.21.2-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.21.2->mlflow)
  Downloading databricks_sdk-0.49.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.21.2->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.21.2->mlflow)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 k



F1-Score: 0.9997982013599473, AUC: 0.9997973568281938


In [10]:
from google.colab import drive
drive.mount('/content/drive')
!cp fraud_model_xgb.pkl /content/drive/MyDrive/fraud_model_xgb.pkl
print("Model uploaded to Google Drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model uploaded to Google Drive
