In [2]:
# Step 1: Upload the zip file from your local machine
from google.colab import files
import zipfile
import os

# Import the pandas library, which was missing
import pandas as pd

uploaded = files.upload()

# The key of the uploaded dictionary is the name of the file you selected
uploaded_file_name = list(uploaded.keys())[0]

# Step 2: Extract the creditcard.csv file
print(f"File '{uploaded_file_name}' uploaded successfully. Now extracting...")

extraction_path = 'credit_card_data'
if not os.path.exists(extraction_path):
    os.makedirs(extraction_path)

with zipfile.ZipFile(uploaded_file_name, 'r') as zip_ref:
    zip_ref.extractall(extraction_path)

print(f"File '{uploaded_file_name}' extracted to '{extraction_path}'.")

# Step 3: Find and load the creditcard.csv file
data_file_path = os.path.join(extraction_path, 'creditcard.csv')

# Load the dataset using the correct function for CSV files
try:
    df = pd.read_csv(data_file_path)
    print("Dataset 'creditcard.csv' loaded successfully.")
except FileNotFoundError:
    print(f"The file '{data_file_path}' was not found after extraction. Please check the contents of the zip file.")
    raise
except Exception as e:
    print(f"An error occurred while trying to read the CSV file: {e}")
    raise

# Step 4: Data exploration and preparation
print("\nFirst 5 rows of the dataset:")
print(df.head())

print("\nDataset information:")
df.info()

print("\nMissing values per column:")
print(df.isnull().sum())

Saving archive (4).zip to archive (4) (1).zip
File 'archive (4) (1).zip' uploaded successfully. Now extracting...
File 'archive (4) (1).zip' extracted to 'credit_card_data'.
Dataset 'creditcard.csv' loaded successfully.

First 5 rows of the dataset:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  

In [3]:
# Continue from the previous code block after it has successfully loaded the data.

# Since the 'Class' column is what we are trying to predict, let's see its distribution
# This will highlight the severe class imbalance
print("\nClass distribution:")
print(df['Class'].value_counts())
print(f"Fraudulent transactions make up {df['Class'].value_counts()[1]/len(df) * 100:.4f}% of the dataset.")

# Separate features (X) and target (y)
X = df.drop('Class', axis=1)
y = df['Class']

# Remove 'Time' and 'Amount' columns, as they may not be useful for all models
# and 'Amount' is not scaled like the other features (V1-V28)
X = X.drop(['Time', 'Amount'], axis=1)
# You might want to scale the 'Amount' feature instead of dropping it
# from sklearn.preprocessing import StandardScaler
# X['scaled_amount'] = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1))
# X = X.drop('Amount', axis=1)

from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("\nData split into training and testing sets.")


Class distribution:
Class
0    284315
1       492
Name: count, dtype: int64
Fraudulent transactions make up 0.1727% of the dataset.

Data split into training and testing sets.


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, roc_auc_score

# Initialize and train the Random Forest model
rf_baseline = RandomForestClassifier(n_estimators=100, random_state=42)
rf_baseline.fit(X_train, y_train)

# Make predictions on the test set
y_pred_baseline = rf_baseline.predict(X_test)

# Evaluate the model
print("--- Baseline Random Forest Model Performance ---")
print(classification_report(y_test, y_pred_baseline))
print(f"F1 Score: {f1_score(y_test, y_pred_baseline):.4f}")
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_baseline):.4f}")

--- Baseline Random Forest Model Performance ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.93      0.83      0.88        98

    accuracy                           1.00     56962
   macro avg       0.97      0.91      0.94     56962
weighted avg       1.00      1.00      1.00     56962

F1 Score: 0.8757
ROC-AUC Score: 0.9132


In [5]:
# Import the SMOTE library
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

print("--- Class Distribution After SMOTE ---")
print(y_res.value_counts())

--- Class Distribution After SMOTE ---
Class
0    227451
1    227451
Name: count, dtype: int64


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, roc_auc_score
import xgboost as xgb

# Initialize and train the Random Forest model on the SMOTE data
rf_smote = RandomForestClassifier(n_estimators=100, random_state=42)
rf_smote.fit(X_res, y_res)

# Make predictions on the original (non-SMOTE) test set
y_pred_rf_smote = rf_smote.predict(X_test)

# Evaluate the Random Forest model
print("\n--- Random Forest Model with SMOTE Performance ---")
print(classification_report(y_test, y_pred_rf_smote))
print(f"F1 Score: {f1_score(y_test, y_pred_rf_smote):.4f}")
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_rf_smote):.4f}")

# -------------------------------------------------------------------
# Initialize and train the XGBoost model on the SMOTE data
# The 'use_label_encoder=False' and 'eval_metric' are for avoiding warnings.
xgb_smote = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=42)
xgb_smote.fit(X_res, y_res)

# Make predictions on the original test set
y_pred_xgb_smote = xgb_smote.predict(X_test)

# Evaluate the XGBoost model
print("\n--- XGBoost Model with SMOTE Performance ---")
print(classification_report(y_test, y_pred_xgb_smote))
print(f"F1 Score: {f1_score(y_test, y_pred_xgb_smote):.4f}")
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_xgb_smote):.4f}")


--- Random Forest Model with SMOTE Performance ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.83      0.82      0.82        98

    accuracy                           1.00     56962
   macro avg       0.92      0.91      0.91     56962
weighted avg       1.00      1.00      1.00     56962

F1 Score: 0.8247
ROC-AUC Score: 0.9080


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- XGBoost Model with SMOTE Performance ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.69      0.85      0.76        98

    accuracy                           1.00     56962
   macro avg       0.85      0.92      0.88     56962
weighted avg       1.00      1.00      1.00     56962

F1 Score: 0.7615
ROC-AUC Score: 0.9231
