In [None]:
import kagglehub

# Fetch the fraud-detection dataset and the output of a reference notebook.
# Each call returns the local path of the downloaded artefact.
kartik2112_fraud_detection_path = kagglehub.dataset_download(
    'kartik2112/fraud-detection'
)
johanan28_credit_card_fraud_detection_path = kagglehub.notebook_output_download(
    'johanan28/credit-card-fraud-detection'
)

print('Data source import complete.')


Data source import complete.


In [None]:
!pip uninstall scipy -y
!pip install imbalanced-learn==0.11.0 scikit-learn==1.2.2 numpy==1.26.4 scipy==1.11.4 --no-deps

Found existing installation: scipy 1.15.3
Uninstalling scipy-1.15.3:
  Successfully uninstalled scipy-1.15.3
Collecting imbalanced-learn==0.11.0
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Collecting scikit-learn==1.2.2
  Downloading scikit_learn-1.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy==1.11.4
  Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.6/235.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load both splits from the directory that kagglehub resolved in the download
# cell instead of a hard-coded /kaggle/input path, so the notebook also runs
# outside the Kaggle environment (where the dataset lands in a local cache).
train_data = pd.read_csv(f"{kartik2112_fraud_detection_path}/fraudTrain.csv")
test_data = pd.read_csv(f"{kartik2112_fraud_detection_path}/fraudTest.csv")

In [None]:
# Report the size of each split and the class balance of the training labels.
fraud_rows = train_data['is_fraud'] == 1
legit_rows = train_data['is_fraud'] == 0

print("Training Data Shape:", train_data.shape)
print("Fraud Cases in Train:", int(fraud_rows.sum()))
print("Legitimate Cases in Train:", int(legit_rows.sum()))
print("\nTest Data Shape:", test_data.shape)

Training Data Shape: (1296675, 23)
Fraud Cases in Train: 7506
Legitimate Cases in Train: 1289169

Test Data Shape: (555719, 23)


In [None]:
# Convert the transaction timestamp column to datetime on both splits.
for frame in (train_data, test_data):
    frame['trans_date_trans_time'] = pd.to_datetime(frame['trans_date_trans_time'])

In [None]:
# Derive hour-of-day and day-of-week features from the parsed timestamp.
for frame in (train_data, test_data):
    timestamps = frame['trans_date_trans_time']
    frame['trans_hour'] = timestamps.dt.hour
    frame['trans_day_of_week'] = timestamps.dt.dayofweek

In [None]:
# The raw timestamp is no longer needed once hour / weekday have been extracted.
raw_timestamp_col = 'trans_date_trans_time'
train_data, test_data = (
    frame.drop(raw_timestamp_col, axis=1) for frame in (train_data, test_data)
)

In [None]:
# Categorical handling:
#   * label_encoded_cols are integer-coded with an encoder fitted on train only;
#     test values never seen in training share one reserved '<unknown>' code.
#   * one_hot_cols are one-hot encoded with the level set taken from train, so
#     both splits always produce the same dummy columns and drop the same
#     reference level (independent get_dummies calls could drop different
#     levels if a category were absent from one split).
categorical_cols = ['merchant', 'category', 'gender', 'city', 'state', 'job']
label_encoded_cols = ['merchant', 'city', 'state', 'job']
one_hot_cols = ['category', 'gender']

label_encoders = {}  # column name -> fitted encoder, kept for later inspection
for col in label_encoded_cols:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])

    # Vectorised membership test against the training classes (replaces a
    # per-row Python lambda doing a linear scan of the class array).
    seen_in_train = test_data[col].isin(le.classes_)

    # Append the placeholder as the last class so it receives its own code.
    le.classes_ = np.append(le.classes_, '<unknown>')
    test_data[col] = le.transform(test_data[col].where(seen_in_train, '<unknown>'))
    label_encoders[col] = le

for col in one_hot_cols:
    # Sorted train levels reproduce the column order get_dummies would pick.
    train_levels = sorted(train_data[col].dropna().unique())
    train_data[col] = pd.Categorical(train_data[col], categories=train_levels)
    # Test values outside train_levels become NaN and encode as all-zero rows.
    test_data[col] = pd.Categorical(test_data[col], categories=train_levels)

train_data = pd.get_dummies(train_data, columns=one_hot_cols, drop_first=True)
test_data = pd.get_dummies(test_data, columns=one_hot_cols, drop_first=True)

In [None]:
# Remove identifier / free-text columns before modelling.
# 'Unnamed: 0' is the name pandas gives the header-less first CSV column; it
# was previously left in and ended up as a model feature (see the printed
# feature list), although it appears to be just a saved row index.
# NOTE(review): 'cc_num' and 'unix_time' also look like identifier / absolute
# time columns that may not generalise from train to test — confirm whether
# they should be dropped as well.
columns_to_drop = ['Unnamed: 0', 'trans_num', 'first', 'last', 'street', 'dob']
# errors='ignore' skips any listed column that is not present in a frame.
train_data = train_data.drop(columns=columns_to_drop, errors='ignore')
test_data = test_data.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Give the test frame exactly the training columns, in the training order.
# Feature columns present only in train are added to test as all-zero columns;
# the target is never synthesised.
missing_cols = set(train_data.columns).difference(test_data.columns)
for absent in missing_cols:
    if absent == 'is_fraud':
        continue
    test_data[absent] = 0
test_data = test_data.loc[:, train_data.columns]

In [None]:
# Discard every row that still contains a missing value, in both splits.
train_data, test_data = (
    frame.dropna(axis=0, how='any') for frame in (train_data, test_data)
)

In [None]:
# Separate the feature matrix from the binary fraud label for each split.
target_col = 'is_fraud'
X_train, y_train = train_data.drop(columns=[target_col]), train_data[target_col]
X_test, y_test = test_data.drop(columns=[target_col]), test_data[target_col]

In [None]:
# Show the final feature set that the models will be trained on.
feature_names = list(X_train.columns)
print("Features after preprocessing:", feature_names)

Features after preprocessing: ['Unnamed: 0', 'cc_num', 'merchant', 'amt', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'unix_time', 'merch_lat', 'merch_long', 'trans_hour', 'trans_day_of_week', 'category_food_dining', 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos', 'category_health_fitness', 'category_home', 'category_kids_pets', 'category_misc_net', 'category_misc_pos', 'category_personal_care', 'category_shopping_net', 'category_shopping_pos', 'category_travel', 'gender_M']


In [None]:
# Standardise features using statistics learned from the training split only,
# then apply the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Oversample the fraud class on the training split only, until the
# minority/majority ratio reaches 0.1; the test split is left untouched.
# `n_jobs` is intentionally not passed: it has been deprecated on SMOTE since
# imbalanced-learn 0.10 and is removed in later releases; omitting it keeps
# the default nearest-neighbour search behaviour.
smote = SMOTE(sampling_strategy=0.1, random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

In [None]:
# Confirm how many rows the resampled training matrix contains.
resampled_shape = X_train_res.shape
print("Shape of resampled training data:", resampled_shape)

Shape of resampled training data: (1418085, 30)


In [None]:
# Candidate classifiers, all seeded identically; the dict key is the label
# printed alongside each model's results.
logistic_regression = LogisticRegression(max_iter=1000, random_state=42, n_jobs=1)
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=1)

models = {
    'Logistic Regression': logistic_regression,
    'Decision Tree': decision_tree,
    'Random Forest': random_forest,
}

In [None]:
# Fit each candidate on the SMOTE-resampled training data and evaluate it on
# the untouched, scaled test split.
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_res, y_train_res)

    # Hard class labels feed the confusion matrix and classification report.
    y_pred = model.predict(X_test_scaled)

    # ROC AUC is a ranking metric and needs a continuous score: use the
    # predicted probability of the fraud class (label 1). Passing the hard 0/1
    # predictions instead collapses the curve to a single operating point.
    fraud_column = list(model.classes_).index(1)
    y_score = model.predict_proba(X_test_scaled)[:, fraud_column]

    print(f"\nResults for {name}:")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_score))


Training Logistic Regression...

Results for Logistic Regression:
Confusion Matrix:
[[552839    735]
 [  2145      0]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           0.99    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      0.99      0.99    555719

ROC AUC Score: 0.49933613211603145

Training Decision Tree...

Results for Decision Tree:
Confusion Matrix:
[[550751   2823]
 [   698   1447]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.34      0.67      0.45      2145

    accuracy                           0.99    555719
   macro avg       0.67      0.83      0.72    555719
weighted avg       1.00      0.99      0.99    555719

ROC AUC Score: 0.8347462426886317

