In [1]:
# Step 1: Generate a small synthetic fraud detection dataset for demonstration
import numpy as np
import pandas as pd

np.random.seed(42)

# Generate 1000 samples
n_samples = 1000
# Features: amount, transaction_time, is_foreign, is_high_risk_country
amount = np.random.exponential(scale=100, size=n_samples)
transaction_time = np.random.randint(0, 24, size=n_samples)  # hour of day
is_foreign = np.random.binomial(1, 0.1, size=n_samples)  # 10% foreign
is_high_risk_country = np.random.binomial(1, 0.05, size=n_samples)  # 5% high risk

# Generate labels: fraud is more likely for high amount, foreign, high risk country, odd hours
fraud = (
        (amount > 200).astype(int) +
        (is_foreign == 1).astype(int) +
        (is_high_risk_country == 1).astype(int) +
        ((transaction_time < 6) | (transaction_time > 22)).astype(int)
)
# If sum of risk factors >= 2, label as fraud
labels = (fraud >= 2).astype(int)

# Create DataFrame
fraud_df = pd.DataFrame({
    'amount': amount,
    'transaction_time': transaction_time,
    'is_foreign': is_foreign,
    'is_high_risk_country': is_high_risk_country,
    'fraud': labels
})

print('Sample of synthetic fraud detection data:')
display(fraud_df.head())
print('Fraud distribution:')
print(fraud_df["fraud"].value_counts())

# Save the generated synthetic fraud detection DataFrame to a CSV file for later use.
fraud_df.to_csv('synthetic_fraud_data.csv', index=False)
print('Data saved to synthetic_fraud_data.csv')


Sample of synthetic fraud detection data:


Unnamed: 0,amount,transaction_time,is_foreign,is_high_risk_country,fraud
0,46.926809,14,0,0,0
1,301.012143,11,1,0,1
2,131.674569,15,0,0,0
3,91.294255,23,1,0,1
4,16.962487,18,0,0,0


Fraud distribution:
fraud
0    897
1    103
Name: count, dtype: int64
Data saved to synthetic_fraud_data.csv


In [None]:
# Step 2: Read the synthetic fraud detection data from file
import pandas as pd
fraud_df = pd.read_csv('synthetic_fraud_data.csv')
print('Loaded data sample:')
display(fraud_df.head())
print('Fraud distribution:')
print(fraud_df['fraud'].value_counts())


In [None]:
# Step 3: Exploratory Data Analysis (EDA)
import matplotlib.pyplot as plt

# 1. Check for missing values
print('Missing values in each column:')
print(fraud_df.isnull().sum())

# 2. Summary statistics
print('\nSummary statistics:')
print(fraud_df.describe())

# 3. Visualize feature distributions
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
axs[0, 0].hist(fraud_df['amount'], bins=30, color='skyblue')
axs[0, 0].set_title('Amount Distribution')
axs[0, 1].hist(fraud_df['transaction_time'], bins=24, color='orange')
axs[0, 1].set_title('Transaction Time Distribution')
axs[1, 0].bar(['No', 'Yes'], fraud_df['is_foreign'].value_counts().sort_index(), color='green')
axs[1, 0].set_title('Is Foreign')
axs[1, 1].bar(['No', 'Yes'], fraud_df['is_high_risk_country'].value_counts().sort_index(), color='red')
axs[1, 1].set_title('Is High Risk Country')
plt.tight_layout()
plt.show()

# 4. Check class balance
print('\nFraud class balance:')
print(fraud_df['fraud'].value_counts(normalize=True))


In [None]:
# Step 4: Data Preprocessing (train-test split and scaling)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Split features and target
y = fraud_df['fraud']
X = fraud_df.drop('fraud', axis=1)

# 2. Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 3. Scale numerical features
scaler = StandardScaler()

X_train[['amount', 'transaction_time']] = scaler.fit_transform(X_train[['amount', 'transaction_time']])
X_test[['amount', 'transaction_time']] = scaler.transform(X_test[['amount', 'transaction_time']])

# 4. Show shapes
print(f'Training set shape: {X_train.shape}, {y_train.shape}')
print(f'Test set shape: {X_test.shape}, {y_test.shape}')
print('Sample of scaled training data:')
display(X_train.head())

# Save train and test data for later use
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)
print('Train and test data saved as X_train.csv, X_test.csv, y_train.csv, y_test.csv')


In [10]:
# Step 5: Train and evaluate a Logistic Regression model for fraud detection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import pandas as pd

# Load train and test data (in case running this cell independently)
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv').values.ravel()
y_test = pd.read_csv('y_test.csv').values.ravel()

# Train logistic regression model
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)

# Predict on test set
y_pred = logreg.predict(X_test)
y_proba = logreg.predict_proba(X_test)[:, 1]

# Evaluation
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('\nClassification Report:')
print(classification_report(y_test, y_pred, digits=3))
print(f'ROC AUC Score: {roc_auc_score(y_test, y_proba):.3f}')


Confusion Matrix:
[[178   1]
 [ 12   9]]

Classification Report:
              precision    recall  f1-score   support

           0      0.937     0.994     0.965       179
           1      0.900     0.429     0.581        21

    accuracy                          0.935       200
   macro avg      0.918     0.711     0.773       200
weighted avg      0.933     0.935     0.924       200

ROC AUC Score: 0.930


In [None]:
# Step 6: Train and evaluate a Neural Network for fraud detection
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import pandas as pd

# Load train and test data (in case running this cell independently)
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv').values.ravel()
y_test = pd.read_csv('y_test.csv').values.ravel()

# Build a simple neural network model
nn_model = keras.Sequential([
    layers.Dense(16, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
nn_model.fit(X_train, y_train, epochs=30, batch_size=32, verbose=1, validation_split=0.1)

# Predict on test set
y_pred_nn = (nn_model.predict(X_test) > 0.5).astype(int).ravel()
y_proba_nn = nn_model.predict(X_test).ravel()

# Evaluation
print('Confusion Matrix (Neural Network):')
print(confusion_matrix(y_test, y_pred_nn))
print('\nClassification Report (Neural Network):')
print(classification_report(y_test, y_pred_nn, digits=3))
print(f'ROC AUC Score (Neural Network): {roc_auc_score(y_test, y_proba_nn):.3f}')


In [12]:
# Step 7: Improve models and compare Logistic Regression and Neural Network
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np

# Load train and test data
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv').values.ravel()
y_test = pd.read_csv('y_test.csv').values.ravel()

# 1. Use SMOTE to oversample the minority class in training data
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# 2. Compute class weights for use in models
classes = np.unique(y_train)
class_weights = dict(zip(classes, compute_class_weight('balanced', classes=classes, y=y_train)))

# 3. Improved Logistic Regression (with class weights)
logreg = LogisticRegression(random_state=42, class_weight=class_weights, max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)
y_proba_lr = logreg.predict_proba(X_test)[:, 1]

# 4. Improved Neural Network (with class weights and SMOTE data)
nn_model = keras.Sequential([
    layers.Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train_sm, y_train_sm, epochs=50, batch_size=32, verbose=0, validation_split=0.1, class_weight=class_weights)
y_pred_nn = (nn_model.predict(X_test) > 0.5).astype(int).ravel()
y_proba_nn = nn_model.predict(X_test).ravel()

# 5. Compare results
print('--- Improved Logistic Regression ---')
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr, digits=3))
print(f'ROC AUC Score: {roc_auc_score(y_test, y_proba_lr):.3f}\n')

print('--- Improved Neural Network ---')
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_nn))
print(classification_report(y_test, y_pred_nn, digits=3))
print(f'ROC AUC Score: {roc_auc_score(y_test, y_proba_nn):.3f}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
--- Improved Logistic Regression ---
Confusion Matrix:
[[157  22]
 [  2  19]]
              precision    recall  f1-score   support

           0      0.987     0.877     0.929       179
           1      0.463     0.905     0.613        21

    accuracy                          0.880       200
   macro avg      0.725     0.891     0.771       200
weighted avg      0.932     0.880     0.896       200

ROC AUC Score: 0.950

--- Improved Neural Network ---
Confusion Matrix:
[[153  26]
 [  0  21]]
              precision    recall  f1-score   support

           0      1.000     0.855     0.922       179
           1      0.447     1.000     0.618        21

    accuracy                          0.870       200
   macro avg      0.723     0.927     0.770       200
weighted avg      0.942     0.870     0.890       200

ROC AUC Score: 0.997
