In [1]:
!pip install faker

Collecting faker
  Downloading faker-36.2.2-py3-none-any.whl.metadata (15 kB)
Downloading faker-36.2.2-py3-none-any.whl (1.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 24.2 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-36.2.2


In [2]:
import random
from bisect import bisect_right, insort
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from faker import Faker

# Reproducibility: one seed drives both the stdlib `random` stream and Faker.
# Timestamps are still drawn relative to "now", so the calendar dates shift
# between runs even though every other random draw repeats.
SEED = 42

# Tunable knobs for anomaly injection and the labelling rules.
GEO_ANOMALY_RATE = 0.1  # share of transactions placed in a random city
HIGH_AMOUNT_FACTOR = 3  # Rule 1: flag amounts above factor * avg_amount
VELOCITY_WINDOW = timedelta(hours=1)  # Rule 4: look-back window
VELOCITY_LIMIT = 5  # Rule 4: flag when more prior transactions than this
CHARGEBACK_FRAUD_RATE = 0.3  # Rule 6: extra fraud chance for chargeback customers
OUTPUT_CSV = "realistic_credit_card_transactions.csv"

# Initialize Faker
fake = Faker()
Faker.seed(SEED)
random.seed(SEED)

# Define customer profiles (typical behavior for each customer)
customer_profiles = {
    1001: {"avg_amount": 100, "location": "New York", "merchant_categories": ["Retail", "Food & Beverage"], "typical_transaction_times": ["09:00-18:00"], "chargeback_history": 0},
    1002: {"avg_amount": 50, "location": "Chicago", "merchant_categories": ["Retail", "Transportation"], "typical_transaction_times": ["07:00-20:00"], "chargeback_history": 1},
    1003: {"avg_amount": 200, "location": "Los Angeles", "merchant_categories": ["Shopping", "Food & Beverage"], "typical_transaction_times": ["10:00-22:00"], "chargeback_history": 2},
    1004: {"avg_amount": 150, "location": "Houston", "merchant_categories": ["Retail", "Shopping"], "typical_transaction_times": ["08:00-19:00"], "chargeback_history": 0},
    1005: {"avg_amount": 75, "location": "Miami", "merchant_categories": ["Food & Beverage", "Transportation"], "typical_transaction_times": ["12:00-23:00"], "chargeback_history": 1},
}

# Define merchant categories
merchant_categories = ["Retail", "Food & Beverage", "Shopping", "Transportation", "Online"]


def _parse_time_windows(ranges):
    """Convert "HH:MM-HH:MM" strings into (start, end) `datetime.time` pairs.

    Args:
        ranges: iterable of strings such as "09:00-18:00".

    Returns:
        List of (start_time, end_time) tuples, in the same order as `ranges`.

    Raises:
        ValueError: if a string does not split into exactly two "%H:%M" parts.
    """
    windows = []
    for time_range in ranges:
        start, end = time_range.split('-')
        windows.append((
            datetime.strptime(start, "%H:%M").time(),
            datetime.strptime(end, "%H:%M").time(),
        ))
    return windows


def _in_any_window(moment, windows):
    """Return True when `moment` falls inside at least one (start, end) window.

    Both bounds are inclusive. A window whose start is later than its end is
    treated as wrapping past midnight (e.g. 22:00-02:00).
    """
    for start_time, end_time in windows:
        if start_time <= end_time:
            if start_time <= moment <= end_time:
                return True
        elif moment >= start_time or moment <= end_time:
            return True
    return False


# Loop-invariant lookups, built once instead of on every iteration.
customer_ids = list(customer_profiles)
typical_windows = {
    cid: _parse_time_windows(profile["typical_transaction_times"])
    for cid, profile in customer_profiles.items()
}
# Per-customer timestamps of already generated transactions, kept sorted so
# the velocity rule can count a time window with two binary searches.
seen_timestamps = {cid: [] for cid in customer_profiles}

# Generate synthetic data
data = []
num_transactions = 100000  # Number of transactions to generate

for transaction_id in range(1, num_transactions + 1):
    customer_id = random.choice(customer_ids)
    customer_profile = customer_profiles[customer_id]

    # Generate transaction date and time
    transaction_date = fake.date_time_between(start_date='-30d', end_date='now')
    transaction_time = transaction_date.time()

    # Generate transaction amount based on customer profile
    avg_amount = customer_profile["avg_amount"]
    transaction_amount = round(random.uniform(0.5 * avg_amount, 1.5 * avg_amount), 2)

    # Generate merchant name and category
    merchant_name = fake.company()
    merchant_category = random.choice(merchant_categories)

    # Generate transaction location (sometimes deviate from customer's typical location)
    if random.random() < GEO_ANOMALY_RATE:
        transaction_location = fake.city()
    else:
        transaction_location = customer_profile["location"]

    # Determine fraud based on rules. Each rule can only raise the flag;
    # no rule is allowed to clear a flag set by another one.
    fraud_flag = 0  # Default to legitimate transaction

    # Rule 1: Unusually high transaction amount
    # NOTE(review): amounts are drawn from [0.5, 1.5] * avg_amount above, so
    # this rule cannot fire with the current sampling. Kept so it takes effect
    # if amount anomalies are injected later.
    if transaction_amount > HIGH_AMOUNT_FACTOR * avg_amount:
        fraud_flag = 1

    # Rule 2: Geographic anomaly
    if transaction_location != customer_profile["location"]:
        fraud_flag = 1

    # Rule 3: Unusual merchant category
    if merchant_category not in customer_profile["merchant_categories"]:
        fraud_flag = 1

    # Rule 4: High-frequency transactions. Count this customer's already
    # generated transactions whose timestamp lies in (t - window, t]. Rows are
    # not generated chronologically, so later-timestamped rows are excluded
    # explicitly rather than being counted through a negative time delta.
    history = seen_timestamps[customer_id]
    window_start = bisect_right(history, transaction_date - VELOCITY_WINDOW)
    window_end = bisect_right(history, transaction_date)
    if window_end - window_start > VELOCITY_LIMIT:
        fraud_flag = 1
    insort(history, transaction_date)

    # Rule 5: Unusual time of day
    if not _in_any_window(transaction_time, typical_windows[customer_id]):
        fraud_flag = 1

    # Rule 6: Chargeback history adds an extra chance of fraud. It must not
    # reset a flag raised by rules 1-5.
    if customer_profile["chargeback_history"] > 0 and random.random() < CHARGEBACK_FRAUD_RATE:
        fraud_flag = 1

    # Append transaction to data
    data.append([transaction_id, customer_id, transaction_date, transaction_amount, merchant_name, merchant_category, transaction_location, fraud_flag])

# Create DataFrame
df = pd.DataFrame(data, columns=["Transaction_ID", "Customer_ID", "Transaction_Date", "Transaction_Amount", "Merchant_Name", "Merchant_Category", "Transaction_Location", "Fraud_Flag"])

# Save to CSV
df.to_csv(OUTPUT_CSV, index=False)

print(f"Synthetic credit card transaction data generated and saved to '{OUTPUT_CSV}'.")

Synthetic credit card transaction data generated and saved to 'realistic_credit_card_transactions.csv'.
