In [1]:
pip install pandas faker geopy



In [2]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta
from geopy.distance import geodesic  # For accurate geographic distance calculation

# Shared Faker instance used below to generate synthetic values
# (transaction timestamps, merchant company names, city names).
fake = Faker()

# Per-customer behavioural baseline, keyed by customer id. Each profile holds
# the customer's average spend, home city, usual merchant categories, usual
# transaction hours ("HH:MM-HH:MM" ranges) and number of past chargebacks.
customer_profiles = {
    1001: {
        "avg_amount": 100,
        "location": "New York",
        "merchant_categories": ["Retail", "Food & Beverage"],
        "typical_transaction_times": ["09:00-18:00"],
        "chargeback_history": 0,
    },
    1002: {
        "avg_amount": 50,
        "location": "Chicago",
        "merchant_categories": ["Retail", "Transportation"],
        "typical_transaction_times": ["07:00-20:00"],
        "chargeback_history": 1,
    },
    1003: {
        "avg_amount": 200,
        "location": "Los Angeles",
        "merchant_categories": ["Shopping", "Food & Beverage"],
        "typical_transaction_times": ["10:00-22:00"],
        "chargeback_history": 2,
    },
    1004: {
        "avg_amount": 150,
        "location": "Houston",
        "merchant_categories": ["Retail", "Shopping"],
        "typical_transaction_times": ["08:00-19:00"],
        "chargeback_history": 0,
    },
    1005: {
        "avg_amount": 75,
        "location": "Miami",
        "merchant_categories": ["Food & Beverage", "Transportation"],
        "typical_transaction_times": ["12:00-23:00"],
        "chargeback_history": 1,
    },
}

# Every merchant category a generated transaction may be assigned.
merchant_categories = [
    "Retail",
    "Food & Beverage",
    "Shopping",
    "Transportation",
    "Online",
]

# A transaction farther than this many miles from the customer's home city
# is treated as a geographic anomaly.
GEO_DISTANCE_THRESHOLD = 500  # Example: 500 miles

# (latitude, longitude) of each supported city, used for distance calculation.
city_coordinates = {
    "New York": (40.7128, -74.0060),
    "Chicago": (41.8781, -87.6298),
    "Los Angeles": (34.0522, -118.2437),
    "Houston": (29.7604, -95.3698),
    "Miami": (25.7617, -80.1918),
}

def calculate_distance(city1, city2):
    """Return the geodesic distance in miles between two cities.

    Both names are looked up in ``city_coordinates``. If either one has no
    entry there, 0 is returned instead of raising.
    """
    if city1 not in city_coordinates or city2 not in city_coordinates:
        return 0  # Default to 0 if city coordinates are not found
    start_point = city_coordinates[city1]
    end_point = city_coordinates[city2]
    return geodesic(start_point, end_point).miles

# Generate synthetic transaction data
#
# Each row of `data` is:
#   [transaction_id, customer_id, transaction_date, transaction_amount,
#    merchant_name, merchant_category, transaction_location, fraud_flag]
# fraud_flag starts at 0 and is raised to 1 by any of the rules below; no rule
# ever lowers it again.
data = []
num_transactions = 10000  # Number of transactions to generate

# Rule 4 parameters: more than HIGH_FREQUENCY_LIMIT earlier transactions by the
# same customer within HIGH_FREQUENCY_WINDOW_SECONDS flags the transaction.
HIGH_FREQUENCY_WINDOW_SECONDS = 3600  # Last hour
HIGH_FREQUENCY_LIMIT = 5

# Loop invariants, computed once instead of per transaction.
customer_ids = list(customer_profiles)
typical_windows = {}  # customer_id -> list of (start_time, end_time) pairs
for profile_id, profile in customer_profiles.items():
    windows = []
    for time_range in profile["typical_transaction_times"]:
        start, end = time_range.split('-')
        windows.append((
            datetime.strptime(start, "%H:%M").time(),
            datetime.strptime(end, "%H:%M").time(),
        ))
    typical_windows[profile_id] = windows

# Draw all timestamps up front and process them chronologically, so that
# "transactions in the last hour" (Rule 4) only ever looks at the past.
# A side effect is that Transaction_ID increases with Transaction_Date.
transaction_dates = sorted(
    fake.date_time_between(start_date='-30d', end_date='now')
    for _ in range(num_transactions)
)

# customer_id -> timestamps of that customer's transactions within the last
# hour. Pruned on every visit, so each list stays short.
recent_by_customer = {}

for transaction_id, transaction_date in enumerate(transaction_dates, start=1):
    customer_id = random.choice(customer_ids)
    customer_profile = customer_profiles[customer_id]
    home_location = customer_profile["location"]
    transaction_time = transaction_date.time()

    # Generate transaction amount based on customer's average amount
    avg_amount = customer_profile["avg_amount"]
    transaction_amount = round(random.uniform(0.5 * avg_amount, 1.5 * avg_amount), 2)

    # Generate merchant details
    merchant_name = fake.company()
    merchant_category = random.choice(merchant_categories)

    # Generate transaction location (10% chance of being in a different city).
    # The other city is drawn from city_coordinates so that its distance from
    # home can actually be computed; a city without coordinates would yield a
    # distance of 0 from calculate_distance and never trip Rule 2.
    transaction_location = home_location
    if random.random() < 0.1:
        other_cities = [city for city in city_coordinates if city != home_location]
        if other_cities:
            transaction_location = random.choice(other_cities)

    # Initialize fraud flag
    fraud_flag = 0

    # Rule 1: Transaction amount is unusually high
    # NOTE: amounts are drawn from [0.5, 1.5] x avg_amount above, so this rule
    # cannot fire unless the amount generation is widened.
    if transaction_amount > 3 * avg_amount:
        fraud_flag = 1

    # Rule 2: Transaction location is far from the customer's typical location
    if transaction_location != home_location:
        distance = calculate_distance(transaction_location, home_location)
        if distance > GEO_DISTANCE_THRESHOLD:
            fraud_flag = 1

    # Rule 3: Merchant category is unusual for the customer
    if merchant_category not in customer_profile["merchant_categories"]:
        fraud_flag = 1

    # Rule 4: Too many transactions in a short time (high-frequency transactions)
    # Timestamps are visited in ascending order, so every stored timestamp is
    # at or before transaction_date and only the age check is needed.
    recent_transactions = [
        earlier
        for earlier in recent_by_customer.get(customer_id, [])
        if (transaction_date - earlier).total_seconds() < HIGH_FREQUENCY_WINDOW_SECONDS
    ]
    if len(recent_transactions) > HIGH_FREQUENCY_LIMIT:
        fraud_flag = 1
    recent_transactions.append(transaction_date)
    recent_by_customer[customer_id] = recent_transactions

    # Rule 5: Transaction time is unusual for the customer
    is_typical_time = any(
        start_time <= transaction_time <= end_time
        for start_time, end_time in typical_windows[customer_id]
    )
    if not is_typical_time:
        fraud_flag = 1

    # Rule 6: Customer has a history of chargebacks
    # Raises the flag with 30% probability; it must never reset a flag that an
    # earlier rule already set.
    if customer_profile["chargeback_history"] > 0 and random.random() < 0.3:
        fraud_flag = 1

    # Append transaction to data
    data.append([transaction_id, customer_id, transaction_date, transaction_amount, merchant_name, merchant_category, transaction_location, fraud_flag])

# Convert data to DataFrame
df = pd.DataFrame(data, columns=["Transaction_ID", "Customer_ID", "Transaction_Date", "Transaction_Amount", "Merchant_Name", "Merchant_Category", "Transaction_Location", "Fraud_Flag"])

# Save data to CSV
df.to_csv("realistic_credit_card_transactions.csv", index=False)

print("Synthetic credit card transaction data generated and saved to 'realistic_credit_card_transactions.csv'.")

Synthetic credit card transaction data generated and saved to 'realistic_credit_card_transactions.csv'.
