In [1]:
import pandas as pd                     # Used for creating and saving the dataset
import numpy as np                      # Used for probability-based selections
from datetime import datetime, timedelta # Used to generate random timestamps
import random                           # Used for random values

# -----------------------------
# CONFIGURATION
# -----------------------------
# Number of transaction rows the generator is asked to produce.
n_samples = 5_000

# Per-transaction probability of a randomly injected fraud label. Rule-based
# fraud is OR-ed on top inside generate_data, so the realised fraud share in
# the output sits slightly above this value.
fraud_ratio = 0.25

# Cities a transaction location is sampled from (uniformly).
locations = [
    'Mumbai',
    'Delhi',
    'Bangalore',
    'Hyderabad',
    'Chennai',
    'Kolkata',
    'Pune',
    'Ahmedabad',
]

# -----------------------------
# DATA GENERATION FUNCTION
# -----------------------------
def generate_data(n, fraud_ratio, seed=None, location_choices=None):
    """Generate a synthetic UPI-style transaction dataset with fraud labels.

    Args:
        n: Number of transaction rows to generate.
        fraud_ratio: Probability (0-1) that any single transaction is labelled
            as fraud purely at random. Rule-based fraud is OR-ed on top, so
            the overall share of ``is_fraud == 1`` ends up slightly above it.
        seed: Optional seed. When given, every random draw comes from one
            private ``random.Random(seed)`` instance, so the same arguments
            always yield the same dataset. When ``None`` (the default) the
            shared ``random`` module state is used.
        location_choices: Optional sequence of location names to sample from.
            Defaults to the module-level ``locations`` list.

    Returns:
        pandas.DataFrame with one row per transaction and the columns
        ``transaction_id``, ``timestamp``, ``user_id``, ``receiver_id``,
        ``amount``, ``transaction_type``, ``location``, ``device_type``,
        ``is_rooted_device``, ``network_type``, ``time_of_day``, ``is_fraud``.
    """
    # A single RNG drives every draw so that one seed reproduces the dataset.
    rng = random if seed is None else random.Random(seed)

    if location_choices is None:
        location_choices = locations    # Fall back to the module-level list

    # Loop-invariant choice sets, built once instead of on every iteration
    transaction_types = ('send', 'receive', 'merchant_payment')
    device_types = ('mobile', 'tablet')
    network_types = ('WiFi', '4G', '5G')
    n_users = 5000                      # IDs run from user_1 to user_5000

    # 2024 is a leap year, so derive the minute count instead of assuming
    # 365 days; otherwise 31 December would (almost) never be sampled.
    year_start = datetime(2024, 1, 1)
    minutes_in_year = int((datetime(2025, 1, 1) - year_start).total_seconds()) // 60

    data = []                           # Transaction rows, one list per row

    for i in range(n):                  # Loop through each transaction

        # Timestamp: uniform over every minute of 2024
        timestamp = year_start + timedelta(
            minutes=rng.randint(0, minutes_in_year - 1)
        )

        # Transaction amount
        amount = round(rng.uniform(10, 50000), 2)  # Random amount 10-50000

        # Random feature selections
        transaction_type = rng.choice(transaction_types)
        location = rng.choice(location_choices)
        device_type = rng.choice(device_types)

        # Rooted device flag (5% chance of being rooted)
        is_rooted_device = 1 if rng.random() < 0.05 else 0

        # Network type
        network_type = rng.choice(network_types)

        # Time of day calculation
        hour = timestamp.hour
        time_of_day = (
            'night' if hour < 6 else
            'morning' if hour < 12 else
            'afternoon' if hour < 18 else
            'evening'
        )                               # Categorizes the time into day parts

        # -----------------------------
        # FRAUD LOGIC
        # -----------------------------
        rule_fraud = (
            amount > 25000 and               # High amount
            is_rooted_device == 1 and        # Rooted device
            transaction_type == 'send' and   # Send transactions
            time_of_day in ('night', 'evening')  # Risky time
        )                                   # Rule-based fraud pattern

        random_fraud = rng.random() < fraud_ratio  # Injects fraud based on ratio

        is_fraud = 1 if (rule_fraud or random_fraud) else 0  # Final fraud decision

        # Sender and receiver must differ: draw the receiver from the other
        # n_users - 1 IDs by skipping over the sender's ID.
        sender = rng.randint(1, n_users)
        receiver = rng.randint(1, n_users - 1)
        if receiver >= sender:
            receiver += 1

        # Store transaction as a row
        data.append([
            i + 1, timestamp,
            f"user_{sender}",
            f"user_{receiver}",
            amount, transaction_type, location, device_type,
            is_rooted_device, network_type, time_of_day, is_fraud
        ])

    # Convert list to DataFrame
    df = pd.DataFrame(data, columns=[
        'transaction_id', 'timestamp', 'user_id', 'receiver_id', 'amount',
        'transaction_type', 'location', 'device_type', 'is_rooted_device',
        'network_type', 'time_of_day', 'is_fraud'
    ])

    return df                           # Return final dataset

# -----------------------------
# GENERATE & SAVE DATA
# -----------------------------
# Build the synthetic transaction table from the module-level configuration.
df = generate_data(n=n_samples, fraud_ratio=fraud_ratio)

# Persist it as CSV; index=False keeps the pandas row index out of the file.
df.to_csv(path_or_buf='upi_fraud_data.csv', index=False)

# Confirm completion on stdout.
print("Dataset generated and saved as upi_fraud_data.csv")


Dataset generated and saved as upi_fraud_data.csv
