In [34]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Parameters
n_samples = 10000  # Total number of transactions
n_users = 1000     # Number of unique users
cities = ['Mumbai', 'Delhi', 'Bangalore', 'Kolkata', 'Chennai', 'Hyderabad', 'Pune', 'Ahmedabad', 'Jaipur', 'Lucknow',
          'Surat', 'Kanpur', 'Nagpur', 'Patna', 'Indore', 'Thane', 'Bhopal', 'Visakhapatnam', 'Pimpri-Chinchwad', 'Ghaziabad']
transaction_types = ['withdrawal', 'deposit', 'transfer', 'payment']
start_date = datetime(2022, 1, 1)
end_date = datetime(2022, 12, 31)

# Generate user profiles
user_ids = np.arange(1, n_users + 1)
user_locations = {user: random.choice(cities) for user in user_ids}
user_devices = {user: f'device_{user}' for user in user_ids}
user_ips = {user: f'192.168.0.{user}' for user in user_ids}

# Generate basic transaction data
transaction_id = np.arange(1, n_samples + 1)
user_id = np.random.choice(user_ids, size=n_samples)
is_fraud = np.array([0] * (n_samples // 2) + [1] * (n_samples // 2))  # 5000 non-fraud, 5000 fraud
np.random.shuffle(is_fraud)

# Transaction time
days_in_year = (end_date - start_date).days
random_days = np.random.randint(0, days_in_year, size=n_samples)
base_dates = [start_date + timedelta(days=int(day)) for day in random_days]

# Hour probabilities: fraud more likely at night
night_hours = [22, 23, 0, 1, 2, 3, 4, 5, 6]
weights_fraud = [1 if h in night_hours else 0.2 for h in range(24)]
p_fraud = np.array(weights_fraud) / sum(weights_fraud)
p_non_fraud = np.ones(24) / 24  # Uniform for non-fraud

hours = [np.random.choice(24, p=p_non_fraud if is_fraud[i] == 0 else p_fraud) for i in range(n_samples)]
transaction_time = [base_dates[i] + timedelta(hours=hours[i]) for i in range(n_samples)]

# Transaction type: fraud more likely to be 'transfer'
p_non_fraud_type = [0.25] * 4  # Equal probability
p_fraud_type = [1/6, 1/6, 0.5, 1/6]  # Higher probability for 'transfer'

transaction_type_list = [np.random.choice(transaction_types, p=p_non_fraud_type if is_fraud[i] == 0 else p_fraud_type) 
                        for i in range(n_samples)]

# Location, device, IP: fraud often deviates from usual patterns
all_devices = list(user_devices.values())
all_ips = list(user_ips.values())

location_list = []
device_id_list = []
ip_address_list = []

for i in range(n_samples):
    usual_location = user_locations[user_id[i]]
    usual_device = user_devices[user_id[i]]
    usual_ip = user_ips[user_id[i]]
    
    # Location
    if is_fraud[i] == 0:
        location = usual_location if random.random() < 0.95 else random.choice([c for c in cities if c != usual_location])
    else:
        location = usual_location if random.random() < 0.2 else random.choice([c for c in cities if c != usual_location])
    location_list.append(location)
    
    # Device
    if is_fraud[i] == 0:
        device = usual_device if random.random() < 0.95 else random.choice([d for d in all_devices if d != usual_device])
    else:
        device = usual_device if random.random() < 0.2 else random.choice([d for d in all_devices if d != usual_device])
    device_id_list.append(device)
    
    # IP
    if is_fraud[i] == 0:
        ip = usual_ip if random.random() < 0.95 else random.choice([ip for ip in all_ips if ip != usual_ip])
    else:
        ip = usual_ip if random.random() < 0.2 else random.choice([ip for ip in all_ips if ip != usual_ip])
    ip_address_list.append(ip)

# Transaction amount: fraud tends to be higher (in INR)
amount_list = [round(np.random.lognormal(7, 1) if is_fraud[i] == 0 else np.random.lognormal(8.5, 1), 2) 
               for i in range(n_samples)]

# Is mobile: fraud more likely from mobile
is_mobile_list = [np.random.choice([0,1], p=[0.5, 0.5] if is_fraud[i] == 0 else [0.3, 0.7]) for i in range(n_samples)]

# Create DataFrame
df = pd.DataFrame({
    'transaction_id': transaction_id,
    'user_id': user_id,
    'transaction_amount': amount_list,
    'transaction_time': transaction_time,
    'transaction_type': transaction_type_list,
    'location': location_list,
    'device_id': device_id_list,
    'ip_address': ip_address_list,
    'is_mobile': is_mobile_list,
    'is_fraud': is_fraud
})

# Optionally save to CSV
df.to_csv('synthetic_fraud_dataset_india.csv', index=False)

# Verify balance
print(df['is_fraud'].value_counts())

is_fraud
0    5000
1    5000
Name: count, dtype: int64


In [32]:
df

Unnamed: 0,transaction_id,user_id,transaction_amount,transaction_time,transaction_type,location,device_id,ip_address,is_mobile,is_fraud
0,1,103,399.50,2022-02-20 19:00:00,payment,Visakhapatnam,device_103,192.168.0.103,1,0
1,2,436,5910.50,2022-11-01 22:00:00,transfer,Bhopal,device_459,192.168.0.354,0,1
2,3,861,3812.60,2022-03-15 11:00:00,withdrawal,Delhi,device_861,192.168.0.861,1,0
3,4,271,619.80,2022-07-18 08:00:00,payment,Thane,device_271,192.168.0.245,0,0
4,5,107,7459.53,2022-07-11 19:00:00,withdrawal,Pimpri-Chinchwad,device_107,192.168.0.107,0,0
...,...,...,...,...,...,...,...,...,...,...
9995,9996,915,706.49,2022-10-11 21:00:00,deposit,Jaipur,device_915,192.168.0.915,0,0
9996,9997,210,8599.30,2022-02-05 02:00:00,withdrawal,Jaipur,device_897,192.168.0.210,1,1
9997,9998,741,3271.61,2022-11-08 10:00:00,transfer,Thane,device_741,192.168.0.741,1,0
9998,9999,862,7715.66,2022-06-24 05:00:00,transfer,Bangalore,device_565,192.168.0.162,0,1
