# In [7]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta


# 1. Configuration & Seeding

# A single seed value is applied to every random source the script uses
# (NumPy, the stdlib `random` module and Faker) so reruns are reproducible.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

fake = Faker()
Faker.seed(SEED)

# Number of customers to generate; the script also creates one order and one
# browsing record per customer.
num_entries = 100000

# Product catalogue. Product IDs assigned later are 1-based positions in
# this list.
products = [
    'Laptop',
    'Smartphone',
    'Headphones',
    'Smartwatch',
    'Camera',
    'Printer',
    'Router',
    'Tablet',
    'Mouse',
    'Keyboard',
]

# Options drawn from when an order is actually checked out.
payment_methods = [
    'Card',
    'PayPal',
    'MB WAY',
    'Bank Transfer',
]
shipping_methods = [
    'Standard',
    'Express',
    'Overnight',
]


# 2. Generate Customers

# Each column is drawn in full before the next one starts (all ages, then all
# genders). That order fixes which seeded values end up in which column.
customer_ages = [fake.random_int(min=18, max=78) for _ in range(num_entries)]
customer_genders = [
    fake.random_element(elements=('M', 'F')) for _ in range(num_entries)
]

# Customer IDs are the consecutive integers 1..num_entries.
customers = {
    'Customer ID': range(1, num_entries + 1),
    'Age': customer_ages,
    'Gender': customer_genders,
}
df_customers = pd.DataFrame(customers)


# 3. Generate Browsing Data

# One browsing record per customer. Columns are drawn one after another
# (sessions, then pages, then time per page) so the seeded sequence is
# consumed in a fixed order.
total_sessions = [fake.random_int(min=1, max=20) for _ in range(num_entries)]
total_pages_visited = [
    fake.random_int(min=1, max=200) for _ in range(num_entries)
]
avg_seconds_per_page = [
    fake.random_int(min=10, max=300) for _ in range(num_entries)
]

browsing_data = {
    'Customer ID': range(1, num_entries + 1),
    'Total Sessions': total_sessions,
    'Total Pages Visited': total_pages_visited,
    'Average Time Spent per Page (sec)': avg_seconds_per_page,
}
df_browsing = pd.DataFrame(browsing_data)


# 4. Generate Orders with Cart Abandonment Patterns
#
# Creates exactly one order per customer (order i belongs to customer i) and
# decides whether its cart was abandoned, using a probability driven by the
# customer's age and browsing behaviour.

order_ids = range(1, num_entries + 1)

# Build Customer ID -> value lookups once, up front. Filtering the DataFrames
# with a boolean mask inside the loop scans every row on every iteration,
# which makes this section quadratic in num_entries. Customer IDs are unique
# (they are generated from a range), so a dict returns the same row the mask
# lookup would.
age_by_customer = dict(zip(
    df_customers['Customer ID'].tolist(),
    df_customers['Age'].tolist(),
))
browsing_customer_ids = df_browsing['Customer ID'].tolist()
pages_by_customer = dict(zip(
    browsing_customer_ids,
    df_browsing['Total Pages Visited'].tolist(),
))
time_by_customer = dict(zip(
    browsing_customer_ids,
    df_browsing['Average Time Spent per Page (sec)'].tolist(),
))

# Relative likelihood of each entry in shipping_methods, in the same order.
shipping_weights = [0.7, 0.25, 0.05]

cart_abandoned_list = []
product_id_list = []
product_name_list = []
shipping_method_list = []

for i in order_ids:
    cust_id = i
    age = age_by_customer[cust_id]

    # Randomly choose a product; its ID is the 1-based position in `products`.
    p_id = random.randint(1, len(products))
    p_name = products[p_id - 1]

    # Start from a 50% abandonment chance and shift it per customer.
    base_abandon_prob = 0.5

    # Age effect: the youngest bracket abandons more, customers 60+ less.
    if 18 <= age <= 25:
        base_abandon_prob += 0.4
    elif age >= 60:
        base_abandon_prob -= 0.3

    visited_pages = pages_by_customer[cust_id]
    avg_time_spent = time_by_customer[cust_id]

    # Browsing effect: many pages visited -> less likely to abandon.
    if visited_pages > 100:
        base_abandon_prob -= 0.4

    # Browsing effect: long time per page -> less likely to abandon.
    if avg_time_spent > 200:
        base_abandon_prob -= 0.2

    # Keep the value a valid probability; the 0.95 cap means no customer is
    # guaranteed to abandon.
    final_abandon_prob = np.clip(base_abandon_prob, 0, 0.95)

    # Decide if abandoned.
    abandoned = np.random.rand() < final_abandon_prob
    cart_abandoned_list.append('Yes' if abandoned else 'No')

    product_id_list.append(p_id)
    product_name_list.append(p_name)

    # Abandoned carts never ship; they are recorded with the string 'None'.
    if abandoned:
        shipping_method = 'None'
    else:
        shipping_method = random.choices(
            shipping_methods,
            weights=shipping_weights,
            k=1
        )[0]
    shipping_method_list.append(shipping_method)

df_orders = pd.DataFrame({
    'Order ID': order_ids,
    'Customer ID': range(1, num_entries + 1),
    'Product ID': product_id_list,
    'Product Name': product_name_list,
    'Cart Abandoned': cart_abandoned_list,
    'Shipping Method': shipping_method_list
})


# 5. Generate Payments with Correlated Payment Failures
#
# For every order, records the payment method, the amount and whether the
# transaction succeeded. Abandoned carts get no payment at all; for the rest,
# the success chance depends on payment method, amount and customer age.

# Base success probability for each payment method. Defined once here rather
# than rebuilt on every loop iteration.
success_rates = {
    'Card': 0.98,
    'PayPal': 0.90,
    'MB WAY': 0.60,
    'Bank Transfer': 0.70
}

# Price tiers by product; anything not listed falls into the cheapest tier.
high_value_products = ('Laptop', 'Camera', 'Smartphone')
mid_value_products = ('Printer', 'Smartwatch', 'Tablet')

# Customer ID -> Age, built once. Looking the age up with a boolean mask per
# order scans the whole customer table each time (quadratic overall).
# Customer IDs are unique, so the dict yields the same age.
age_by_customer = dict(zip(
    df_customers['Customer ID'].tolist(),
    df_customers['Age'].tolist(),
))

transaction_success_list = []
transaction_amount_list = []
payment_method_list = []

# Walk the needed order columns in row order instead of materialising a row
# Series for every order.
for cart_abandoned, product, cust_id in zip(
    df_orders['Cart Abandoned'].tolist(),
    df_orders['Product Name'].tolist(),
    df_orders['Customer ID'].tolist(),
):
    if cart_abandoned == 'Yes':
        # No checkout happened: nothing paid, no method recorded.
        transaction_success = 'No'
        transaction_amount = 0
        payment_method = 'None'
    else:
        payment_method = random.choice(payment_methods)

        # Base success probability from payment method.
        base_success_prob = success_rates[payment_method]

        # Determine transaction amount based on product tier.
        if product in high_value_products:
            base_amount = random.randint(500, 2000)
        elif product in mid_value_products:
            base_amount = random.randint(150, 800)
        else:
            base_amount = random.randint(20, 300)
        transaction_amount = base_amount

        # Penalty for high amounts.
        if transaction_amount > 1000:
            base_success_prob -= 0.20

        # Penalty for older customers.
        age = age_by_customer[cust_id]
        if age >= 60:
            base_success_prob -= 0.15

        final_success_prob = np.clip(base_success_prob, 0, 1)

        # Decide success.
        success = np.random.rand() < final_success_prob
        transaction_success = 'Yes' if success else 'No'

    transaction_success_list.append(transaction_success)
    transaction_amount_list.append(transaction_amount)
    payment_method_list.append(payment_method)

df_orders['Payment Method'] = payment_method_list
df_orders['Transaction Success'] = transaction_success_list
df_orders['Transaction Amount'] = transaction_amount_list


# 6. Feedback Given
#
# Marks whether the customer left feedback. Only orders that were checked out
# and paid successfully can receive feedback; for those, the chance depends
# on age, amount paid and product.

# Customer ID -> Age, built once. A boolean-mask lookup per order scans the
# whole customer table each time (quadratic overall). Customer IDs are
# unique, so the dict yields the same age.
age_by_customer = dict(zip(
    df_customers['Customer ID'].tolist(),
    df_customers['Age'].tolist(),
))

high_value_products = ('Laptop', 'Camera', 'Smartphone')

feedback_given_list = []

# Walk the needed order columns in row order instead of materialising a row
# Series for every order.
for cart_abandoned, transaction_success, cust_id, transaction_amount, product in zip(
    df_orders['Cart Abandoned'].tolist(),
    df_orders['Transaction Success'].tolist(),
    df_orders['Customer ID'].tolist(),
    df_orders['Transaction Amount'].tolist(),
    df_orders['Product Name'].tolist(),
):
    if cart_abandoned == 'Yes' or transaction_success == 'No':
        # Nothing was bought, so there is nothing to give feedback on.
        feedback_given = 'No'
    else:
        age = age_by_customer[cust_id]

        # Base feedback probability.
        base_feedback_prob = 0.6

        # Age adjustments: over-60s respond more, under-25s less.
        if age > 60:
            base_feedback_prob += 0.3
        elif age < 25:
            base_feedback_prob -= 0.2

        # Transaction amount adjustments.
        if transaction_amount > 1000:
            base_feedback_prob += 0.4
        elif transaction_amount < 50:
            base_feedback_prob -= 0.15

        # High-value products -> more likely to give feedback.
        if product in high_value_products:
            base_feedback_prob += 0.3

        # The adjustments can push the sum outside [0, 1]; clip it back.
        final_feedback_prob = np.clip(base_feedback_prob, 0, 1)
        feedback_given = 'Yes' if np.random.rand() < final_feedback_prob else 'No'

    feedback_given_list.append(feedback_given)

df_orders['Feedback Given'] = feedback_given_list


# 7. Final Dataset

# Column order of the exported table: identifiers, customer attributes,
# browsing metrics, then order, payment and feedback fields.
final_columns = [
    'Order ID',
    'Customer ID',
    'Age',
    'Gender',
    'Total Sessions',
    'Total Pages Visited',
    'Average Time Spent per Page (sec)',
    'Product ID',
    'Product Name',
    'Cart Abandoned',
    'Shipping Method',
    'Payment Method',
    'Transaction Success',
    'Transaction Amount',
    'Feedback Given',
]

# Left joins keep every order row and attach the matching customer and
# browsing columns by Customer ID.
orders_with_customers = df_orders.merge(
    df_customers,
    on='Customer ID',
    how='left',
    suffixes=('', '_Customer'),
)
orders_with_browsing = orders_with_customers.merge(
    df_browsing,
    on='Customer ID',
    how='left',
    suffixes=('', '_Browsing'),
)

df_final = orders_with_browsing[final_columns]


# 8. Save to CSV

# Written relative to the current working directory; index=False keeps the
# DataFrame's row index out of the file.
output_path = 'eledget_single_dataset_stronger_patterns.csv'
df_final.to_csv(output_path, index=False)
