## Mock Data Generation for E-commerce Chatbot

In [45]:
import pandas as pd
from faker import Faker
from datetime import datetime, timedelta
import random

In [33]:
# Seed both random sources so that "Restart & Run All" draws the same sequence
# of fake values and random choices on every run. Faker is seeded separately
# from the standard-library `random` module.
# Note: relative date bounds such as '-30d' and 'today' used further down are
# still resolved against the current date, so generated dates move with the
# day the notebook is executed.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)

# Initialize Faker
fake = Faker()

# Define the number of rows for the dataset
num_rows = 100

# Helper function to generate random dates
def random_date(start, end):
    """Return a random date between ``start`` and ``end``.

    Both bounds are forwarded unchanged to ``fake.date_between`` as its
    ``start_date`` and ``end_date`` arguments; the notebook calls this with
    relative strings such as ``'-30d'`` and ``'today'``.
    """
    picked_date = fake.date_between(start_date=start, end_date=end)
    return picked_date

# Helper function to calculate expected delivery date
def expected_delivery_date(shipment_date, min_days=3, max_days=10):
    """Return an expected delivery date a random number of days after shipment.

    Args:
        shipment_date: Value that supports adding a ``timedelta`` (for example
            a ``datetime.date``).
        min_days: Shortest delivery window in days, inclusive. Defaults to 3.
        max_days: Longest delivery window in days, inclusive. Defaults to 10.

    Returns:
        ``shipment_date`` moved forward by a whole number of days chosen with
        ``random.randint(min_days, max_days)``.

    Raises:
        ValueError: If ``min_days`` is greater than ``max_days``.
    """
    if min_days > max_days:
        raise ValueError(
            f"min_days ({min_days}) must not be greater than max_days ({max_days})"
        )
    delivery_days = random.randint(min_days, max_days)
    return shipment_date + timedelta(days=delivery_days)

# Helper function to calculate business days between two dates
def business_days_between(start_date, end_date):
    """Count the weekdays after ``start_date`` up to and including ``end_date``.

    ``start_date`` itself is not counted. A day counts when its ``weekday()``
    is below 5 (Monday through Friday). If ``end_date`` is not after
    ``start_date`` the result is 0.
    """
    total_days = (end_date - start_date).days
    weekday_count = 0
    for offset in range(1, total_days + 1):
        candidate = start_date + timedelta(days=offset)
        if candidate.weekday() < 5:
            weekday_count += 1
    return weekday_count

In [40]:
# Generate initial data
# Columns are added one at a time, in the order they should appear in the
# final table. Empty lists are placeholders for columns filled in afterwards.
row_numbers = range(num_rows)
data = {}

# Order identifiers (sequential)
data["Order ID"] = [f"ORD{1000 + n}" for n in row_numbers]
data["Tracking Number"] = [f"{123456789 + n}" for n in row_numbers]

# Shipping timeline placeholders
data["Shipment Date"] = []  # Placeholder for shipment dates
data["Expected Delivery Date"] = []  # Placeholder for expected delivery dates
data["Delivery Date"] = []  # Placeholder for delivery dates

# Order status, drawn at random per row
data["Status"] = [
    random.choice(["in process", "in transit", "delivered", "pending", "cancel"])
    for _ in row_numbers
]

# Customer details
data["Customer ID"] = [f"CUST{1000 + n}" for n in row_numbers]
data["Customer Name"] = [fake.name() for _ in row_numbers]
data["Customer Email"] = [fake.email() for _ in row_numbers]
data["Customer Phone"] = [fake.phone_number() for _ in row_numbers]

# Orders are placed between 60 and 30 days ago
data["Order Date"] = [random_date('-60d', '-30d') for _ in row_numbers]

# Refund placeholders
data["Refund Requested"] = []  # Placeholder for refund requested
data["Refund Reason"] = []  # Placeholder for refund reasons
data["Refund Status"] = []  # Placeholder for refund statuses
data["Refund Date"] = []  # Placeholder for refund dates

# Random True/False flag per row
data["Notification Status"] = [random.choice([True, False]) for _ in row_numbers]

In [41]:
# Calculate expected delivery and delivery dates based on shipment dates
#
# Each order gets exactly one entry in every derived column, chosen by status:
#   "in process" - nothing shipped; a coin flip marks the address as out of
#                  scope, which records an approved refund with no refund date.
#   "delivered"  - shipped and delivered; a coin flip decides whether a refund
#                  was requested, approved only within 7 days of delivery.
#   "cancel"     - shipped and delivered, always with an approved refund.
#   otherwise    - shipped only ("in transit", "pending"); no delivery, no refund.

DERIVED_COLUMNS = (
    "Shipment Date",
    "Expected Delivery Date",
    "Delivery Date",
    "Refund Requested",
    "Refund Reason",
    "Refund Status",
    "Refund Date",
)
REFUND_REASONS = ["Damaged Item", "Defective Item"]

# Start from empty columns so that re-running this cell rebuilds the rows
# instead of appending a second batch, which would leave these columns longer
# than the rest and break the DataFrame construction.
for column in DERIVED_COLUMNS:
    data[column] = []


def _add_row(shipment=None, expected=None, delivered=None,
             refund_requested=False, refund_reason="", refund_status="",
             refund_date=None):
    """Append one order's values to every derived column.

    The defaults describe an order with no shipment and no refund.
    """
    row = (shipment, expected, delivered,
           refund_requested, refund_reason, refund_status, refund_date)
    for column, value in zip(DERIVED_COLUMNS, row):
        data[column].append(value)


def _refund_receive_date(delivery_date):
    """Return the date an approved refund is received.

    Processing starts 1-3 days after delivery; the refund then arrives after
    as many days as there are business days in the following 7-day window.
    """
    refund_process_date = delivery_date + timedelta(days=random.randint(1, 3))
    # NOTE(review): any 7 consecutive days contain exactly 5 weekdays, so this
    # offset is always 5 calendar days - confirm that is the intended rule.
    waiting_days = business_days_between(
        refund_process_date, refund_process_date + timedelta(days=7)
    )
    return refund_process_date + timedelta(days=waiting_days)


# Evaluate "now" once so every row is compared against the same instant.
now = datetime.now()

for i in range(num_rows):
    status = data["Status"][i]

    if status == "in process":
        if random.choice([True, False]):  # Randomly decide if the address is out of scope
            _add_row(
                refund_requested=True,
                refund_reason="Delivery Region Out of Scope",
                refund_status="approved",
            )
        else:
            _add_row()
        continue

    shipment_date = random_date('-30d', 'today')
    expected_delivery = expected_delivery_date(shipment_date)

    if status == "delivered":
        # NOTE(review): shipment dates run up to today, so this delivery date
        # can lie in the future; days_since_delivery is then negative and the
        # refund below is approved. Confirm whether that is acceptable.
        delivery_date = expected_delivery + timedelta(days=random.randint(0, 10))
        refund_requested = random.choice([True, False])

        if not refund_requested:
            _add_row(shipment_date, expected_delivery, delivery_date)
            continue

        days_since_delivery = (now - datetime.combine(delivery_date, datetime.min.time())).days
        if days_since_delivery <= 7:
            reason = random.choice(REFUND_REASONS)
            _add_row(shipment_date, expected_delivery, delivery_date,
                     True, reason, "approved", _refund_receive_date(delivery_date))
        else:
            _add_row(shipment_date, expected_delivery, delivery_date,
                     True, "Request Exceeded 7 Days", "rejected")
    elif status == "cancel":
        delivery_date = expected_delivery + timedelta(days=random.randint(0, 10))
        reason = random.choice(REFUND_REASONS)
        _add_row(shipment_date, expected_delivery, delivery_date,
                 True, reason, "approved", _refund_receive_date(delivery_date))
    else:
        _add_row(shipment_date, expected_delivery)

In [42]:
# Print the number of entries in each column; pd.DataFrame requires all
# columns to have the same length.
for column_name in data:
    print(f"{column_name}: {len(data[column_name])}")

Order ID: 100
Tracking Number: 100
Shipment Date: 100
Expected Delivery Date: 100
Delivery Date: 100
Status: 100
Customer ID: 100
Customer Name: 100
Customer Email: 100
Customer Phone: 100
Order Date: 100
Refund Requested: 100
Refund Reason: 100
Refund Status: 100
Refund Date: 100
Notification Status: 100


In [43]:
# Create DataFrame
# Each key of `data` becomes a column, in insertion order.
df = pd.DataFrame.from_dict(data)

In [44]:
# Show the generated DataFrame as the cell's output
df

Unnamed: 0,Order ID,Tracking Number,Shipment Date,Expected Delivery Date,Delivery Date,Status,Customer ID,Customer Name,Customer Email,Customer Phone,Order Date,Refund Requested,Refund Reason,Refund Status,Refund Date,Notification Status
0,ORD1000,123456789,2024-05-05,2024-05-10,2024-05-20,cancel,CUST1000,Cassandra Foster,rachel42@example.com,001-910-504-6389x837,2024-03-21,True,Damaged Item,approved,2024-05-26,False
1,ORD1001,123456790,2024-05-15,2024-05-22,,in transit,CUST1001,Peter Turner,princelouis@example.net,(385)402-5026,2024-03-26,False,,,,True
2,ORD1002,123456791,2024-05-07,2024-05-11,,pending,CUST1002,Justin Parrish,pjohnson@example.org,568-540-3172x25216,2024-04-16,False,,,,False
3,ORD1003,123456792,2024-05-13,2024-05-17,,in transit,CUST1003,Sarah Hill,craigwilliams@example.com,001-931-777-1524x0415,2024-04-11,False,,,,True
4,ORD1004,123456793,2024-04-29,2024-05-05,,in transit,CUST1004,Ian White,robertsteele@example.net,888.556.7985,2024-03-29,False,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,ORD1095,123456884,2024-04-29,2024-05-02,,in transit,CUST1095,Brett Terry,uperez@example.com,703-577-3838x0314,2024-04-01,False,,,,False
96,ORD1096,123456885,2024-04-22,2024-05-02,,pending,CUST1096,Seth Delgado,michaelbentley@example.net,+1-517-949-1722x978,2024-04-05,False,,,,False
97,ORD1097,123456886,2024-05-10,2024-05-16,2024-05-20,cancel,CUST1097,Elizabeth Aguilar,gabriel02@example.com,001-889-641-0142x60592,2024-03-23,True,Defective Item,approved,2024-05-27,True
98,ORD1098,123456887,,,,in process,CUST1098,Katherine Fisher,yangkathy@example.com,001-568-524-6821,2024-04-08,False,,,,True


In [46]:
# Save DataFrame to CSV
# The file name is defined once so the path that is written and the path
# reported in the confirmation message cannot drift apart.
OUTPUT_CSV = 'mock_dataset.csv'

# index=False leaves the DataFrame's row index out of the file.
df.to_csv(OUTPUT_CSV, index=False)

print(f"Mock dataset generated and saved as '{OUTPUT_CSV}'.")

Mock dataset generated and saved as 'mock_dataset.csv'.
