# Simulated Supply Chain DataFrame

In [43]:
import numpy as np
import pandas as pd
from faker import Faker
import sys

In [44]:
import os

In [50]:

# Assuming 'scripts' is in the same parent directory as 'notebooks'
sys.path.append(os.path.abspath("../scripts"))

from data_cleaning import load_data, clean_data

In [46]:
# Initialize Faker
fake = Faker()

# Define the number of records you want to simulate
num_records = 10000  # Increased to 10,000 for larger dataset

# Define lists for categorical variables
weather_conditions = ["Clear", "Light Rain", "Heavy Rain", "Snow", "Fog"]
traffic_conditions = ["Light", "Moderate", "Heavy", "Severe"]
experience_levels = ["Junior", "Intermediate", "Senior"]
delivery_windows = ["Morning", "Afternoon", "Evening", "Overnight"]
package_types = ["Standard", "Fragile", "Perishable", "Oversized"]
route_types = ["Interstate", "Urban", "Suburban"]
truck_types = ["Box Truck", "Semi", "Flatbed"]
satisfaction_levels = ["Very Satisfied", "Satisfied", "Neutral", "Dissatisfied", "Very Dissatisfied"]
fuel_types = ["Diesel", "Gasoline"]

# Generate a range of dates
date_range = pd.date_range(start="2023-01-01", end="2023-12-31", freq='D')

# Generate unique driver IDs
num_drivers = 500  # Assuming 500 unique drivers
driver_ids = [fake.uuid4() for _ in range(num_drivers)]

# Generate random data
data = {
    "Route ID": [fake.uuid4() for _ in range(num_records)],
    "Driver ID": np.random.choice(driver_ids, num_records),
    "Delivery Time (hours)": np.random.uniform(1, 10, num_records),
    "Date": np.random.choice(date_range, num_records),
    "Fuel Costs (USD)": np.random.uniform(50, 1000, num_records),
    "Delivery Start Time": [fake.time(pattern="%H:%M:%S") for _ in range(num_records)],
    "Distance Traveled (miles)": np.random.uniform(50, 3000, num_records),
    "Estimated Distance (miles)": np.random.uniform(50, 3000, num_records),
    "Weather Conditions": np.random.choice(weather_conditions, num_records),
    "Traffic Conditions": np.random.choice(traffic_conditions, num_records),
    "Driver Ratings": np.random.uniform(1, 5, num_records),
    "Customer Satisfaction": np.random.choice(satisfaction_levels, num_records),
    "Delays (hours)": np.random.uniform(0, 5, num_records),
    "Warehouse Storage Costs (USD)": np.random.uniform(100, 500, num_records),
    "Truck Maintenance Costs (USD)": np.random.uniform(500, 2000, num_records),
    "Load Type": np.random.choice(package_types, num_records),
    "Load Weight (tons)": np.random.uniform(0.5, 20, num_records),
    "Route Type": np.random.choice(route_types, num_records),
    "Truck Type": np.random.choice(truck_types, num_records),
    "Driver Experience": np.random.choice(experience_levels, num_records),
    "Delivery Window": np.random.choice(delivery_windows, num_records),
    "Truck Condition": np.random.randint(1, 6, num_records),  # Rating from 1 to 5
    "Labor Costs (USD)": np.random.uniform(20, 200, num_records),
    "Fuel Type": np.random.choice(fuel_types, num_records),
    "Toll Costs (USD)": np.random.uniform(0, 50, num_records),
    "Parking Costs (USD)": np.random.uniform(0, 30, num_records),
    "Idle Time (hours)": np.random.uniform(0, 2, num_records)
}

# Additional calculated columns
data["Distance Difference (miles)"] = data["Distance Traveled (miles)"] - data["Estimated Distance (miles)"]
data["Cost per Gallon (USD)"] = np.where(np.array(data["Fuel Type"]) == "Diesel", 3.5, 3.0)
data["Total Fuel Cost (USD)"] = data["Distance Traveled (miles)"] / np.random.uniform(5, 10) * data["Cost per Gallon (USD)"]
data["Insurance Costs (USD)"] = np.where(np.array(data["Load Type"]) == "Hazardous", np.random.uniform(50, 150, num_records), np.random.uniform(20, 100, num_records))
data["Breakdown Repair Costs (USD)"] = np.where(np.array(data["Truck Condition"]) <= 2, np.random.uniform(200, 1000, num_records), 0)
data["Overtime Labor Costs (USD)"] = np.where(data["Delivery Time (hours)"] > 8, (data["Delivery Time (hours)"] - 8) * np.random.uniform(20, 40), 0)
data["Fuel Surcharge (USD)"] = data["Fuel Costs (USD)"] * np.random.uniform(0.05, 0.15)
data["Idle Cost (USD)"] = data["Idle Time (hours)"] * data["Cost per Gallon (USD)"] * 0.5  # Assuming half a gallon per hour idling

# Calculate total operational costs
data["Total Operational Cost (USD)"] = (
    data["Fuel Costs (USD)"] +
    data["Toll Costs (USD)"] +
    data["Insurance Costs (USD)"] +
    data["Parking Costs (USD)"] +
    data["Breakdown Repair Costs (USD)"] +
    data["Overtime Labor Costs (USD)"] +
    data["Fuel Surcharge (USD)"] +
    data["Idle Cost (USD)"]
)

# Additional metrics
data["Fuel Cost per Mile"] = data["Fuel Costs (USD)"] / data["Distance Traveled (miles)"]
data["Delivery Efficiency Score"] = (
    1 / (1 + data["Fuel Cost per Mile"]) *
    (1 / (1 + data["Delays (hours)"])) *
    (1 / (1 + data["Load Weight (tons)"]))
)

# Create a DataFrame
logistics_df = pd.DataFrame(data)

# Display the first few rows to verify
print(logistics_df.head())

                               Route ID                             Driver ID  \
0  c405a09a-7c1c-4cbb-b044-8d93ec12fd41  cd5f2588-0177-4ecf-b43b-64e4cffdae77   
1  1fedfb9c-4dd4-4866-ba9b-249b6c28a57c  cef901ab-9579-4d33-8a47-f2d772f71c72   
2  5870f02e-e6c5-47a2-b17c-2d9eb0a6b371  0ee00c87-6ef8-4895-b25a-ef185b8f1555   
3  7d4a8625-bd3a-42e8-ba14-1770b143f82d  49a5eacb-02d2-408b-b79d-9d3f447d2e37   
4  a4f49219-8ed9-49cd-a20e-6f82cc0e9554  b3ae425a-f6f9-41c6-ac70-672e7a7e5bcf   

   Delivery Time (hours)       Date  Fuel Costs (USD) Delivery Start Time  \
0               3.661775 2023-04-24         73.167465            18:11:58   
1               9.159558 2023-08-08        639.179375            00:56:12   
2               1.372942 2023-08-22         92.633480            02:29:03   
3               3.855411 2023-08-24        538.893607            21:07:08   
4               2.194644 2023-08-10        810.358195            11:36:06   

   Distance Traveled (miles)  Estimated Distance (

In [47]:
# Define paths to save/load data in the `data` directory
raw_data_path = '../data/raw/logistics_df.csv'
raw_excel_path = '../data/raw/logistics_df.xlsx'
cleaned_data_path = '../data/processed/cleaned_logistics_data.csv'
engineered_data_path = '../data/processed/engineered_data.csv'

# Ensure the `raw` directory exists within `data`
raw_directory = os.path.dirname(raw_data_path)
if not os.path.exists(raw_directory):
    os.makedirs(raw_directory)
    print(f"Directory created at {raw_directory}")

# Example: Saving a DataFrame to the `data/raw` directory# Sample data
logistics_df.to_csv(raw_data_path, index=False)
print(f"Data saved successfully to {raw_data_path}")

# Save the DataFrame to an Excel file
logistics_df.to_excel(raw_excel_path, index=False)  # index=False to avoid saving the index as a column

Data saved successfully to ../data/raw/logistics_df.csv


In [49]:
# Load the dataset
logistics_df = pd.read_csv('../data/raw/logistics_df.csv')

# Display first few rows
logistics_df.head()

Unnamed: 0,Route ID,Driver ID,Delivery Time (hours),Date,Fuel Costs (USD),Delivery Start Time,Distance Traveled (miles),Estimated Distance (miles),Weather Conditions,Traffic Conditions,...,Cost per Gallon (USD),Total Fuel Cost (USD),Insurance Costs (USD),Breakdown Repair Costs (USD),Overtime Labor Costs (USD),Fuel Surcharge (USD),Idle Cost (USD),Total Operational Cost (USD),Fuel Cost per Mile,Delivery Efficiency Score
0,c405a09a-7c1c-4cbb-b044-8d93ec12fd41,cd5f2588-0177-4ecf-b43b-64e4cffdae77,3.661775,2023-04-24,73.167465,18:11:58,1453.29031,1284.660045,Snow,Moderate,...,3.5,612.042183,82.188272,658.105936,0.0,7.739251,1.19769,854.69569,0.050346,0.035347
1,1fedfb9c-4dd4-4866-ba9b-249b6c28a57c,cef901ab-9579-4d33-8a47-f2d772f71c72,9.159558,2023-08-08,639.179375,00:56:12,751.259132,2132.837081,Snow,Moderate,...,3.0,271.188926,31.393083,699.534174,29.666102,67.608867,0.909051,1531.803244,0.850811,0.006516
2,5870f02e-e6c5-47a2-b17c-2d9eb0a6b371,0ee00c87-6ef8-4895-b25a-ef185b8f1555,1.372942,2023-08-22,92.63348,02:29:03,2666.204691,2661.824609,Light Rain,Light,...,3.5,1122.851867,54.905655,999.656631,0.0,9.798258,2.693096,1178.100685,0.034744,0.060085
3,7d4a8625-bd3a-42e8-ba14-1770b143f82d,49a5eacb-02d2-408b-b79d-9d3f447d2e37,3.855411,2023-08-24,538.893607,21:07:08,1220.239053,691.805233,Snow,Severe,...,3.0,440.480927,39.639937,825.812869,0.0,57.001192,2.442666,1495.349013,0.44163,0.047148
4,a4f49219-8ed9-49cd-a20e-6f82cc0e9554,b3ae425a-f6f9-41c6-ac70-672e7a7e5bcf,2.194644,2023-08-10,810.358195,11:36:06,2686.870337,2337.833994,Heavy Rain,Heavy,...,3.5,1131.555047,39.60033,769.116645,0.0,85.715218,2.511495,1727.060317,0.301599,0.017343
