In [2]:
import pandas as pd
import numpy as np

In [3]:
# Raw inputs, read from the notebook's working directory.
# Bytes that cannot be decoded in the supply-chain file are skipped
# (encoding_errors='ignore') rather than raising.
df = pd.read_csv(
    "DataCoSupplyChainDataset.csv",
    encoding_errors='ignore',
)
# Product listings used to build the Products table.
amazon = pd.read_csv("amazon.csv")
clothing = pd.read_csv("clothing.csv")

Products Table

In [4]:
def _rupees_to_usd(prices, rate=0.012):
    """Convert rupee price strings (e.g. '₹1,099') to USD amounts.

    Parameters
    ----------
    prices : pd.Series
        Raw price column; values are cast to str, then the '₹' sign and
        thousands separators (',') are stripped.
    rate : float, default 0.012
        Multiplier applied to the rupee amount.

    Returns
    -------
    pd.Series
        Numeric prices multiplied by ``rate`` and rounded to 2 decimals.
        Missing inputs (which become the string "nan" after the str cast)
        come back as NaN so the caller can drop them.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` if a value is still non-numeric
        after stripping.
    """
    cleaned = (
        prices.astype(str)
        .str.replace('₹', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    # astype(str) turns missing values into the literal "nan"; map them back to
    # real NaN instead of relying on a chained in-place replace on a column.
    cleaned = cleaned.where(cleaned != "nan")
    return (pd.to_numeric(cleaned) * rate).round(2)


# ---- Amazon listings ---------------------------------------------------------
# .copy() so the column assignments below act on an independent frame
# (avoids SettingWithCopyWarning / silent no-ops on a slice).
amazon = amazon[['product_name', 'category', 'actual_price']].copy()

# converting prices from rupees to usd
amazon['actual_price'] = _rupees_to_usd(amazon['actual_price'])

# Keep only the top-level category (text before the first '|').
amazon['category'] = amazon['category'].str.split('|').str[0]

# Remove categories that are not central to our analysis (We only want Home Appliances, Electronics, Clothing)
# NOTE(review): "Officeamazon" looks like a find/replace artefact of "OfficeProducts"; kept as-is since it is harmless.
remove_set = {"MusicalInstruments", "Officeamazon", "Toys&Games", "Car&Motorbike", "Health&PersonalCare", "OfficeProducts"}
amazon = amazon[~amazon['category'].isin(remove_set)].copy()

# We will only have two main categories: Home appliances and Electronics
amazon['category'] = amazon['category'].replace({
    "Home&Kitchen": "Home Appliances",
    "Computers&Accessories": "Electronics",
    "HomeImprovement": "Home Appliances",
}).astype('category')

# dropna()/drop_duplicates() return new frames; the results must be assigned,
# otherwise the calls have no effect.
amazon = amazon.dropna().drop_duplicates()

amazon = amazon.rename(columns={'product_name': 'Product Name',
                                'category': 'Category',
                                'actual_price': 'Price'})

# ---- Clothing listings -------------------------------------------------------
clothing = clothing[['name', 'actual_price', 'sub_category']].copy()

# converting prices from rupees to usd
clothing['actual_price'] = _rupees_to_usd(clothing['actual_price'])

# Drop rows with any missing value (including a missing price), then exact duplicates.
clothing = clothing.dropna().drop_duplicates()

clothing = clothing.rename(columns={'name': 'Product Name',
                                    'actual_price': 'Price',
                                    'sub_category': 'Category'})

# ---- Combined Products table -------------------------------------------------
# row bind the 2 dataframes together
products = pd.concat([amazon, clothing], axis=0, ignore_index=True)

# use index as productIDs (1-based, since ignore_index=True restarts the index at 0)
products['ProductID'] = products.index + 1
products = products[['ProductID', 'Product Name', 'Category', 'Price']]  # reorder columns

# The frame's index is written as an extra leading column, as before.
products.to_csv("Products Table.csv")

Orders Table

In [5]:
# Columns taken from the raw supply-chain frame for the Orders table.
raw_order_columns = [
    'Order Id',
    'order date (DateOrders)',
    'shipping date (DateOrders)',
    'Days for shipping (real)',
    'Days for shipment (scheduled)',
    'Late_delivery_risk',
    'Shipping Mode',
    'Order Item Discount Rate',
]

# Raw column name -> name used in the exported table.
order_column_names = {
    "Order Id": "OrderID",
    "order date (DateOrders)": "Order Date",
    "shipping date (DateOrders)": "Shipping Date",
    "Days for shipping (real)": "Actual days for shipping",
    "Days for shipment (scheduled)": "Scheduled days for shipping",
    "Late_delivery_risk": "Late Delivery Risk",
    "Order Item Discount Rate": "Discount Percentage",
}

# Column order of the exported table ('Shipping Mode' is not exported).
final_order_columns = [
    'OrderID', 'ProductID', 'CustomerID',
    'Order Date', 'Shipping Date',
    'Actual days for shipping', 'Scheduled days for shipping',
    'Late Delivery Risk', 'Discount Percentage',
    'Returns', 'Promotional Period',
]

orders = df[raw_order_columns].drop_duplicates()
orders.isna().sum()  # per-column missing counts (author's note: no missing values)
orders = orders.rename(columns=order_column_names)

# Synthetic columns. The seed is fixed and the draws happen in this exact
# order, so the generated values are reproducible for a given input.
np.random.seed(42)
n_orders = len(orders)
orders["CustomerID"] = np.random.randint(0, 100000, size=n_orders)
orders["Returns"] = np.random.choice([0, 1], size=n_orders, p=[0.9, 0.1])  # 1 drawn with probability 0.1
orders["ProductID"] = np.random.choice(products['ProductID'], size=n_orders, replace=True)
orders['Promotional Period'] = np.random.choice([0, 1], size=n_orders)

# Reorder, renumber rows from 0, and export.
orders = orders[final_order_columns].reset_index(drop=True)
orders.to_csv("Orders Table.csv")

Suppliers Table

In [None]:
# Suppliers table: one row per product, each given a random SupplierID in 0..24
# (randint's upper bound is exclusive).
# .copy() makes `suppliers` an independent frame, so adding a column below does
# not trigger SettingWithCopyWarning or depend on slice/view semantics.
suppliers = products[["ProductID"]].copy()

# Draws from NumPy's global RNG. No seed is set in this cell, so the values
# depend on the generator state left by the cells executed before it.
suppliers["SupplierID"] = np.random.randint(0, 25, size = len(suppliers))

suppliers.to_csv("Suppliers Table.csv")