<h1><b>Splitting 1 CSV File into Multiple CSV Files (By Column)</h1></b>

In [1]:
# Loading the required functions
import pandas as pd
import os
import numpy as np

In [None]:
file_path = os.path.expanduser(r'~\Documents\GitHub\E-Commerce_SupplyChain_DataAnalysis\data\interim\cleaned_data.csv')
# Import the cleaned data with specified data types
clean_df = pd.read_csv(file_path, dtype={
    'customer_zipcode': str,
    'order_zipcode': str
})


DATABASE STRUCTURE
------------------

The data schema will contain 6 dimension tables and 1 fact table for order information
1) dim_location: Locations (combining customer and order information)
2) dim_customer: Information related to customers
3) dim_product: Details about products
4) dim_category: Product category
5) dim_date: Dates for various events (order, shipping)
6) dim_order_status: Order statuses
7) dim_market: Market information
8) order

In [3]:
## dim_location table
# Extract the required columns for customer and order locations
customer_locations = clean_df[['customer_city', 'customer_state', 'customer_zipcode']].copy()
order_locations = clean_df[['order_city', 'order_state', 'order_zipcode', 'order_region', 'order_country']].copy()
customer_locations.columns = ['city', 'state', 'zipcode']
order_locations.columns = ['city', 'state', 'zipcode', 'region', 'country']

# Add empty columns for region and country in customer_locations
customer_locations['region'] = ''
customer_locations['country'] = ''

# Concatenate customer and order locations
dim_location = pd.concat([customer_locations, order_locations], ignore_index=True)

# Drop duplicates to ensure unique locations
dim_location = dim_location.drop_duplicates().reset_index(drop=True)

# Generate a unique location_id
dim_location['location_id'] = range(1, len(dim_location) + 1)

# Fill region and country with 'USA' for rows with 2-digit state codes
us_state = dim_location['state'].str.len() == 2
dim_location.loc[us_state, 'region'] = 'USA'
dim_location.loc[us_state, 'country'] = 'United States'
dim_location = dim_location[['location_id', 'city', 'state', 'zipcode', 'country','region']]

print(dim_location)

      location_id         city            state zipcode        country  \
0               1       Caguas               PR     725  United States   
1               2     San Jose               CA   95125  United States   
2               3  Los Angeles               CA   90027  United States   
3               4    Tonawanda               NY   14150  United States   
4               5        Miami               FL   33162  United States   
...           ...          ...              ...     ...            ...   
4791         4792      Urganch          Khorezm     NaN     Uzbekistan   
4792         4793  Pico Rivera       California   90660  United States   
4793         4794     Tongling            Anhui     NaN          China   
4794         4795      Liuyang            Hunan     NaN          China   
4795         4796       Nashua  Nuevo Hampshire    3060  United States   

            region  
0              USA  
1              USA  
2              USA  
3              USA  
4     

In [4]:
# dim_customer Table
dim_customer = clean_df.filter(regex='^customer_').copy()
dim_customer = dim_customer.drop_duplicates().reset_index(drop=True)
dim_customer = pd.merge(dim_customer, dim_location[['location_id','city','state','zipcode']], left_on=['customer_city','customer_zipcode','customer_state'], right_on=['city','zipcode','state'])
dim_customer = dim_customer.drop(columns=['customer_email', 'customer_password','customer_city','customer_state','customer_zipcode','city','state','zipcode'])
dim_customer = dim_customer[['customer_id','customer_fname','customer_lname','customer_segment','customer_street','location_id']]

display(dim_customer)

Unnamed: 0,customer_id,customer_fname,customer_lname,customer_segment,customer_street,location_id
0,20755,Cally,Holloway,Consumer,5365 Noble Nectar Island,1
1,19492,Irene,Luna,Consumer,2679 Rustic Loop,1
2,19491,Gillian,Maldonado,Consumer,8510 Round Bear Gate,2
3,19490,Tana,Tate,Home Office,3200 Amber Bend,3
4,19489,Orli,Hendricks,Corporate,8671 Iron Anchor Corners,1
...,...,...,...,...,...,...
20647,18586,Diana,Tran,Home Office,1802 Broad Carrefour,1
20648,16322,Anika,Davenport,Home Office,8284 Colonial Gate Path,1
20649,18579,Yuri,Smith,Home Office,2248 Heather Mews,1
20650,16244,Hyacinth,Witt,Home Office,7247 Honey Prairie Field,1


In [5]:
# dim_category Table
dim_category = clean_df[['category_id', 'category_name']]
dim_category = dim_category.drop_duplicates().sort_values(by='category_id').reset_index(drop=True)
display(dim_category)

Unnamed: 0,category_id,category_name
0,2,Soccer
1,3,Baseball & Softball
2,4,Basketball
3,5,Lacrosse
4,6,Tennis & Racquet
5,7,Hockey
6,9,Cardio Equipment
7,10,Strength Training
8,11,Fitness Accessories
9,12,Boxing & MMA


In [7]:
# dim_department Table
dim_department = clean_df[['department_id', 'department_name']]
dim_department = dim_department.drop_duplicates().sort_values(by='department_id').reset_index(drop=True)    
display(dim_department)

Unnamed: 0,department_id,department_name
0,2,Fitness
1,3,Footwear
2,4,Apparel
3,5,Golf
4,6,Outdoors
5,7,Fan Shop
6,8,Book Shop
7,9,Discs Shop
8,10,Technology
9,11,Pet Shop


In [19]:
# dim_product Table
dim_product = clean_df.filter(regex='^product_').copy()
dim_product = dim_product.drop_duplicates().reset_index(drop=True).sort_values(by='product_card_id')
dim_product.head()

Unnamed: 0,product_card_id,product_category_id,product_image,product_name,product_price
87,19,2,http://images.acmesports.sports/Nike+Men%27s+F...,Nike Men's Fingertrap Max Training Shoe,124.989998
22,24,2,http://images.acmesports.sports/Elevation+Trai...,Elevation Training Mask 2.0,79.989998
83,35,3,http://images.acmesports.sports/adidas+Brazuca...,adidas Brazuca 2014 Official Match Ball,159.990005
51,37,3,http://images.acmesports.sports/adidas+Kids%27...,adidas Kids' F5 Messi FG Soccer Cleat,34.990002
14,44,3,http://images.acmesports.sports/adidas+Men%27s...,adidas Men's F10 Messi TRX FG Soccer Cleat,59.990002


In [29]:
# Orders Table
orders_fact = clean_df.drop(columns=['category_id','category_name', 'department_name','product_name','product_image','product_card_id','product_category_id','product_price'])
orders_fact = pd.merge(orders_fact, dim_location, how='left',
                       left_on=['order_city','order_zipcode','order_state','order_country','order_region'], 
                       right_on=['city','zipcode','state','country','region'])
orders_fact = orders_fact.drop(columns=['order_city','order_state','order_zipcode','order_country','order_region','city','state','zipcode','country','region'])
orders_fact = orders_fact.drop(columns=['customer_id','customer_city','customer_state','customer_zipcode','customer_fname','customer_lname','customer_segment','customer_street','customer_email','customer_password'])
orders_fact = orders_fact.drop_duplicates().sort_values(by='order_id').reset_index(drop=True)

display(orders_fact)

Unnamed: 0,type,days_for_shipping_(real),days_for_shipment_(scheduled),benefit_per_order,sales_per_customer,delivery_status,late_delivery_risk,department_id,latitude,longitude,...,order_item_product_price,order_item_profit_ratio,order_item_quantity,sales,order_item_total,order_profit_per_order,order_status,shipping_date_(dateorders),shipping_mode,location_id
0,CASH,2,4,88.790001,239.979996,Advance shipping,0,7,35.776661,-81.362625,...,299.980011,0.37,1,299.980011,239.979996,88.790001,CLOSED,2015-01-03 00:00:00,Standard Class,1054
1,PAYMENT,3,4,91.180000,193.990005,Advance shipping,0,7,41.832722,-87.980484,...,199.990005,0.47,1,199.990005,193.990005,91.180000,PENDING_PAYMENT,2015-01-04 00:21:00,Standard Class,1050
2,PAYMENT,3,4,68.250000,227.500000,Advance shipping,0,5,41.832722,-87.980484,...,50.000000,0.30,5,250.000000,227.500000,68.250000,PENDING_PAYMENT,2015-01-04 00:21:00,Standard Class,1050
3,PAYMENT,3,4,36.470001,107.889999,Advance shipping,0,4,41.832722,-87.980484,...,129.990005,0.34,1,129.990005,107.889999,36.470001,PENDING_PAYMENT,2015-01-04 00:21:00,Standard Class,1050
4,CASH,5,4,33.590000,159.940002,Late delivery,1,7,29.520010,-98.637413,...,49.980000,0.21,4,199.919998,159.940002,33.590000,CLOSED,2015-01-06 01:03:00,Standard Class,1050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180514,TRANSFER,6,4,-107.959999,161.869995,Late delivery,1,4,18.273024,-66.037056,...,215.820007,-0.67,1,215.820007,161.869995,-107.959999,PROCESSING,2018-02-06 22:14:00,Standard Class,997
180515,DEBIT,2,4,-126.559998,172.660004,Advance shipping,0,4,41.608639,-88.202042,...,215.820007,-0.73,1,215.820007,172.660004,-126.559998,COMPLETE,2018-02-02 22:35:00,Standard Class,997
180516,DEBIT,3,4,91.250000,314.640015,Advance shipping,0,2,18.251453,-66.037056,...,327.750000,0.29,1,327.750000,314.640015,91.250000,COMPLETE,2018-02-03 22:56:00,Standard Class,997
180517,PAYMENT,4,4,1.530000,10.910000,Shipping on time,0,7,41.830791,-87.802979,...,11.540000,0.14,1,11.540000,10.910000,1.530000,PENDING_PAYMENT,2018-02-04 23:17:00,Standard Class,997


In [19]:
# Products Table
products_columns = ['Product Card Id', 'Product Category Id', 'Product Name', 'Product Description', 'Product Image', 'Product Price', 'Product Status']
products_df = data_df[products_columns]
display(products_df)

Unnamed: 0,Product Card Id,Product Category Id,Product Name,Product Description,Product Image,Product Price,Product Status
0,1360,73,Smart watch,,http://images.acmesports.sports/Smart+watch,327.750000,0
1,1360,73,Smart watch,,http://images.acmesports.sports/Smart+watch,327.750000,0
2,1360,73,Smart watch,,http://images.acmesports.sports/Smart+watch,327.750000,0
3,1360,73,Smart watch,,http://images.acmesports.sports/Smart+watch,327.750000,0
4,1360,73,Smart watch,,http://images.acmesports.sports/Smart+watch,327.750000,0
...,...,...,...,...,...,...,...
180514,1004,45,Field & Stream Sportsman 16 Gun Fire Safe,,http://images.acmesports.sports/Field+%26+Stre...,399.980011,0
180515,1004,45,Field & Stream Sportsman 16 Gun Fire Safe,,http://images.acmesports.sports/Field+%26+Stre...,399.980011,0
180516,1004,45,Field & Stream Sportsman 16 Gun Fire Safe,,http://images.acmesports.sports/Field+%26+Stre...,399.980011,0
180517,1004,45,Field & Stream Sportsman 16 Gun Fire Safe,,http://images.acmesports.sports/Field+%26+Stre...,399.980011,0


In [5]:
# OrdersProcessing Table
ordersprocessing_columns = ['Order Id', 'Type', 'Days for shipping (real)', 'Days for shipment (scheduled)', 'Benefit per order', 'Shipping Mode', 'Delivery Status', 'Order Status', 'Late_delivery_risk']
ordersprocessing_df = data_df[ordersprocessing_columns]
display(ordersprocessing_df)                    

Unnamed: 0,Order Id,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Shipping Mode,Delivery Status,Order Status,Late_delivery_risk
0,77202,DEBIT,3,4,91.250000,Standard Class,Advance shipping,COMPLETE,0
1,75939,TRANSFER,5,4,-249.089996,Standard Class,Late delivery,PENDING,1
2,75938,CASH,4,4,-247.779999,Standard Class,Shipping on time,CLOSED,0
3,75937,DEBIT,3,4,22.860001,Standard Class,Advance shipping,COMPLETE,0
4,75936,PAYMENT,2,4,134.210007,Standard Class,Advance shipping,PENDING_PAYMENT,0
...,...,...,...,...,...,...,...,...,...
180514,26043,CASH,4,4,40.000000,Standard Class,Shipping on time,CLOSED,0
180515,26037,DEBIT,3,2,-613.770019,Second Class,Late delivery,COMPLETE,1
180516,26024,TRANSFER,5,4,141.110001,Standard Class,Late delivery,PENDING,1
180517,26022,PAYMENT,3,4,186.229996,Standard Class,Advance shipping,PENDING_PAYMENT,0


In [26]:
# Replace 'output_path' with the desired output directory
output_path = 'C:\\Users\\LENOVO\\Documents\\Personal Project\\Data Analysis on DataCo Supply Chain Management System'

# Save each DataFrame to a separate CSV file
orders_df.to_csv('orders.csv', index=False)
order_items_df.to_csv('order_items.csv', index=False)
customers_df.to_csv('customers.csv', index=False)
products_df.to_csv('products.csv', index=False)
categories_df.to_csv('categories.csv', index=False)
departments_df.to_csv('departments.csv', index=False)