<h1><b>Splitting 1 CSV File into Multiple CSV Files (By Column)</h1></b>

In [13]:
# Loading the required functions
import pandas as pd
import os
import numpy as np

In [14]:
file_path = os.path.expanduser(r'~\Documents\GitHub\E-Commerce_SupplyChain_DataAnalysis\data\interim\cleaned_data.csv')
# Import the cleaned data with specified data types
clean_df = pd.read_csv(file_path, dtype={
    'customer_zipcode': str,
    'order_zipcode': str
})


DATABASE STRUCTURE
------------------

The data schema will contain 6 dimension tables and 1 fact table for order information
1) dim_location: Locations (combining customer and order information)
2) dim_customer: Information related to customers
3) dim_product: Details about products
4) dim_category: Product category
5) dim_date: Dates for various events (order, shipping)
6) dim_order_status: Order statuses
7) dim_market: Market information
8) order

In [15]:
## dim_location table
# Extract the required columns for customer and order locations
dim_location = clean_df[['order_city', 'order_state', 'order_zipcode', 'order_region', 'order_country','market']].copy()

# Drop duplicates to ensure unique locations then Generate a unique location_id
dim_location = dim_location.drop_duplicates().reset_index(drop=True)
dim_location['location_id'] = range(1, len(dim_location) + 1)
display(dim_location)

Unnamed: 0,order_city,order_state,order_zipcode,order_region,order_country,market,location_id
0,Bekasi,Java Occidental,,Southeast Asia,Indonesia,Pacific Asia,1
1,Bikaner,Rajastán,,South Asia,India,Pacific Asia,2
2,Townsville,Queensland,,Oceania,Australia,Pacific Asia,3
3,Toowoomba,Queensland,,Oceania,Australia,Pacific Asia,4
4,Guangzhou,Guangdong,,Eastern Asia,China,Pacific Asia,5
...,...,...,...,...,...,...,...
3795,Urganch,Khorezm,,Central Asia,Uzbekistan,Pacific Asia,3796
3796,Pico Rivera,California,90660,West of USA,United States,North America,3797
3797,Tongling,Anhui,,Eastern Asia,China,Pacific Asia,3798
3798,Liuyang,Hunan,,Eastern Asia,China,Pacific Asia,3799


In [16]:
# dim_customer Table
dim_customer = clean_df.filter(regex='^customer_').copy()
dim_customer = dim_customer.drop_duplicates().reset_index(drop=True)

display(dim_customer)

Unnamed: 0,customer_city,customer_email,customer_fname,customer_id,customer_lname,customer_password,customer_segment,customer_state,customer_street,customer_zipcode
0,Caguas,XXXXXXXXX,Cally,20755,Holloway,XXXXXXXXX,Consumer,PR,5365 Noble Nectar Island,725
1,Caguas,XXXXXXXXX,Irene,19492,Luna,XXXXXXXXX,Consumer,PR,2679 Rustic Loop,725
2,San Jose,XXXXXXXXX,Gillian,19491,Maldonado,XXXXXXXXX,Consumer,CA,8510 Round Bear Gate,95125
3,Los Angeles,XXXXXXXXX,Tana,19490,Tate,XXXXXXXXX,Home Office,CA,3200 Amber Bend,90027
4,Caguas,XXXXXXXXX,Orli,19489,Hendricks,XXXXXXXXX,Corporate,PR,8671 Iron Anchor Corners,725
...,...,...,...,...,...,...,...,...,...,...
20647,Caguas,XXXXXXXXX,Diana,18586,Tran,XXXXXXXXX,Home Office,PR,1802 Broad Carrefour,725
20648,Caguas,XXXXXXXXX,Anika,16322,Davenport,XXXXXXXXX,Home Office,PR,8284 Colonial Gate Path,725
20649,Caguas,XXXXXXXXX,Yuri,18579,Smith,XXXXXXXXX,Home Office,PR,2248 Heather Mews,725
20650,Caguas,XXXXXXXXX,Hyacinth,16244,Witt,XXXXXXXXX,Home Office,PR,7247 Honey Prairie Field,725


In [17]:
# dim_category Table
dim_category = clean_df[['category_id', 'category_name']]
dim_category = dim_category.drop_duplicates().sort_values(by='category_id').reset_index(drop=True)
display(dim_category)

Unnamed: 0,category_id,category_name
0,2,Soccer
1,3,Baseball & Softball
2,4,Basketball
3,5,Lacrosse
4,6,Tennis & Racquet
5,7,Hockey
6,9,Cardio Equipment
7,10,Strength Training
8,11,Fitness Accessories
9,12,Boxing & MMA


In [18]:
# dim_product Table
dim_product = clean_df.filter(regex='^product_').copy()
dim_product = dim_product.drop_duplicates().reset_index(drop=True).sort_values(by='product_card_id')
display(dim_product)

Unnamed: 0,product_card_id,product_category_id,product_image,product_name,product_price
87,19,2,http://images.acmesports.sports/Nike+Men%27s+F...,Nike Men's Fingertrap Max Training Shoe,124.989998
22,24,2,http://images.acmesports.sports/Elevation+Trai...,Elevation Training Mask 2.0,79.989998
83,35,3,http://images.acmesports.sports/adidas+Brazuca...,adidas Brazuca 2014 Official Match Ball,159.990005
51,37,3,http://images.acmesports.sports/adidas+Kids%27...,adidas Kids' F5 Messi FG Soccer Cleat,34.990002
14,44,3,http://images.acmesports.sports/adidas+Men%27s...,adidas Men's F10 Messi TRX FG Soccer Cleat,59.990002
...,...,...,...,...,...
75,1359,72,http://images.acmesports.sports/Adult+dog+supp...,Adult dog supplies,84.400002
0,1360,73,http://images.acmesports.sports/Smart+watch,Smart watch,327.750000
114,1361,74,http://images.acmesports.sports/Toys,Toys,11.540000
79,1362,75,http://images.acmesports.sports/Fighting+video...,Fighting video games,39.750000


In [19]:
# dim_department Table
dim_department = clean_df[['department_id', 'department_name']]
dim_department = dim_department.drop_duplicates().sort_values(by='department_id').reset_index(drop=True)    
display(dim_department)

Unnamed: 0,department_id,department_name
0,2,Fitness
1,3,Footwear
2,4,Apparel
3,5,Golf
4,6,Outdoors
5,7,Fan Shop
6,8,Book Shop
7,9,Discs Shop
8,10,Technology
9,11,Pet Shop


In [20]:
# Orders Table
orders_fact = clean_df.drop(columns=['category_id','category_name', 'department_name','product_name','product_image','product_card_id','product_category_id','product_price','order_item_product_price'])
orders_fact = pd.merge(orders_fact, dim_location, how='left',
                       on=['order_city','order_zipcode','order_state','order_country','order_region','market'])
orders_fact = orders_fact.drop(columns=['order_city','order_state','order_zipcode','order_country','order_region','market'])
orders_fact = orders_fact.drop(columns=['customer_id','customer_city','customer_state','customer_zipcode','customer_fname','customer_lname','customer_segment','customer_street','customer_email','customer_password'])
orders_fact = orders_fact.drop_duplicates().sort_values(by='order_id').reset_index(drop=True)

display(orders_fact)

Unnamed: 0,type,days_for_shipping_(real),days_for_shipment_(scheduled),benefit_per_order,sales_per_customer,delivery_status,late_delivery_risk,department_id,latitude,longitude,...,order_item_id,order_item_profit_ratio,order_item_quantity,sales,order_item_total,order_profit_per_order,order_status,shipping_date_(dateorders),shipping_mode,location_id
0,CASH,2,4,88.790001,239.979996,Advance shipping,0,7,35.776661,-81.362625,...,1,0.37,1,299.980011,239.979996,88.790001,CLOSED,2015-01-03 00:00:00,Standard Class,58
1,PAYMENT,3,4,36.470001,107.889999,Advance shipping,0,4,41.832722,-87.980484,...,4,0.34,1,129.990005,107.889999,36.470001,PENDING_PAYMENT,2015-01-04 00:21:00,Standard Class,54
2,PAYMENT,3,4,68.250000,227.500000,Advance shipping,0,5,41.832722,-87.980484,...,3,0.30,5,250.000000,227.500000,68.250000,PENDING_PAYMENT,2015-01-04 00:21:00,Standard Class,54
3,PAYMENT,3,4,91.180000,193.990005,Advance shipping,0,7,41.832722,-87.980484,...,2,0.47,1,199.990005,193.990005,91.180000,PENDING_PAYMENT,2015-01-04 00:21:00,Standard Class,54
4,CASH,5,4,33.590000,159.940002,Late delivery,1,7,29.520010,-98.637413,...,8,0.21,4,199.919998,159.940002,33.590000,CLOSED,2015-01-06 01:03:00,Standard Class,54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180514,TRANSFER,6,4,-107.959999,161.869995,Late delivery,1,4,18.273024,-66.037056,...,180515,-0.67,1,215.820007,161.869995,-107.959999,PROCESSING,2018-02-06 22:14:00,Standard Class,1
180515,DEBIT,2,4,-126.559998,172.660004,Advance shipping,0,4,41.608639,-88.202042,...,180516,-0.73,1,215.820007,172.660004,-126.559998,COMPLETE,2018-02-02 22:35:00,Standard Class,1
180516,DEBIT,3,4,91.250000,314.640015,Advance shipping,0,2,18.251453,-66.037056,...,180517,0.29,1,327.750000,314.640015,91.250000,COMPLETE,2018-02-03 22:56:00,Standard Class,1
180517,PAYMENT,4,4,1.530000,10.910000,Shipping on time,0,7,41.830791,-87.802979,...,180518,0.14,1,11.540000,10.910000,1.530000,PENDING_PAYMENT,2018-02-04 23:17:00,Standard Class,1


In [14]:
# Replace 'output_path' with the desired output directory
output_path = r'~\Documents\GitHub\E-Commerce_SupplyChain_DataAnalysis\data\processed'

# Save each DataFrame to a separate CSV file
dim_location.to_csv(os.path.join(output_path, 'dim_location.csv'), index=False)
dim_customer.to_csv(os.path.join(output_path, 'dim_customer.csv'), index=False)
dim_category.to_csv(os.path.join(output_path, 'dim_category.csv'), index=False)
dim_department.to_csv(os.path.join(output_path, 'dim_department.csv'), index=False)
dim_product.to_csv(os.path.join(output_path, 'dim_product.csv'), index=False)
orders_fact.to_csv(os.path.join(output_path, 'orders_fact.csv'), index=False)