In [1]:
# Imoports
import numpy as np
import pandas as pd
import chardet
from pathlib import Path

In [2]:
# Automatically detect the encoding of the CSV.
with open('Sample-Superstore.csv', 'rb') as f:
    result = chardet.detect(f.read())

print(result['encoding'])

Windows-1252


In [3]:
# Read the initila data.
df = pd.read_csv('Sample-Superstore.csv', encoding=result['encoding'])

In [4]:
# Get some basic info about the data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9994 non-null   int64  
 1   Order ID       9994 non-null   object 
 2   Order Date     9994 non-null   object 
 3   Ship Date      9994 non-null   object 
 4   Ship Mode      9994 non-null   object 
 5   Customer ID    9994 non-null   object 
 6   Customer Name  9994 non-null   object 
 7   Segment        9994 non-null   object 
 8   Country        9994 non-null   object 
 9   City           9994 non-null   object 
 10  State          9994 non-null   object 
 11  Postal Code    9994 non-null   int64  
 12  Region         9994 non-null   object 
 13  Product ID     9994 non-null   object 
 14  Category       9994 non-null   object 
 15  Sub-Category   9994 non-null   object 
 16  Product Name   9994 non-null   object 
 17  Sales          9994 non-null   float64
 18  Quantity

In [5]:
# Make sure there is no NA values.
df.isna().any().any()

False

In [6]:
# Make sure there is no NULL values.
df.isnull().any().any()

False

In [7]:
# Check the values in the data.
df.describe()

Unnamed: 0,Row ID,Postal Code,Sales,Quantity,Discount,Profit
count,9994.0,9994.0,9994.0,9994.0,9994.0,9994.0
mean,4997.5,55190.379428,229.858001,3.789574,0.156203,28.656896
std,2885.163629,32063.69335,623.245101,2.22511,0.206452,234.260108
min,1.0,1040.0,0.444,1.0,0.0,-6599.978
25%,2499.25,23223.0,17.28,2.0,0.0,1.72875
50%,4997.5,56430.5,54.49,3.0,0.2,8.6665
75%,7495.75,90008.0,209.94,5.0,0.2,29.364
max,9994.0,99301.0,22638.48,14.0,0.8,8399.976


In [8]:
# Are there any fully duplicated lines?
df.duplicated().any()

False

In [9]:
# Convert 'Order Date' and 'Ship Date' to date objects
df['Order Date'] = pd.to_datetime(df['Order Date'], format='%m/%d/%Y')
df['Ship Date'] = pd.to_datetime(df['Ship Date'], format='%m/%d/%Y')

In [10]:
# The numeric values in Sales and Profit look weird. They shouldn't have values with more than 2 decimal points.
# This is very likely a rounding error.
df[['Sales', 'Profit']]

Unnamed: 0,Sales,Profit
0,261.9600,41.9136
1,731.9400,219.5820
2,14.6200,6.8714
3,957.5775,-383.0310
4,22.3680,2.5164
...,...,...
9989,25.2480,4.1028
9990,91.9600,15.6332
9991,258.5760,19.3932
9992,29.6000,13.3200


In [11]:
# Assuming those are values in USD, let's first transform them into cents and then get rid of the rounding error.
df['Sales'] = (df['Sales'] * 100).astype(int)
df['Profit'] = (df['Profit'] * 100).astype(int)

In [12]:
df

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,26195,2,0.00,4191
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",73194,3,0.00,21958
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,1462,2,0.00,687
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,95757,5,0.45,-38303
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,2236,2,0.20,251
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9989,9990,CA-2014-110422,2014-01-21,2014-01-23,Second Class,TB-21400,Tom Boeckenhauer,Consumer,United States,Miami,...,33180,South,FUR-FU-10001889,Furniture,Furnishings,Ultra Door Pull Handle,2524,3,0.20,410
9990,9991,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,FUR-FU-10000747,Furniture,Furnishings,Tenex B1-RE Series Chair Mats for Low Pile Car...,9196,2,0.00,1563
9991,9992,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,TEC-PH-10003645,Technology,Phones,Aastra 57i VoIP phone,25857,2,0.20,1939
9992,9993,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,OFF-PA-10004041,Office Supplies,Paper,"It's Hot Message Books with Stickers, 2 3/4"" x 5""",2960,4,0.00,1332


# Idea / structure of split data

Split data into:
- orders (PK: Order ID, FK: Customer ID)
- customers (PK: Customer ID)
- products (PK: Product ID)
- orders table (FK: Order ID, Product ID) [joins order and customer into a junction table]
- locations (PK: postal code) [it would be a FK in orders then as well]

# Helper Functions

In [14]:
# This function will be used later to check if a column qualifies as primary key. It should not consist duplicates.
def duplicates_raport(df):
    """This function checks for the presence of duplicate values in each column of a DataFrame
    and generates a report indicating whether duplicates exist in each column."""
    results = dict()

    for column in df.columns:
        results[column] = df[column].duplicated().any()
    
    return pd.DataFrame(results, index=['Duplicate'])

# Products Table

In [15]:
# Create products table
products = df.copy()
products = products[['Product ID', 'Product Name', 'Category', 'Sub-Category']]

In [16]:
# We don't need duplicated lines, because we have the ID's and can connect the data using them.
products.drop_duplicates(inplace=True)

In [17]:
# Check wheather there is no duplicated data in the table.
duplicates_raport(products)

Unnamed: 0,Product ID,Product Name,Category,Sub-Category
Duplicate,True,True,True,True


In [18]:
# Unfortunately there are duplicated Product IDs which would cause a problem if we try to use them as primary keys.

In [19]:
# Identify the duplicated Product IDs.
products[products['Product ID'].duplicated(keep=False)].sort_values('Product ID')

Unnamed: 0,Product ID,Product Name,Category,Sub-Category
2471,FUR-BO-10002213,"Sauder Forest Hills Library, Woodland Oak Finish",Furniture,Bookcases
2115,FUR-BO-10002213,DMI Eclipse Executive Suite Bookcases,Furniture,Bookcases
66,FUR-CH-10001146,"Global Value Mid-Back Manager's Chair, Gray",Furniture,Chairs
128,FUR-CH-10001146,"Global Task Chair, Black",Furniture,Chairs
1459,FUR-FU-10001473,DAX Wood Document Frame,Furniture,Furnishings
...,...,...,...,...
1219,TEC-PH-10002200,Samsung Galaxy Note 2,Technology,Phones
2596,TEC-PH-10002310,Plantronics Calisto P620-M USB Wireless Speake...,Technology,Phones
1378,TEC-PH-10002310,Panasonic KX T7731-B Digital phone,Technology,Phones
922,TEC-PH-10004531,OtterBox Commuter Series Case - iPhone 5 & 5s,Technology,Phones


In [20]:
# Save the duplicated_ids in a pd Series and count how many there is.
duplicated_ids = products[products['Product ID'].duplicated()].sort_values('Product ID')['Product ID']
print('ammount of duplicated IDs:', len(duplicated_ids))

ammount of duplicated IDs: 32


In [21]:
# Looks like for 32 unique Product IDs we have 2 different Product Names.

In [22]:
# Look at them in context to get an idea what the problem could be or what a possible solution could look like.
#df_duplicated_ids = df[df['Product ID'].isin(duplicated_ids)].sort_values('Product ID')
#df_duplicated_ids.to_csv('df_duplicated_ids.csv')

In [23]:
# There seems to be no real pattern to why some products have the same product IDs.

In [24]:
# Have a look at the Product Names of the duplicated Product IDs
# and see if they don't come up under different Product IDs, which would complicate the problem further.
df[df['Product Name'] == "Global Value Mid-Back Manager's Chair, Gray"]

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
66,67,US-2015-164175,2015-04-30,2015-05-05,Standard Class,PS-18970,Paul Stevenson,Home Office,United States,Chicago,...,60610,Central,FUR-CH-10001146,Furniture,Chairs,"Global Value Mid-Back Manager's Chair, Gray",21311,5,0.3,-1522
1067,1068,CA-2016-157686,2016-10-01,2016-10-02,First Class,BD-11620,Brian DeCherney,Consumer,United States,San Francisco,...,94122,West,FUR-CH-10001146,Furniture,Chairs,"Global Value Mid-Back Manager's Chair, Gray",19484,4,0.2,1217
1267,1268,US-2014-167738,2014-12-24,2014-12-29,Standard Class,JC-16105,Julie Creighton,Corporate,United States,Los Angeles,...,90045,West,FUR-CH-10001146,Furniture,Chairs,"Global Value Mid-Back Manager's Chair, Gray",29227,6,0.2,1826
1881,1882,CA-2015-109512,2015-03-05,2015-03-05,Same Day,LF-17185,Luke Foster,Consumer,United States,New York City,...,10011,East,FUR-CH-10001146,Furniture,Chairs,"Global Value Mid-Back Manager's Chair, Gray",38360,7,0.1,6393
2003,2004,CA-2017-163510,2017-12-25,2017-12-28,Second Class,JW-15955,Joni Wasserman,Consumer,United States,Louisville,...,40214,South,FUR-CH-10001146,Furniture,Chairs,"Global Value Mid-Back Manager's Chair, Gray",30445,5,0.0,7611
4559,4560,CA-2014-110219,2014-05-05,2014-05-08,First Class,EB-13870,Emily Burns,Consumer,United States,San Antonio,...,78207,Central,FUR-CH-10001146,Furniture,Chairs,"Global Value Mid-Back Manager's Chair, Gray",12786,3,0.3,-913
7425,7426,CA-2016-101693,2016-06-25,2016-06-27,Second Class,LC-17140,Logan Currie,Consumer,United States,Houston,...,77070,Central,FUR-CH-10001146,Furniture,Chairs,"Global Value Mid-Back Manager's Chair, Gray",8524,2,0.3,-608
8891,8892,CA-2016-162159,2016-09-16,2016-09-18,First Class,CR-12625,Corey Roper,Home Office,United States,Columbus,...,31907,South,FUR-CH-10001146,Furniture,Chairs,"Global Value Mid-Back Manager's Chair, Gray",12178,2,0.0,3044
9254,9255,CA-2014-168368,2014-02-11,2014-02-15,Second Class,GA-14725,Guy Armstrong,Consumer,United States,Columbia,...,65203,Central,FUR-CH-10001146,Furniture,Chairs,"Global Value Mid-Back Manager's Chair, Gray",6089,1,0.0,1522
9807,9808,CA-2017-107209,2017-07-27,2017-08-01,Second Class,JW-15955,Joni Wasserman,Consumer,United States,Raleigh,...,27604,South,FUR-CH-10001146,Furniture,Chairs,"Global Value Mid-Back Manager's Chair, Gray",19484,4,0.2,1217


In [25]:
# At first glance it looks like they don't.

In [26]:
# To be save, check if any of the products with duplicated Product Names also have duplicated Product IDs.
duplicate_names = products[products['Product Name'].duplicated(keep=False)].sort_values('Product Name')
duplicates_raport(duplicate_names)

Unnamed: 0,Product ID,Product Name,Category,Sub-Category
Duplicate,False,True,True,True


In [27]:
# Luckily they don't.

In [28]:
# Check if Sub-Category is unique on it's own, meaning each Sub-Category can be only in one Category.
categories = products[['Category', 'Sub-Category']].copy()
categories.drop_duplicates(inplace=True)
categories.sort_values('Category')

Unnamed: 0,Category,Sub-Category
0,Furniture,Bookcases
1,Furniture,Chairs
3,Furniture,Tables
5,Furniture,Furnishings
138,Office Supplies,Supplies
53,Office Supplies,Fasteners
30,Office Supplies,Envelopes
12,Office Supplies,Paper
9,Office Supplies,Appliances
8,Office Supplies,Binders


In [29]:
# Confirm using the duplicates raport.
duplicates_raport(categories)

Unnamed: 0,Category,Sub-Category
Duplicate,True,False


In [30]:
# Each Sub-Category can be only in one Category.

In [31]:
# Split the 'Product ID' column into 'Cat', 'Sub-Cat' and 'Product Number'.
products[['Cat', 'Sub-Cat', 'Product Number']] = products['Product ID'].str.split('-', expand=True)

# Convert 'Product Number' to int.
products['Product Number'] = products['Product Number'].astype(int)

products.sort_values('Product Number')

Unnamed: 0,Product ID,Product Name,Category,Sub-Category,Cat,Sub-Cat,Product Number
748,TEC-PH-10000004,Belkin iPhone and iPad Lightning Cable,Technology,Phones,TEC,PH,10000004
3563,OFF-PA-10000007,Telephone Message Books with Fax/Mobile Sectio...,Office Supplies,Paper,OFF,PA,10000007
617,FUR-FU-10000010,"DAX Value U-Channel Document Frames, Easel Back",Furniture,Furnishings,FUR,FU,10000010
1454,TEC-MA-10000010,Hewlett-Packard Deskjet 3050a All-in-One Color...,Technology,Machines,TEC,MA,10000010
240,TEC-PH-10000011,PureGear Roll-On Screen Protector,Technology,Phones,TEC,PH,10000011
...,...,...,...,...,...,...,...
5412,TEC-AC-10004992,Kingston Digital DataTraveler 64GB USB 2.0,Technology,Accessories,TEC,AC,10004992
353,OFF-BI-10004995,GBC DocuBind P400 Electric Binding System,Office Supplies,Binders,OFF,BI,10004995
3455,OFF-PA-10004996,"Speediset Carbonless Redi-Letter 7"" x 8 1/2""",Office Supplies,Paper,OFF,PA,10004996
569,FUR-CH-10004997,Hon Every-Day Series Multi-Task Chairs,Furniture,Chairs,FUR,CH,10004997


In [32]:
# Generate new Product Numbers for one of the products with duplicated Product IDs.
# In case there would be more than two unique products per Product ID

for id in duplicated_ids:

    # Filter the DataFrame to get the subset corresponding to the current ID
    subset = products[products['Product ID'] == id]

    # Randomly choose a 'Product Name' from the subset
    random_name = np.random.choice(subset['Product Name'])

    # Get the 'Sub-Cat', the should all be the same, because they determine the 'Product ID', so just pick the first.
    sub_cat = subset['Sub-Cat'].iloc[0]

    # Find the maximum 'Product Number' for the 'Sub-Cat'
    max_prod_num = products[products['Sub-Cat'] == sub_cat]['Product Number'].max()

    # Update the 'Product Number' for the current product
    products.loc[(products['Product ID'] == id) & (products['Product Name'] == random_name), 'Product Number'] = max_prod_num + 1

In [33]:
# Generate a New Product ID
products['New Product ID'] = products[['Cat', 'Sub-Cat', 'Product Number']].astype(str).apply('-'.join, axis=1)

In [34]:
# Take a look at the modified Product Numbers and New Product IDs
products[products['Product ID'].isin(duplicated_ids)].sort_values('Product ID')

Unnamed: 0,Product ID,Product Name,Category,Sub-Category,Cat,Sub-Cat,Product Number,New Product ID
2471,FUR-BO-10002213,"Sauder Forest Hills Library, Woodland Oak Finish",Furniture,Bookcases,FUR,BO,10004835,FUR-BO-10004835
2115,FUR-BO-10002213,DMI Eclipse Executive Suite Bookcases,Furniture,Bookcases,FUR,BO,10002213,FUR-BO-10002213
66,FUR-CH-10001146,"Global Value Mid-Back Manager's Chair, Gray",Furniture,Chairs,FUR,CH,10001146,FUR-CH-10001146
128,FUR-CH-10001146,"Global Task Chair, Black",Furniture,Chairs,FUR,CH,10004998,FUR-CH-10004998
1459,FUR-FU-10001473,DAX Wood Document Frame,Furniture,Furnishings,FUR,FU,10001473,FUR-FU-10001473
...,...,...,...,...,...,...,...,...
1219,TEC-PH-10002200,Samsung Galaxy Note 2,Technology,Phones,TEC,PH,10002200,TEC-PH-10002200
2596,TEC-PH-10002310,Plantronics Calisto P620-M USB Wireless Speake...,Technology,Phones,TEC,PH,10004981,TEC-PH-10004981
1378,TEC-PH-10002310,Panasonic KX T7731-B Digital phone,Technology,Phones,TEC,PH,10002310,TEC-PH-10002310
922,TEC-PH-10004531,OtterBox Commuter Series Case - iPhone 5 & 5s,Technology,Phones,TEC,PH,10004531,TEC-PH-10004531


In [35]:
# Drop the data that's not necessary anymore.
products.drop(['Cat', 'Sub-Cat', 'Product Number'], axis=1, inplace=True)

In [36]:
# Check if the 'New Product ID' has duplicates
duplicates_raport(products)

Unnamed: 0,Product ID,Product Name,Category,Sub-Category,New Product ID
Duplicate,True,True,True,True,False


In [37]:
# Apply changes to df so they are consistent everywhere.
# Merge the two DataFrames on the 'Product ID' and 'Product Name' columns
merged_df = df.merge(products, on=['Product ID', 'Product Name'], how='left')

# Update the 'Product ID' column in df with the values from the 'New Product ID' column in products
df['Product ID'] = merged_df['New Product ID']

In [38]:
# Replace 'Product ID' with 'New Product ID'
products['Product ID'] = products['New Product ID']
products.drop(['New Product ID'], axis=1, inplace=True)

In [39]:
products

Unnamed: 0,Product ID,Product Name,Category,Sub-Category
0,FUR-BO-10001798,Bush Somerset Collection Bookcase,Furniture,Bookcases
1,FUR-CH-10000454,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",Furniture,Chairs
2,OFF-LA-10000240,Self-Adhesive Address Labels for Typewriters b...,Office Supplies,Labels
3,FUR-TA-10000577,Bretford CR4500 Series Slim Rectangular Table,Furniture,Tables
4,OFF-ST-10000760,Eldon Fold 'N Roll Cart System,Office Supplies,Storage
...,...,...,...,...
9521,TEC-PH-10002817,RCA ViSYS 25425RE1 Corded phone,Technology,Phones
9562,TEC-MA-10003589,Cisco 8961 IP Phone Charcoal,Technology,Machines
9604,OFF-AP-10003099,"Eureka Hand Vacuum, Bagless",Office Supplies,Appliances
9673,TEC-PH-10002645,LG G2,Technology,Phones


# Orders Details Table
Connects the orders and products tables.  
Each record in the Order Details table represents one line item on an order. The Order Details table’s primary key consists of two fields — the foreign keys from the Orders and the Products tables.

In [40]:
# Create orders details table
orders_details = df.copy()
orders_details = orders_details[['Order ID', 'Product ID', 'Sales', 'Quantity', 'Discount', 'Profit']]

In [41]:
# Check if any of the 'Order ID', 'Product ID' combinations contain duplicates.
# They should not and if they do it must be an error.
orders_details.duplicated(['Order ID', 'Product ID']).any()

True

In [42]:
# Have a look at the data in context. Find the reason for duplicates.
# Get duplicates
duplicated_order_details = orders_details[orders_details.duplicated(['Order ID', 'Product ID'], keep=False)]

# Filter the main df for them.
order_details_df = df[df[['Order ID', 'Product ID']].isin(duplicated_order_details[['Order ID', 'Product ID']]).any(axis=1)]
order_details_df

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
350,351,CA-2016-129714,2016-09-01,2016-09-03,First Class,AB-10060,Adam Bellavance,Home Office,United States,New York City,...,10009,East,OFF-PA-10001970,Office Supplies,Paper,Xerox 1881,2456,2,0.0,1154
352,353,CA-2016-129714,2016-09-01,2016-09-03,First Class,AB-10060,Adam Bellavance,Home Office,United States,New York City,...,10009,East,OFF-PA-10001970,Office Supplies,Paper,Xerox 1881,4912,4,0.0,2308
430,431,US-2016-123750,2016-04-15,2016-04-21,Standard Class,RB-19795,Ross Baird,Home Office,United States,Gastonia,...,28052,South,TEC-AC-10004659,Technology,Accessories,Imation Secure+ Hardware Encrypted USB 2.0 Fla...,40874,7,0.2,7663
431,432,US-2016-123750,2016-04-15,2016-04-21,Standard Class,RB-19795,Ross Baird,Home Office,United States,Gastonia,...,28052,South,TEC-AC-10004659,Technology,Accessories,Imation Secure+ Hardware Encrypted USB 2.0 Fla...,29195,5,0.2,5474
1300,1301,CA-2016-137043,2016-12-23,2016-12-25,Second Class,LC-17140,Logan Currie,Consumer,United States,Springfield,...,22153,South,FUR-FU-10003664,Furniture,Furnishings,"Electrix Architect's Clamp-On Swing Arm Lamp, ...",57276,6,0.0,16610
1301,1302,CA-2016-137043,2016-12-23,2016-12-25,Second Class,LC-17140,Logan Currie,Consumer,United States,Springfield,...,22153,South,FUR-FU-10003664,Furniture,Furnishings,"Electrix Architect's Clamp-On Swing Arm Lamp, ...",28638,3,0.0,8305
3183,3184,CA-2017-152912,2017-11-09,2017-11-12,Second Class,BM-11650,Brian Moss,Corporate,United States,Columbia,...,21044,East,OFF-ST-10003208,Office Supplies,Storage,Adjustable Depth Letter/Legal Cart,163314,9,0.0,47361
3184,3185,CA-2017-152912,2017-11-09,2017-11-12,Second Class,BM-11650,Brian Moss,Corporate,United States,Columbia,...,21044,East,OFF-ST-10003208,Office Supplies,Storage,Adjustable Depth Letter/Legal Cart,54438,3,0.0,15787
3405,3406,US-2014-150119,2014-04-23,2014-04-27,Standard Class,LB-16795,Laurel Beltran,Home Office,United States,Columbus,...,43229,East,FUR-CH-10002965,Furniture,Chairs,Global Leather Highback Executive Chair with P...,28137,2,0.3,-1205
3406,3407,US-2014-150119,2014-04-23,2014-04-27,Standard Class,LB-16795,Laurel Beltran,Home Office,United States,Columbus,...,43229,East,FUR-CH-10002965,Furniture,Chairs,Global Leather Highback Executive Chair with P...,28137,2,0.3,-1205


In [43]:
# The problem seems to be in the last 4 numeric columns.
# Looks like some products didn't get accumulated into one entry, even though they are the same product in the same order.
# To fix that I'll add them together into one line.

In [44]:
for group_key, group_df in order_details_df.groupby(['Order ID', 'Product ID']):
    #print(group_key)

    # Identify the minimum 'Row ID' within the group
    save_row_id = group_df['Row ID'].min()

    # Create a list of all 'Row IDs' in the group and remove the 'save_row_id' from the list
    delete_row_ids = group_df['Row ID'].values.tolist()
    delete_row_ids.remove(save_row_id)  # Remove the row to be saved

    # Update values in the main DataFrame based on the aggregated values of the group
    df.loc[df['Row ID'] == save_row_id, 'Sales'] = group_df['Sales'].sum()
    df.loc[df['Row ID'] == save_row_id, 'Quantity'] = group_df['Quantity'].sum()
    df.loc[df['Row ID'] == save_row_id, 'Discount'] = group_df['Discount'].iloc[0]
    df.loc[df['Row ID'] == save_row_id, 'Profit'] = group_df['Profit'].sum()

    # Delete unnecessary rows
    df = df.drop(df[df['Row ID'].isin(delete_row_ids)].index)

In [45]:
# See if it worked.
#df
#df.info()
#df.isna().any()

In [46]:
# Create a new orders details table, based on fixed data
orders_details = df.copy()
orders_details = orders_details[['Order ID', 'Product ID', 'Sales', 'Quantity', 'Discount', 'Profit']]

In [47]:
orders_details

Unnamed: 0,Order ID,Product ID,Sales,Quantity,Discount,Profit
0,CA-2016-152156,FUR-BO-10001798,26195,2,0.00,4191
1,CA-2016-152156,FUR-CH-10000454,73194,3,0.00,21958
2,CA-2016-138688,OFF-LA-10000240,1462,2,0.00,687
3,US-2015-108966,FUR-TA-10000577,95757,5,0.45,-38303
4,US-2015-108966,OFF-ST-10000760,2236,2,0.20,251
...,...,...,...,...,...,...
9989,CA-2014-110422,FUR-FU-10001889,2524,3,0.20,410
9990,CA-2017-121258,FUR-FU-10000747,9196,2,0.00,1563
9991,CA-2017-121258,TEC-PH-10003645,25857,2,0.20,1939
9992,CA-2017-121258,OFF-PA-10004041,2960,4,0.00,1332


In [48]:
# Check if any of the 'Order ID', 'Product ID' combinations contain duplicates.
# They should not and if they do it must be an error.
orders_details.duplicated(['Order ID', 'Product ID']).any()

False

# Customers Table

In [49]:
# Create customers table
customers = df.copy()
customers = customers[['Customer ID', 'Customer Name', 'Segment']]

In [50]:
# We don't need duplicated lines, because we have the ID's and can connect the data using them.
customers.drop_duplicates(inplace=True)

In [51]:
# Check wheather there is no duplicated data in the table.
duplicates_raport(customers)

Unnamed: 0,Customer ID,Customer Name,Segment
Duplicate,False,False,True


In [52]:
# There are no duplicates in Customer ID and Customer Name which is what's important for us.
# The Customer ID needs to be unique.
# We don't care about the duplicated Segment.

In [53]:
# Might be useful for documentation.
customers['Segment'].unique()

array(['Consumer', 'Corporate', 'Home Office'], dtype=object)

In [54]:
customers

Unnamed: 0,Customer ID,Customer Name,Segment
0,CG-12520,Claire Gute,Consumer
2,DV-13045,Darrin Van Huff,Corporate
3,SO-20335,Sean O'Donnell,Consumer
5,BH-11710,Brosina Hoffman,Consumer
12,AA-10480,Andrew Allen,Consumer
...,...,...,...
8666,CJ-11875,Carl Jackson,Corporate
9209,RS-19870,Roy Skaria,Home Office
9399,SC-20845,Sung Chung,Consumer
9441,RE-19405,Ricardo Emerson,Consumer


# Addresses Table

In [55]:
# Create addresses table.
addresses = df.copy()
addresses = addresses[['Postal Code', 'City', 'State', 'Region', 'Country']]

In [56]:
# We don't need the duplicates, because we have the ID's
addresses.drop_duplicates(inplace=True)

In [57]:
# Check wheather there is no duplicated data in the table
duplicates_raport(addresses)

Unnamed: 0,Postal Code,City,State,Region,Country
Duplicate,True,True,True,True,True


In [58]:
# There are duplicates in the Postal Code so let's have a look at them
addresses[addresses['Postal Code'].duplicated(keep=False)].sort_values('Postal Code')

Unnamed: 0,Postal Code,City,State,Region,Country
481,92024,San Diego,California,West,United States
1113,92024,Encinitas,California,West,United States


In [59]:
# The Postal Code is not unique enough to be a primary key, but the combination of 'Postal Code' and 'City' are unique enough.
addresses.duplicated(['Postal Code', 'City']).any()

False

In [60]:
# Since we are going to use only 'Postal Code' and 'City' for the generation of Address ID's, let's drop all the other tables.
addresses.drop(['State', 'Region', 'Country'], axis=1, inplace=True)

In [61]:
# Generate unique Address ID's. They are based on the Postal Code.
addresses['Address ID'] = addresses['Postal Code'].astype(str) + '-' + (addresses.groupby('Postal Code').cumcount() + 1).astype(str).str.zfill(6)

In [62]:
# Make sure it solves the problem.
addresses[addresses['Postal Code'] == 92024]

Unnamed: 0,Postal Code,City,Address ID
481,92024,San Diego,92024-000001
1113,92024,Encinitas,92024-000002


In [63]:
# Apply changes to df so they are consistent everywhere.
# Merge the two DataFrames on the 'Postal Code' and 'City' columns
#merged_df = df.merge(addresses, on=['Postal Code', 'City'], how='left')

df = df.merge(addresses, on=['Postal Code', 'City'], how='left')

#display(merged_df[['Postal Code', 'City', 'Address ID']])

# Transfer the 'Address ID' column to df
#df['Address ID'] = merged_df['Address ID']

In [64]:
df[['Postal Code', 'Address ID']]

Unnamed: 0,Postal Code,Address ID
0,42420,42420-000001
1,42420,42420-000001
2,90036,90036-000001
3,33311,33311-000001
4,33311,33311-000001
...,...,...
9981,33180,33180-000001
9982,92627,92627-000001
9983,92627,92627-000001
9984,92627,92627-000001


In [65]:
# Create addresses table again, because we just changed it to add the 'Address ID' to df.
addresses = df.copy()
addresses = addresses[['Address ID', 'Postal Code', 'City', 'State', 'Region', 'Country']]

In [66]:
addresses

Unnamed: 0,Address ID,Postal Code,City,State,Region,Country
0,42420-000001,42420,Henderson,Kentucky,South,United States
1,42420-000001,42420,Henderson,Kentucky,South,United States
2,90036-000001,90036,Los Angeles,California,West,United States
3,33311-000001,33311,Fort Lauderdale,Florida,South,United States
4,33311-000001,33311,Fort Lauderdale,Florida,South,United States
...,...,...,...,...,...,...
9981,33180-000001,33180,Miami,Florida,South,United States
9982,92627-000001,92627,Costa Mesa,California,West,United States
9983,92627-000001,92627,Costa Mesa,California,West,United States
9984,92627-000001,92627,Costa Mesa,California,West,United States


# Orders Table

In [67]:
# Create orders table
orders = df.copy()
orders = orders[['Order ID', 'Order Date', 'Ship Mode', 'Customer ID']]

In [68]:
# We don't need duplicated lines, because we have the ID's and can connect the data using them.
orders.drop_duplicates(inplace=True)

In [69]:
# Check wheather there is no duplicated data in the table.
duplicates_raport(orders)

Unnamed: 0,Order ID,Order Date,Ship Mode,Customer ID
Duplicate,False,True,True,True


In [70]:
# Might be useful for documentation.
orders['Ship Mode'].unique()

array(['Second Class', 'Standard Class', 'First Class', 'Same Day'],
      dtype=object)

In [71]:
orders

Unnamed: 0,Order ID,Order Date,Ship Mode,Customer ID
0,CA-2016-152156,2016-11-08,Second Class,CG-12520
2,CA-2016-138688,2016-06-12,Second Class,DV-13045
3,US-2015-108966,2015-10-11,Standard Class,SO-20335
5,CA-2014-115812,2014-06-09,Standard Class,BH-11710
12,CA-2017-114412,2017-04-15,Standard Class,AA-10480
...,...,...,...,...
9978,CA-2016-125794,2016-09-29,Standard Class,ML-17410
9979,CA-2017-163629,2017-11-17,Standard Class,RA-19885
9981,CA-2014-110422,2014-01-21,Second Class,TB-21400
9982,CA-2017-121258,2017-02-26,Standard Class,DB-13060


# Generate all the tables
Even though we did that above sometimes, it happens here again, so all changes done to the main df are applied.

## products

In [72]:
# Create products table
products = df.copy()
products = products[['Product ID', 'Product Name', 'Category', 'Sub-Category']]

# We don't need duplicated lines, because we have the ID's and can connect the data using them.
products.drop_duplicates(inplace=True)

In [73]:
# Check wheather there is no duplicated data in the table.
duplicates_raport(products)

Unnamed: 0,Product ID,Product Name,Category,Sub-Category
Duplicate,False,True,True,True


In [74]:
products

Unnamed: 0,Product ID,Product Name,Category,Sub-Category
0,FUR-BO-10001798,Bush Somerset Collection Bookcase,Furniture,Bookcases
1,FUR-CH-10000454,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",Furniture,Chairs
2,OFF-LA-10000240,Self-Adhesive Address Labels for Typewriters b...,Office Supplies,Labels
3,FUR-TA-10000577,Bretford CR4500 Series Slim Rectangular Table,Furniture,Tables
4,OFF-ST-10000760,Eldon Fold 'N Roll Cart System,Office Supplies,Storage
...,...,...,...,...
9513,TEC-PH-10002817,RCA ViSYS 25425RE1 Corded phone,Technology,Phones
9554,TEC-MA-10003589,Cisco 8961 IP Phone Charcoal,Technology,Machines
9596,OFF-AP-10003099,"Eureka Hand Vacuum, Bagless",Office Supplies,Appliances
9665,TEC-PH-10002645,LG G2,Technology,Phones


## orders_details

In [75]:
# Create a new orders details table, based on fixed data
orders_details = df.copy()
orders_details = orders_details[['Order ID', 'Product ID', 'Sales', 'Quantity', 'Discount', 'Profit']]

In [76]:
# Check if any of the 'Order ID', 'Product ID' combinations contain duplicates.
# They should not and if they do it must be an error.
orders_details.duplicated(['Order ID', 'Product ID']).any()

False

In [77]:
orders_details

Unnamed: 0,Order ID,Product ID,Sales,Quantity,Discount,Profit
0,CA-2016-152156,FUR-BO-10001798,26195,2,0.00,4191
1,CA-2016-152156,FUR-CH-10000454,73194,3,0.00,21958
2,CA-2016-138688,OFF-LA-10000240,1462,2,0.00,687
3,US-2015-108966,FUR-TA-10000577,95757,5,0.45,-38303
4,US-2015-108966,OFF-ST-10000760,2236,2,0.20,251
...,...,...,...,...,...,...
9981,CA-2014-110422,FUR-FU-10001889,2524,3,0.20,410
9982,CA-2017-121258,FUR-FU-10000747,9196,2,0.00,1563
9983,CA-2017-121258,TEC-PH-10003645,25857,2,0.20,1939
9984,CA-2017-121258,OFF-PA-10004041,2960,4,0.00,1332


## customers

In [78]:
# Create customers table
customers = df.copy()
customers = customers[['Customer ID', 'Customer Name', 'Segment']]

# We don't need duplicated lines, because we have the ID's and can connect the data using them.
customers.drop_duplicates(inplace=True)

In [79]:
# Check wheather there is no duplicated data in the table.
duplicates_raport(customers)

Unnamed: 0,Customer ID,Customer Name,Segment
Duplicate,False,False,True


In [80]:
customers

Unnamed: 0,Customer ID,Customer Name,Segment
0,CG-12520,Claire Gute,Consumer
2,DV-13045,Darrin Van Huff,Corporate
3,SO-20335,Sean O'Donnell,Consumer
5,BH-11710,Brosina Hoffman,Consumer
12,AA-10480,Andrew Allen,Consumer
...,...,...,...
8659,CJ-11875,Carl Jackson,Corporate
9201,RS-19870,Roy Skaria,Home Office
9391,SC-20845,Sung Chung,Consumer
9433,RE-19405,Ricardo Emerson,Consumer


## orders

In [81]:
# Create orders table
orders = df.copy()
orders = orders[['Order ID', 'Order Date', 'Ship Date', 'Ship Mode', 'Customer ID', 'Address ID']]

# We don't need duplicated lines, because we have the ID's and can connect the data using them.
orders.drop_duplicates(inplace=True)

In [82]:
# Check wheather there is no duplicated data in the table.
duplicates_raport(orders)

Unnamed: 0,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Address ID
Duplicate,False,True,True,True,True,True


In [83]:
orders

Unnamed: 0,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Address ID
0,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,42420-000001
2,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,90036-000001
3,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,33311-000001
5,CA-2014-115812,2014-06-09,2014-06-14,Standard Class,BH-11710,90032-000001
12,CA-2017-114412,2017-04-15,2017-04-20,Standard Class,AA-10480,28027-000001
...,...,...,...,...,...,...
9978,CA-2016-125794,2016-09-29,2016-10-03,Standard Class,ML-17410,90008-000001
9979,CA-2017-163629,2017-11-17,2017-11-21,Standard Class,RA-19885,30605-000001
9981,CA-2014-110422,2014-01-21,2014-01-23,Second Class,TB-21400,33180-000001
9982,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,92627-000001


## addresses

In [84]:
# Create addresses table.
addresses = df.copy()
addresses = addresses[['Address ID', 'Postal Code', 'City', 'State', 'Region', 'Country']]

# We don't need duplicated lines, because we have the ID's and can connect the data using them.
addresses.drop_duplicates(['Address ID'], inplace=True)

In [85]:
# Check wheather there is no duplicated data in the table.
duplicates_raport(addresses)

Unnamed: 0,Address ID,Postal Code,City,State,Region,Country
Duplicate,False,True,True,True,True,True


In [86]:
addresses

Unnamed: 0,Address ID,Postal Code,City,State,Region,Country
0,42420-000001,42420,Henderson,Kentucky,South,United States
2,90036-000001,90036,Los Angeles,California,West,United States
3,33311-000001,33311,Fort Lauderdale,Florida,South,United States
5,90032-000001,90032,Los Angeles,California,West,United States
12,28027-000001,28027,Concord,North Carolina,South,United States
...,...,...,...,...,...,...
9752,72762-000001,72762,Springdale,Arkansas,South,United States
9793,95240-000001,95240,Lodi,California,West,United States
9826,77571-000001,77571,La Porte,Texas,Central,United States
9860,45040-000001,45040,Mason,Ohio,East,United States


# Save to CSV

In [87]:
# Make path for the CSVs
path = 'SuperstoreTables'
Path(path).mkdir(exist_ok=True)

In [88]:
# Prepare file names for the CSVs
tables = {
    'Products' : products,
    'OrdersDetails' : orders_details,
    'Customers' : customers,
    'Orders' : orders,
    'Addresses' : addresses
}

In [89]:
# Save the CSVs
for table in tables:
    tables[table].to_csv(f"{path}/{table}.csv", index=False)