# Superstore Analysis

---

## Installing dependencies

---

In [6]:
# One-time environment setup: uncomment, run once, then re-comment.
# In a notebook prefer `%pip install ...` over `!pip install ...` so the
# packages are installed into the environment of the running kernel.
# NOTE(review): the imports cell also uses google.cloud.bigquery and
# google.oauth2, whose packages are not listed here.
# !pip install xlrd
# !pip install pandas
# !pip install numpy
# !pip install mysql-connector-python
# !pip install sqlalchemy
# !pip install pymysql

---

## Setting up the environment

In [45]:
import getpass
import os
import re
import warnings

import mysql.connector as mysql
import numpy as np
import pandas as pd
from google.cloud import bigquery as bq
from google.oauth2 import service_account
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
from sqlalchemy.exc import *

# Point the Google Cloud client libraries at the service-account key file.
# setdefault() keeps a credentials path that is already exported in the
# environment instead of silently overwriting it with the local default.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', 'explore29-33756158108f.json')

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from the libraries used below; consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

---

## Extract (import data)

In [8]:
# Source workbook; its sheets are addressed by position.
file_path = 'data-src/Sample - Superstore.xls'

# Passing a list of sheet positions makes pandas open and parse the workbook
# once and return a {position: DataFrame} dict, instead of re-reading the
# file from disk for every sheet.
workbook_sheets = pd.read_excel(file_path, sheet_name=[0, 1, 2], header=0)

orders_df = workbook_sheets[0]   # first sheet
people_df = workbook_sheets[1]   # second sheet
returns_df = workbook_sheets[2]  # third sheet

---

## Transform (clean and standardize data)

### Orders

#### Overview

In [9]:
# Preview the first rows of orders_df to confirm the headers and values loaded as expected.
orders_df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country/Region,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,US-2021-103800,2021-01-03,2021-01-07,Standard Class,DP-13000,Darren Powers,Consumer,United States,Houston,...,77095,Central,OFF-PA-10000174,Office Supplies,Paper,"Message Book, Wirebound, Four 5 1/2"" X 4"" Form...",16.448,2,0.2,5.5512
1,2,US-2021-112326,2021-01-04,2021-01-08,Standard Class,PO-19195,Phillina Ober,Home Office,United States,Naperville,...,60540,Central,OFF-BI-10004094,Office Supplies,Binders,GBC Standard Plastic Binding Systems Combs,3.54,2,0.8,-5.487
2,3,US-2021-112326,2021-01-04,2021-01-08,Standard Class,PO-19195,Phillina Ober,Home Office,United States,Naperville,...,60540,Central,OFF-LA-10003223,Office Supplies,Labels,Avery 508,11.784,3,0.2,4.2717
3,4,US-2021-112326,2021-01-04,2021-01-08,Standard Class,PO-19195,Phillina Ober,Home Office,United States,Naperville,...,60540,Central,OFF-ST-10002743,Office Supplies,Storage,SAFCO Boltless Steel Shelving,272.736,3,0.2,-64.7748
4,5,US-2021-141817,2021-01-05,2021-01-12,Standard Class,MB-18085,Mick Brown,Consumer,United States,Philadelphia,...,19143,East,OFF-AR-10003478,Office Supplies,Art,Avery Hi-Liter EverBold Pen Style Fluorescent ...,19.536,3,0.2,4.884


In [10]:
# Preview the last rows as well, to spot trailing totals or malformed rows at the end of the sheet.
orders_df.tail()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country/Region,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
10189,10190,US-2024-143259,2024-12-30,2025-01-03,Standard Class,PO-18865,Patrick O'Donnell,Consumer,United States,New York City,...,10009,East,OFF-BI-10003684,Office Supplies,Binders,Wilson Jones Legal Size Ring Binders,52.776,3,0.2,19.791
10190,10191,US-2024-115427,2024-12-30,2025-01-03,Standard Class,EB-13975,Erica Bern,Corporate,United States,Fairfield,...,94533,West,OFF-BI-10004632,Office Supplies,Binders,GBC Binding covers,20.72,2,0.2,6.475
10191,10192,US-2024-156720,2024-12-30,2025-01-03,Standard Class,JM-15580,Jill Matthias,Consumer,United States,Loveland,...,80538,West,OFF-FA-10003472,Office Supplies,Fasteners,Bagged Rubber Bands,3.024,3,0.2,-0.6048
10192,10193,US-2024-143259,2024-12-30,2025-01-03,Standard Class,PO-18865,Patrick O'Donnell,Consumer,United States,New York City,...,10009,East,TEC-PH-10004774,Technology,Phones,Gear Head AU3700S Headset,90.93,7,0.0,2.7279
10193,10194,CA-2024-143500,2024-12-30,2025-01-03,Standard Class,HO-15230,Harry Olson,Consumer,Canada,Charlottetown,...,C0A,East,OFF-BI-10004040,Office Supplies,Binders,Wilson Jones Impact Binders,3.024,3,0.2,-0.6048


In [11]:
# (row count, column count) of the raw orders frame.
orders_df.shape

(10194, 21)

In [12]:
# Column dtypes and non-null counts; info() prints its report directly.
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10194 entries, 0 to 10193
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Row ID          10194 non-null  int64         
 1   Order ID        10194 non-null  object        
 2   Order Date      10194 non-null  datetime64[ns]
 3   Ship Date       10194 non-null  datetime64[ns]
 4   Ship Mode       10194 non-null  object        
 5   Customer ID     10194 non-null  object        
 6   Customer Name   10194 non-null  object        
 7   Segment         10194 non-null  object        
 8   Country/Region  10194 non-null  object        
 9   City            10194 non-null  object        
 10  State/Province  10194 non-null  object        
 11  Postal Code     10194 non-null  object        
 12  Region          10194 non-null  object        
 13  Product ID      10194 non-null  object        
 14  Category        10194 non-null  object        
 15  Su

#### Cleaning

**Check for duplicates**

In [13]:
# The row identifier looks like a per-row counter (1..N in the previews), so
# comparing it too would make every row unique and hide repeated order lines.
# Compare rows on the remaining columns instead. Both spellings are listed so
# the cell also works if it is re-run after the headers have been renamed.
print(
    orders_df
    .drop(columns=['Row ID', 'row_id'], errors='ignore')
    .duplicated()
    .sum()
)

0


**Check for null values**

In [14]:
# Count missing values per column.
print(orders_df.isnull().sum())

Row ID            0
Order ID          0
Order Date        0
Ship Date         0
Ship Mode         0
Customer ID       0
Customer Name     0
Segment           0
Country/Region    0
City              0
State/Province    0
Postal Code       0
Region            0
Product ID        0
Category          0
Sub-Category      0
Product Name      0
Sales             0
Quantity          0
Discount          0
Profit            0
dtype: int64


#### Renaming headers

In [15]:
def snake_case(data: str) -> str:
    """Convert a column header to ``snake_case``.

    The text is trimmed and lower-cased, then every run of separator
    characters (anything that is not a letter or digit: spaces, hyphens,
    slashes, underscores, ...) is collapsed into a single underscore.
    Leading and trailing underscores are removed.

    Args:
        data: Raw header text, e.g. ``'Sub-Category'``.

    Returns:
        The normalised header, e.g. ``'sub_category'``.
    """
    # [\W_]+ matches runs of non-word characters and underscores, so mixed
    # separators such as ' - ' or '/' yield one underscore rather than several.
    normalized = re.sub(r'[\W_]+', '_', data.strip().lower())
    return normalized.strip('_')

In [16]:
# Give the three compound headers short explicit names first, then normalise
# every header through snake_case.
orders_df = (
    orders_df
    .rename(columns={
        'Country/Region': 'country',
        'State/Province': 'province',
        'Postal Code': 'post_code',
    })
    .rename(columns=snake_case)
)

#### Final inspection

In [17]:
# DataFrame.info() prints its report and returns None, so passing it to
# display() rendered a stray "None". Call it for its printed output and let
# the preview render as the cell's last expression.
orders_df.info()
orders_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10194 entries, 0 to 10193
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   row_id         10194 non-null  int64         
 1   order_id       10194 non-null  object        
 2   order_date     10194 non-null  datetime64[ns]
 3   ship_date      10194 non-null  datetime64[ns]
 4   ship_mode      10194 non-null  object        
 5   customer_id    10194 non-null  object        
 6   customer_name  10194 non-null  object        
 7   segment        10194 non-null  object        
 8   country        10194 non-null  object        
 9   city           10194 non-null  object        
 10  province       10194 non-null  object        
 11  post_code      10194 non-null  object        
 12  region         10194 non-null  object        
 13  product_id     10194 non-null  object        
 14  category       10194 non-null  object        
 15  sub_category   1019

None

Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,...,post_code,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit
0,1,US-2021-103800,2021-01-03,2021-01-07,Standard Class,DP-13000,Darren Powers,Consumer,United States,Houston,...,77095,Central,OFF-PA-10000174,Office Supplies,Paper,"Message Book, Wirebound, Four 5 1/2"" X 4"" Form...",16.448,2,0.2,5.5512
1,2,US-2021-112326,2021-01-04,2021-01-08,Standard Class,PO-19195,Phillina Ober,Home Office,United States,Naperville,...,60540,Central,OFF-BI-10004094,Office Supplies,Binders,GBC Standard Plastic Binding Systems Combs,3.54,2,0.8,-5.487
2,3,US-2021-112326,2021-01-04,2021-01-08,Standard Class,PO-19195,Phillina Ober,Home Office,United States,Naperville,...,60540,Central,OFF-LA-10003223,Office Supplies,Labels,Avery 508,11.784,3,0.2,4.2717
3,4,US-2021-112326,2021-01-04,2021-01-08,Standard Class,PO-19195,Phillina Ober,Home Office,United States,Naperville,...,60540,Central,OFF-ST-10002743,Office Supplies,Storage,SAFCO Boltless Steel Shelving,272.736,3,0.2,-64.7748
4,5,US-2021-141817,2021-01-05,2021-01-12,Standard Class,MB-18085,Mick Brown,Consumer,United States,Philadelphia,...,19143,East,OFF-AR-10003478,Office Supplies,Art,Avery Hi-Liter EverBold Pen Style Fluorescent ...,19.536,3,0.2,4.884


---

### People

#### Overview

In [18]:
# Preview the people sheet loaded into people_df.
people_df.head()

Unnamed: 0,Regional Manager,Region
0,Sadie Pawthorne,West
1,Chuck Magee,East
2,Roxanne Rodriguez,Central
3,Fred Suzuki,South


In [19]:
# Column dtypes and non-null counts for people_df.
people_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Regional Manager  4 non-null      object
 1   Region            4 non-null      object
dtypes: object(2)
memory usage: 196.0+ bytes


#### Renaming headers

In [20]:
# Shorten both headers to the names the regions table is built from later on.
people_df = people_df.rename(
    {'Regional Manager': 'manager', 'Region': 'region'},
    axis='columns',
)

#### Final inspection

In [21]:
# Confirm the renamed headers on people_df.
people_df.tail()

Unnamed: 0,manager,region
0,Sadie Pawthorne,West
1,Chuck Magee,East
2,Roxanne Rodriguez,Central
3,Fred Suzuki,South


---

### Returns

#### Overview

In [22]:
# Preview the first rows of the returns sheet loaded into returns_df.
returns_df.head()

Unnamed: 0,Returned,Order ID
0,Yes,US-2021-100762
1,Yes,US-2021-100762
2,Yes,US-2021-100762
3,Yes,US-2021-100762
4,Yes,US-2021-100867


In [23]:
# Preview the last rows of returns_df.
returns_df.tail()

Unnamed: 0,Returned,Order ID
795,Yes,US-2024-147886
796,Yes,US-2024-147998
797,Yes,US-2024-151127
798,Yes,US-2024-155999
799,Yes,US-2024-155999


In [24]:
# Column dtypes and non-null counts for returns_df.
returns_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Returned  800 non-null    object
 1   Order ID  800 non-null    object
dtypes: object(2)
memory usage: 12.6+ KB


**Checking for duplicates**

In [25]:
# Count rows that are exact copies of an earlier row (all columns compared).
print(returns_df.duplicated().sum())

504


**Remove duplicates**

In [26]:
# Every column is compared, so the rows dropped here are exact copies and
# keeping the first occurrence (the default) loses no information.
# Reassigning instead of using inplace=True keeps the step explicit; the
# original index labels are retained, so the index has gaps afterwards.
returns_df = returns_df.drop_duplicates()

**Checking for null values**

In [27]:
# Count missing values per column in returns_df.
returns_df.isnull().sum()

Returned    0
Order ID    0
dtype: int64

#### Renaming headers

In [28]:
# Normalise every header of the returns frame through snake_case.
returns_df = returns_df.rename(columns=snake_case)

#### Final inspection

In [29]:
# Confirm the renamed headers and the de-duplicated rows of returns_df.
returns_df.tail()

Unnamed: 0,returned,order_id
787,Yes,US-2024-136679
789,Yes,US-2024-147886
796,Yes,US-2024-147998
797,Yes,US-2024-151127
798,Yes,US-2024-155999


---

## Load data - MySQL prototype

### Break down data frames into tables

**Entity Relationship Diagram**

<img src="https://github.com/user-attachments/assets/bc9bce47-408f-4c4f-9676-d2fdf79db532" alt="Image" width="800" height="300">

**Note: samples**

If the dataset is too large to work with in full, a random sample of the rows is enough for prototyping.

In [None]:
# Draw a 20% random sample of the orders; random_state fixes the seed so the
# same rows are selected on every run.
# NOTE(review): orders_sample is not referenced by the table-building cells
# visible below, which use the full orders_df.
orders_sample = orders_df.sample(frac=0.2, random_state=42)
orders_sample.head()

**Table: orders**

In [None]:
# Order lines: one row per (order_id, product_id) pair.
# NOTE(review): if the same product appears on several lines of one order,
# only the first line is kept - confirm that discarding the rest is intended.
orders = (
    orders_df[[
        'order_id', 'order_date', 'ship_date', 'ship_mode', 'customer_id',
        'product_id', 'sales', 'quantity', 'discount', 'profit',
    ]]
    .drop_duplicates(subset=['order_id', 'product_id'])
)
orders.head()

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_id,product_id,sales,quantity,discount,profit
0,US-2021-103800,2021-01-03,2021-01-07,Standard Class,DP-13000,OFF-PA-10000174,16.448,2,0.2,5.5512
1,US-2021-112326,2021-01-04,2021-01-08,Standard Class,PO-19195,OFF-BI-10004094,3.54,2,0.8,-5.487
2,US-2021-112326,2021-01-04,2021-01-08,Standard Class,PO-19195,OFF-LA-10003223,11.784,3,0.2,4.2717
3,US-2021-112326,2021-01-04,2021-01-08,Standard Class,PO-19195,OFF-ST-10002743,272.736,3,0.2,-64.7748
4,US-2021-141817,2021-01-05,2021-01-12,Standard Class,MB-18085,OFF-AR-10003478,19.536,3,0.2,4.884


**Table: customers**

In [31]:
# Customer dimension: one row per customer_id.
# NOTE(review): the location columns come from the first order row seen for
# each customer; later rows with a different location are dropped - confirm.
customers = (
    orders_df[[
        'customer_id', 'customer_name', 'segment', 'country',
        'city', 'province', 'post_code', 'region',
    ]]
    .drop_duplicates(subset=['customer_id'])
)
customers.head()

Unnamed: 0,customer_id,customer_name,segment,country,city,province,post_code,region
0,DP-13000,Darren Powers,Consumer,United States,Houston,Texas,77095,Central
1,PO-19195,Phillina Ober,Home Office,United States,Naperville,Illinois,60540,Central
4,MB-18085,Mick Brown,Consumer,United States,Philadelphia,Pennsylvania,19143,East
5,ME-17320,Maria Etezadi,Home Office,United States,Henderson,Kentucky,42420,South
7,JO-15145,Jack O'Briant,Corporate,United States,Athens,Georgia,30605,South


**Table: products**

In [32]:
# Product dimension: one row per product_id, taken from the first order line
# that mentions it. A preliminary full-row drop_duplicates() is unnecessary:
# de-duplicating on product_id alone keeps exactly the same first-occurrence
# rows, so the extra pass only cost time.
# NOTE(review): if a product_id is listed under more than one product_name,
# only the first name survives - confirm that is acceptable.
products = (
    orders_df[['product_id', 'product_name', 'category', 'sub_category']]
    .drop_duplicates(subset=['product_id'])
)
products.head()

Unnamed: 0,product_id,product_name,category,sub_category
0,OFF-PA-10000174,"Message Book, Wirebound, Four 5 1/2"" X 4"" Form...",Office Supplies,Paper
1,OFF-BI-10004094,GBC Standard Plastic Binding Systems Combs,Office Supplies,Binders
2,OFF-LA-10003223,Avery 508,Office Supplies,Labels
3,OFF-ST-10002743,SAFCO Boltless Steel Shelving,Office Supplies,Storage
4,OFF-AR-10003478,Avery Hi-Liter EverBold Pen Style Fluorescent ...,Office Supplies,Art


**Table: regions**

In [33]:
# Region dimension: the distinct (region, manager) pairs, region column first.
regions = people_df.loc[:, ['region', 'manager']].drop_duplicates()
regions.head()

Unnamed: 0,region,manager
0,West,Sadie Pawthorne
1,East,Chuck Magee
2,Central,Roxanne Rodriguez
3,South,Fred Suzuki


**Table: return_stat**

In [34]:
# Return status: one row per order_id, keeping the first status seen for it.
return_stat = (
    returns_df.loc[:, ['order_id', 'returned']]
    .drop_duplicates(subset=['order_id'])
)
return_stat.head()

Unnamed: 0,order_id,returned
0,US-2021-100762,Yes
4,US-2021-100867,Yes
5,US-2021-102652,Yes
9,US-2021-103373,Yes
10,US-2021-103744,Yes


In [35]:
# Connection settings. The non-secret values keep their previous defaults but
# can be overridden from the environment. The password is no longer stored in
# the notebook: it is read from MYSQL_PASSWORD, or prompted for when that is
# unset or empty.
# NOTE(review): the password that used to be hard-coded here should be
# treated as exposed and rotated.
USER = os.environ.get('MYSQL_USER', 'root')
HOST = os.environ.get('MYSQL_HOST', 'localhost')
DATABASE = os.environ.get('MYSQL_DATABASE', 'superstore')
PASSWORD = os.environ.get('MYSQL_PASSWORD') or getpass.getpass('MySQL password: ')

# URL.create() escapes special characters in the credentials (for example
# '%', '@' or '/'); interpolating them into a URL string passes them through
# unescaped, where they can be misparsed.
engine = create_engine(
    URL.create(
        drivername='mysql+mysqlconnector',
        username=USER,
        password=PASSWORD,
        host=HOST,
        database=DATABASE,
    )
)

### First load

In [None]:
# Tables in the order they are written. The order is kept from the original
# cell; presumably referenced tables come before the tables that point at
# them - confirm against the ERD.
tables_in_load_order = [
    ('regions', regions),
    ('products', products),
    ('customers', customers),
    ('orders', orders),
    ('return_stat', return_stat),
]

try:
    # engine.begin() runs all five inserts in one transaction: it commits when
    # the block exits cleanly and rolls back if any insert raises, so a failed
    # run does not leave some tables loaded and others not (which would make a
    # re-run with if_exists='append' insert the same rows twice).
    # NOTE(review): this assumes the tables already exist; if to_sql has to
    # create one, MySQL may commit implicitly at that point.
    with engine.begin() as conn:
        for table_name, frame in tables_in_load_order:
            frame.to_sql(table_name, con=conn, if_exists='append', index=False)
except Exception as error:
    # Best-effort cell: report the failure instead of stopping the notebook.
    print(f'Uploading unsuccessful. {error}')