In [11]:
import pandas as pd

import yaml
import chardet

In [12]:
with open('../config/config.yaml') as file:
    config = yaml.safe_load(file)
config

{'raw_data_paths': {'raw_data': '../data/raw/Sample-Superstore.csv'}}

In [None]:
# detect file encoding
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        raw_data = f.read()
    result = chardet.detect(raw_data)
    return result['encoding'], result['confidence']

file_to_check = config['raw_data_paths']['raw_data']
encoding, confidence = detect_encoding(file_to_check)

In [17]:
print(encoding)
print('confidence: ', confidence)

Windows-1252
confidence:  0.73


In [19]:
df = pd.read_csv(config['raw_data_paths']['raw_data'], encoding='Windows-1252')

In [20]:
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


# Normalize

In [45]:
df.columns = df.columns.str.replace(' ', '').str.replace('-', '')

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   RowID         9994 non-null   int64         
 1   OrderID       9994 non-null   object        
 2   OrderDate     9994 non-null   datetime64[ns]
 3   ShipDate      9994 non-null   object        
 4   ShipMode      9994 non-null   object        
 5   CustomerID    9994 non-null   object        
 6   CustomerName  9994 non-null   object        
 7   Segment       9994 non-null   object        
 8   Country       9994 non-null   object        
 9   City          9994 non-null   object        
 10  State         9994 non-null   object        
 11  PostalCode    9994 non-null   int64         
 12  Region        9994 non-null   object        
 13  ProductID     9994 non-null   object        
 14  Category      9994 non-null   object        
 15  SubCategory   9994 non-null   object  

# Set Data Types

In [46]:
df.columns

Index(['RowID', 'OrderID', 'OrderDate', 'ShipDate', 'ShipMode', 'CustomerID',
       'CustomerName', 'Segment', 'Country', 'City', 'State', 'PostalCode',
       'Region', 'ProductID', 'Category', 'SubCategory', 'ProductName',
       'Sales', 'Quantity', 'Discount', 'Profit'],
      dtype='object')

In [60]:
df['OrderID']       = df['OrderID'].astype(pd.StringDtype())
df['OrderDate']     = pd.to_datetime(df['OrderDate']).dt.floor('D')
df['ShipDate']      = pd.to_datetime(df['ShipDate']).dt.floor('D')
df['ShipMode']      = df['ShipMode'].astype('category')
df['CustomerID']    = df['CustomerID'].astype(pd.StringDtype())
df['CustomerName']  = df['CustomerName'].astype(pd.StringDtype())
df['Segment']       = df['Segment'].astype('category')
df['Country']       = df['Country'].astype('category')
df['City']          = df['City'].astype('category')
df['State']         = df['State'].astype('category')
df['Region']        = df['Region'].astype('category')
df['ProductID']     = df['ProductID'].astype(pd.StringDtype())
df['Category']      = df['Category'].astype('category')
df['SubCategory']   = df['SubCategory'].astype('category')
df['ProductName']   = df['ProductName'].astype(pd.StringDtype())

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   RowID         9994 non-null   int64         
 1   OrderID       9994 non-null   string        
 2   OrderDate     9994 non-null   datetime64[ns]
 3   ShipDate      9994 non-null   datetime64[ns]
 4   ShipMode      9994 non-null   category      
 5   CustomerID    9994 non-null   string        
 6   CustomerName  9994 non-null   string        
 7   Segment       9994 non-null   category      
 8   Country       9994 non-null   category      
 9   City          9994 non-null   category      
 10  State         9994 non-null   category      
 11  PostalCode    9994 non-null   int64         
 12  Region        9994 non-null   category      
 13  ProductID     9994 non-null   string        
 14  Category      9994 non-null   category      
 15  SubCategory   9994 non-null   category

# Making sense of the data

## Meaning of Order ID

In [38]:
df.OrderID.value_counts()

OrderID
CA-2017-100111    14
CA-2017-157987    12
CA-2016-165330    11
US-2016-108504    11
CA-2015-131338    10
                  ..
CA-2016-157259     1
CA-2017-107986     1
US-2015-112508     1
US-2016-126844     1
CA-2017-119914     1
Name: count, Length: 5009, dtype: int64

In [40]:
df[df.OrderID == 'CA-2017-100111']

Unnamed: 0,RowID,OrderID,OrderDate,ShipDate,ShipMode,CustomerID,CustomerName,Segment,Country,City,...,PostalCode,Region,ProductID,Category,Sub-Category,ProductName,Sales,Quantity,Discount,Profit
6090,6091,CA-2017-100111,9/20/2017,9/26/2017,Standard Class,SV-20365,Seth Vernon,Consumer,United States,New York City,...,10035,East,FUR-CH-10003846,Furniture,Chairs,Hon Valutask Swivel Chairs,272.646,3,0.1,18.1764
6091,6092,CA-2017-100111,9/20/2017,9/26/2017,Standard Class,SV-20365,Seth Vernon,Consumer,United States,New York City,...,10035,East,TEC-AC-10002647,Technology,Accessories,Logitech Wireless Boombox Speaker - portable -...,212.8,2,0.0,95.76
6092,6093,CA-2017-100111,9/20/2017,9/26/2017,Standard Class,SV-20365,Seth Vernon,Consumer,United States,New York City,...,10035,East,OFF-PA-10000807,Office Supplies,Paper,"TOPS ""Important Message"" Pads, Canary, 4-1/4 x...",38.52,9,0.0,18.1044
6093,6094,CA-2017-100111,9/20/2017,9/26/2017,Standard Class,SV-20365,Seth Vernon,Consumer,United States,New York City,...,10035,East,TEC-AC-10001465,Technology,Accessories,SanDisk Cruzer 64 GB USB Flash Drive,72.64,2,0.0,21.792
6094,6095,CA-2017-100111,9/20/2017,9/26/2017,Standard Class,SV-20365,Seth Vernon,Consumer,United States,New York City,...,10035,East,OFF-ST-10000615,Office Supplies,Storage,"SimpliFile Personal File, Black Granite, 15w x...",45.4,4,0.0,12.712
6095,6096,CA-2017-100111,9/20/2017,9/26/2017,Standard Class,SV-20365,Seth Vernon,Consumer,United States,New York City,...,10035,East,OFF-PA-10002713,Office Supplies,Paper,"Adams Phone Message Book, 200 Message Capacity...",13.76,2,0.0,6.3296
6096,6097,CA-2017-100111,9/20/2017,9/26/2017,Standard Class,SV-20365,Seth Vernon,Consumer,United States,New York City,...,10035,East,FUR-CH-10003061,Furniture,Chairs,"Global Leather Task Chair, Black",80.991,1,0.1,8.0991
6097,6098,CA-2017-100111,9/20/2017,9/26/2017,Standard Class,SV-20365,Seth Vernon,Consumer,United States,New York City,...,10035,East,OFF-BI-10000343,Office Supplies,Binders,"Pressboard Covers with Storage Hooks, 9 1/2"" x...",11.784,3,0.2,3.9771
6098,6099,CA-2017-100111,9/20/2017,9/26/2017,Standard Class,SV-20365,Seth Vernon,Consumer,United States,New York City,...,10035,East,OFF-FA-10000304,Office Supplies,Fasteners,Advantus Push Pins,4.36,2,0.0,1.7876
6099,6100,CA-2017-100111,9/20/2017,9/26/2017,Standard Class,SV-20365,Seth Vernon,Consumer,United States,New York City,...,10035,East,FUR-CH-10004086,Furniture,Chairs,Hon 4070 Series Pagoda Armless Upholstered Sta...,2888.127,11,0.1,609.7157


In [41]:
df[df.OrderID == 'CA-2015-131338']

Unnamed: 0,RowID,OrderID,OrderDate,ShipDate,ShipMode,CustomerID,CustomerName,Segment,Country,City,...,PostalCode,Region,ProductID,Category,Sub-Category,ProductName,Sales,Quantity,Discount,Profit
1579,1580,CA-2015-131338,8/9/2015,8/12/2015,First Class,NP-18325,Naresj Patel,Consumer,United States,New York City,...,10024,East,TEC-PH-10003012,Technology,Phones,Nortel Meridian M3904 Professional Digital phone,307.98,2,0.0,89.3142
1580,1581,CA-2015-131338,8/9/2015,8/12/2015,First Class,NP-18325,Naresj Patel,Consumer,United States,New York City,...,10024,East,FUR-TA-10002607,Furniture,Tables,KI Conference Tables,382.806,9,0.4,-153.1224
1581,1582,CA-2015-131338,8/9/2015,8/12/2015,First Class,NP-18325,Naresj Patel,Consumer,United States,New York City,...,10024,East,OFF-ST-10000642,Office Supplies,Storage,"Tennsco Lockers, Gray",41.96,2,0.0,2.9372
1582,1583,CA-2015-131338,8/9/2015,8/12/2015,First Class,NP-18325,Naresj Patel,Consumer,United States,New York City,...,10024,East,OFF-BI-10000545,Office Supplies,Binders,GBC Ibimaster 500 Manual ProClick Binding System,1217.568,2,0.2,456.588
1583,1584,CA-2015-131338,8/9/2015,8/12/2015,First Class,NP-18325,Naresj Patel,Consumer,United States,New York City,...,10024,East,FUR-FU-10002157,Furniture,Furnishings,Artistic Insta-Plaque,47.04,3,0.0,18.3456
1584,1585,CA-2015-131338,8/9/2015,8/12/2015,First Class,NP-18325,Naresj Patel,Consumer,United States,New York City,...,10024,East,FUR-FU-10001706,Furniture,Furnishings,Longer-Life Soft White Bulbs,6.16,2,0.0,2.9568
1585,1586,CA-2015-131338,8/9/2015,8/12/2015,First Class,NP-18325,Naresj Patel,Consumer,United States,New York City,...,10024,East,TEC-PH-10000984,Technology,Phones,Panasonic KX-TG9471B,979.95,5,0.0,274.386
1586,1587,CA-2015-131338,8/9/2015,8/12/2015,First Class,NP-18325,Naresj Patel,Consumer,United States,New York City,...,10024,East,OFF-PA-10001357,Office Supplies,Paper,Xerox 1886,143.7,3,0.0,68.976
1587,1588,CA-2015-131338,8/9/2015,8/12/2015,First Class,NP-18325,Naresj Patel,Consumer,United States,New York City,...,10024,East,OFF-FA-10000992,Office Supplies,Fasteners,"Acco Clips to Go Binder Clips, 24 Clips in Two...",10.65,3,0.0,5.0055
1588,1589,CA-2015-131338,8/9/2015,8/12/2015,First Class,NP-18325,Naresj Patel,Consumer,United States,New York City,...,10024,East,TEC-AC-10002600,Technology,Accessories,Belkin QODE FastFit Bluetooth Keyboard,247.8,4,0.0,34.692


## Sales and Profit

In [42]:
# What is the meaning of sales? 
df[df.ProductID == 'FUR-TA-10002607']

Unnamed: 0,RowID,OrderID,OrderDate,ShipDate,ShipMode,CustomerID,CustomerName,Segment,Country,City,...,PostalCode,Region,ProductID,Category,Sub-Category,ProductName,Sales,Quantity,Discount,Profit
241,242,CA-2016-157749,6/4/2016,6/9/2016,Second Class,KL-16645,Ken Lonsdale,Consumer,United States,Chicago,...,60610,Central,FUR-TA-10002607,Furniture,Tables,KI Conference Tables,177.225,5,0.5,-120.513
746,747,CA-2014-124429,5/27/2014,5/27/2014,Same Day,MH-17785,Maya Herman,Corporate,United States,San Diego,...,92105,West,FUR-TA-10002607,Furniture,Tables,KI Conference Tables,567.12,10,0.2,-28.356
1580,1581,CA-2015-131338,8/9/2015,8/12/2015,First Class,NP-18325,Naresj Patel,Consumer,United States,New York City,...,10024,East,FUR-TA-10002607,Furniture,Tables,KI Conference Tables,382.806,9,0.4,-153.1224
1814,1815,CA-2015-131597,9/14/2015,9/18/2015,Standard Class,SP-20620,Stefania Perrino,Corporate,United States,Los Angeles,...,90045,West,FUR-TA-10002607,Furniture,Tables,KI Conference Tables,170.136,3,0.2,-8.5068
4294,4295,CA-2017-101581,10/22/2017,10/27/2017,Standard Class,DW-13195,David Wiener,Corporate,United States,Redmond,...,97756,West,FUR-TA-10002607,Furniture,Tables,KI Conference Tables,177.225,5,0.5,-120.513
5373,5374,CA-2015-118738,10/24/2015,10/30/2015,Standard Class,AG-10495,Andrew Gjertsen,Corporate,United States,Houston,...,77041,Central,FUR-TA-10002607,Furniture,Tables,KI Conference Tables,347.361,7,0.3,-69.4722
6799,6800,CA-2016-109827,12/25/2016,1/1/2017,Standard Class,LW-16825,Laurel Workman,Corporate,United States,Phoenix,...,85023,West,FUR-TA-10002607,Furniture,Tables,KI Conference Tables,35.445,1,0.5,-24.1026
9031,9032,CA-2017-128041,9/1/2017,9/1/2017,Same Day,RW-19540,Rick Wilson,Corporate,United States,Seattle,...,98103,West,FUR-TA-10002607,Furniture,Tables,KI Conference Tables,283.56,4,0.0,45.3696


In [43]:
283.56/4 * 0.5

35.445

* **Sales**: Total Price of order after discount = (1 - Discount) * Price * Quantity
* **Profit**: Total Profit of order; Profit = (ProfitRate * Price - Discount * Price) * Quantity

# Check Data Integrity

In [27]:
# duplicate rows
df.duplicated().sum()

0

# Export to database

In [None]:
orders_col      = ['OrderID', 'OrderDate','Sales', 'Quantity', 'Discount', 'Profit']
shipment_col    = ['ShipDate', 'ShipMode']
customer_col    = ['CustomerID','CustomerName', 'Segment']
address_col     = ['Country', 'City', 'State', 'PostalCode','Region']
product_col     = ['ProductID', 'Category', 'SubCategory', 'ProductName']