## import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FixedLocator, MaxNLocator

## settings

In [2]:
pd.set_option('display.max_rows', 20)

%matplotlib inline

## import data

In [3]:
%store -r sc_dataframes

In [4]:
sc_dataframes.keys()

dict_keys(['dim_customer', 'dim_market', 'dim_product', 'fact_forecast_monthly', 'fact_sales_monthly', 'freight_cost', 'gross_price', 'manufacturing_cost', 'post_invoice_deductions', 'pre_invoice_deductions'])

In [5]:
# Create dataframes from dictionary

sc_dim_customer = sc_dataframes['dim_customer'].copy()
sc_dim_market = sc_dataframes['dim_market'].copy()
sc_dim_product = sc_dataframes['dim_product'].copy()
sc_fact_forecast_monthly = sc_dataframes['fact_forecast_monthly'].copy()
sc_fact_sales_monthly = sc_dataframes['fact_sales_monthly'].copy()
sc_freight_cost = sc_dataframes['freight_cost'].copy()
sc_gross_price = sc_dataframes['gross_price'].copy()
sc_manufacturing_cost = sc_dataframes['manufacturing_cost'].copy()
sc_post_invoice_deductions = sc_dataframes['post_invoice_deductions'].copy()
sc_pre_invoice_deductions = sc_dataframes['pre_invoice_deductions'].copy()

## apply datatypes

### sc_dim_customer

In [6]:
sc_dim_customer.head(10)

Unnamed: 0,customer,market,platform,channel,customer_code
0,Electricalsocity,India,Brick & Mortar,Retailer,90002012
1,Electricalslytical,India,Brick & Mortar,Retailer,90002013
2,Ebay,India,E-Commerce,Retailer,90002010
3,Atliq Exclusive,India,Brick & Mortar,Retailer,90002011
4,Expression,India,Brick & Mortar,Retailer,90002014
5,AltiQ Exclusive,India,Brick & Mortar,Direct,70002017
6,Atliq e Store,India,E-Commerce,Direct,70002018
7,Propel,India,Brick & Mortar,Retailer,90002015
8,Amazon,India,E-Commerce,Retailer,90002016
9,Ezone,India,Brick & Mortar,Retailer,90002003


In [7]:
# check memory usage by column

sc_dim_customer.memory_usage(deep=True)

Index              132
customer         14138
market           13395
platform         14603
channel          13520
customer_code    13585
dtype: int64

In [8]:
# check datatypes

sc_dim_customer.dtypes

customer         object
market           object
platform         object
channel          object
customer_code    object
dtype: object

In [9]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_customer['customer'].nunique())
sc_dim_customer['customer'].value_counts()

76


Amazon             25
Atliq e Store      24
AltiQ Exclusive    16
Expert              5
Euronics            4
                   ..
Otto                1
Notebillig          1
BestBuy             1
Circuit City        1
Taobao              1
Name: customer, Length: 76, dtype: int64

In [10]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_customer['market'].nunique())
sc_dim_customer['market'].value_counts()

27


India             18
USA               15
Portugal          12
Spain             11
United Kingdom    11
                  ..
China              3
Mexico             2
Brazil             2
Chile              2
Columbia           1
Name: market, Length: 27, dtype: int64

In [11]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_customer['platform'].nunique())
sc_dim_customer['platform'].value_counts()

2


Brick & Mortar    150
E-Commerce         59
Name: platform, dtype: int64

In [12]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_customer['channel'].nunique())
sc_dim_customer['channel'].value_counts()

3


Retailer       164
Direct          40
Distributor      5
Name: channel, dtype: int64

In [13]:
# ensure datype 'int32' is apropriate by checking the range

print(sc_dim_customer['customer_code'].min())
print(sc_dim_customer['customer_code'].max())

70002017
90027207


In [14]:
# change datatypes

sc_dim_customer['customer'] = sc_dim_customer['platform'].astype('category')
sc_dim_customer['platform'] = sc_dim_customer['platform'].astype('category')
sc_dim_customer['market'] = sc_dim_customer['platform'].astype('category')
sc_dim_customer['channel'] = sc_dim_customer['channel'].astype('category')
sc_dim_customer['customer_code'] = sc_dim_customer['customer_code'].astype('int32')

In [15]:
# check memory usage by column

sc_dim_customer.memory_usage(deep=True)

Index            132
customer         455
market           455
platform         455
channel          513
customer_code    836
dtype: int64

### sc_dim_market

In [16]:
sc_dim_market.head(10)

Unnamed: 0,market,sub_zone,region
0,China,ROA,APAC
1,India,India,APAC
2,Indonesia,ROA,APAC
3,Japan,ROA,APAC
4,Pakistan,ROA,APAC
5,Philiphines,ROA,APAC
6,South Korea,ROA,APAC
7,Australia,ANZ,APAC
8,Newzealand,ANZ,APAC
9,Bangladesh,ROA,APAC


In [17]:
# check memory usage by column

sc_dim_market.memory_usage(deep=True)

Index        132
market      1737
sub_zone    1619
region      1627
dtype: int64

In [18]:
# check datatypes

sc_dim_market.dtypes

market      object
sub_zone    object
region      object
dtype: object

In [19]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_market['market'].nunique())
sc_dim_market['market'].value_counts()

27


China          1
Norway         1
Mexico         1
Columbia       1
Chile          1
              ..
Philiphines    1
Pakistan       1
Japan          1
Indonesia      1
Brazil         1
Name: market, Length: 27, dtype: int64

In [20]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_market['sub_zone'].nunique())
sc_dim_market['sub_zone'].value_counts()

7


ROA      7
NE       7
SE       4
LATAM    4
ANZ      2
nan      2
India    1
Name: sub_zone, dtype: int64

In [21]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_market['region'].nunique())
sc_dim_market['region'].value_counts()

4


EU       11
APAC     10
LATAM     4
nan       2
Name: region, dtype: int64

In [22]:
# change datatypes

sc_dim_market['market'] = sc_dim_market['market'].astype('category')
sc_dim_market['sub_zone'] = sc_dim_market['sub_zone'].astype('category')
sc_dim_market['region'] = sc_dim_market['region'].astype('category')

In [23]:
# check memory usage by column

sc_dim_market.memory_usage(deep=True)

Index        132
market      2836
sub_zone     749
region       441
dtype: int64

### sc_dim_product

In [24]:
sc_dim_product.head(10)

Unnamed: 0,product_code,division,segment,category,product,variant
0,A0118150101,P & A,Peripherals,Internal HDD,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,Standard
1,A0118150102,P & A,Peripherals,Internal HDD,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,Plus
2,A0118150103,P & A,Peripherals,Internal HDD,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,Premium
3,A0118150104,P & A,Peripherals,Internal HDD,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,Premium Plus
4,A0219150201,P & A,Peripherals,Internal HDD,AQ WereWolf NAS Internal Hard Drive HDD – 8.89 cm,Standard
5,A0219150202,P & A,Peripherals,Internal HDD,AQ WereWolf NAS Internal Hard Drive HDD – 8.89 cm,Plus
6,A0220150203,P & A,Peripherals,Internal HDD,AQ WereWolf NAS Internal Hard Drive HDD – 8.89 cm,Premium
7,A0320150301,P & A,Peripherals,Internal HDD,AQ Zion Saga,Standard
8,A0321150302,P & A,Peripherals,Internal HDD,AQ Zion Saga,Plus
9,A0321150303,P & A,Peripherals,Internal HDD,AQ Zion Saga,Premium


In [25]:
# check memory usage by column

sc_dim_product.memory_usage(deep=True)

Index             132
product_code    26996
division        24131
segment         26364
category        27459
product         28750
variant         26317
dtype: int64

In [26]:
# check datatypes

sc_dim_product.dtypes

product_code    object
division        object
segment         object
category        object
product         object
variant         object
dtype: object

In [27]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_product['division'].nunique())
sc_dim_product['division'].value_counts()

3


P & A    200
PC       161
N & S     36
Name: division, dtype: int64

In [28]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_product['segment'].nunique())
sc_dim_product['segment'].value_counts()

6


Notebook       129
Accessories    116
Peripherals     84
Desktop         32
Storage         27
Networking       9
Name: segment, dtype: int64

In [29]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_product['category'].nunique())
sc_dim_product['category'].value_counts()

14


Personal Laptop                61
Mouse                          48
Keyboard                       48
Business Laptop                44
Gaming Laptop                  40
Graphic Card                   36
MotherBoard                    20
Batteries                      20
Processors                     18
Personal Desktop               16
External Solid State Drives    15
USB Flash Drives               12
Internal HDD                   10
Wi fi extender                  9
Name: category, dtype: int64

In [30]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_product['product'].nunique())
sc_dim_product['product'].value_counts()

73


AQ Elite                                           8
AQ Gamer 3                                         8
AQ Gen Y                                           8
AQ Gen X                                           8
AQ Digit                                           8
                                                  ..
AQ 5000 Series Ultron 8 5900X Desktop Processor    3
AQ Electron 5 3600 Desktop Processor               3
AQ Electron 4 3600 Desktop Processor               3
AQ Electron 3 3600 Desktop Processor               3
AQ Wi Power Dx3                                    3
Name: product, Length: 73, dtype: int64

In [31]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_product['variant'].nunique())
sc_dim_product['variant'].value_counts()

27


Plus 2            35
Standard          33
Premium           33
Plus 1            31
Standard 1        23
                  ..
Plus Firey Red     5
Plus Cool Blue     5
Plus Black         5
Plus 1             4
Premium Plus       1
Name: variant, Length: 27, dtype: int64

In [32]:
# change datatypes

sc_dim_product['division'] = sc_dim_product['division'].astype('category')
sc_dim_product['segment'] = sc_dim_product['segment'].astype('category')
sc_dim_product['category'] = sc_dim_product['category'].astype('category')
sc_dim_product['product'] = sc_dim_product['product'].astype('category')
sc_dim_product['variant'] = sc_dim_product['variant'].astype('category')

In [33]:
# check memory usage by column

sc_dim_product.memory_usage(deep=True)

Index             132
product_code    26996
division          688
segment           965
category         1934
product          8076
variant          3296
dtype: int64

### sc_fact_forecast_monthly

In [34]:
sc_fact_forecast_monthly.head(10)

Unnamed: 0,date,product_code,customer_code,forecast_quantity,fiscal_year
0,2017-09-01,A6218160101,70008169,146,2018
1,2017-09-01,A6218160101,90008165,120,2018
2,2017-09-01,A6218160101,90008166,216,2018
3,2017-09-01,A6218160101,90008167,141,2018
4,2017-09-01,A6218160101,70008170,85,2018
5,2017-09-01,A6218160101,70010047,0,2018
6,2017-09-01,A6218160101,90027207,14,2018
7,2017-09-01,A6218160101,70023031,30,2018
8,2017-09-01,A6218160101,90023022,8,2018
9,2017-09-01,A6218160101,90023025,25,2018


In [35]:
# check memory usage by column

sc_fact_forecast_monthly.memory_usage(deep=True)

Index                      132
date                  15087528
product_code         128243988
customer_code        122586165
forecast_quantity     15087528
fiscal_year           15087528
dtype: int64

In [36]:
# check datatypes

sc_fact_forecast_monthly.dtypes

date                 datetime64[ns]
product_code                 object
customer_code                object
forecast_quantity             int64
fiscal_year                   int64
dtype: object

In [37]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_forecast_monthly['product_code'].nunique())
sc_fact_forecast_monthly['product_code'].value_counts()

389


A2218150201    7982
A2118150103    7982
A2118150105    7982
A3019150206    7982
A3019150204    7982
               ... 
A5318110102    1638
A5318110101    1635
A6018110101    1462
A6018110102    1454
A3718150104    1248
Name: product_code, Length: 389, dtype: int64

In [38]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_forecast_monthly['customer_code'].nunique())
sc_fact_forecast_monthly['customer_code'].value_counts()

209


90002008    10402
80007195    10392
80007196    10392
90002009    10386
90002004    10375
            ...  
90020101     6686
90020097     6678
90024184     6574
90024183     6570
90025209     6258
Name: customer_code, Length: 209, dtype: int64

In [39]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_fact_forecast_monthly['forecast_quantity'].min())
print(sc_fact_forecast_monthly['forecast_quantity'].max())

0
7554


In [40]:
# change datatypes

sc_fact_forecast_monthly['forecast_quantity'] = sc_fact_forecast_monthly['forecast_quantity'].astype('int16')
sc_fact_forecast_monthly['fiscal_year'] = sc_fact_forecast_monthly['fiscal_year'].astype('int16')

In [41]:
# check memory usage by column

sc_fact_forecast_monthly.memory_usage(deep=True)

Index                      132
date                  15087528
product_code         128243988
customer_code        122586165
forecast_quantity      3771882
fiscal_year            3771882
dtype: int64

### sc_fact_sales_monthly

In [42]:
sc_fact_sales_monthly.head(10)

Unnamed: 0,date,product_code,customer_code,sold_quantity,fiscal_year
0,2017-09-01,A6218160101,70008169,81,2018
1,2017-09-01,A6218160101,90008165,157,2018
2,2017-09-01,A6218160101,90008166,126,2018
3,2017-09-01,A6218160101,90008167,160,2018
4,2017-09-01,A6218160101,70008170,120,2018
5,2017-09-01,A6218160101,90027207,9,2018
6,2017-09-01,A6218160101,70023031,9,2018
7,2017-09-01,A6218160101,90023022,24,2018
8,2017-09-01,A6218160101,90023025,22,2018
9,2017-09-01,A6218160101,90023026,37,2018


In [43]:
# check memory usage by column

sc_fact_sales_monthly.memory_usage(deep=True)

Index                 132
date             11405648
product_code     96948008
customer_code    92670890
sold_quantity    11405648
fiscal_year      11405648
dtype: int64

In [44]:
# check datatypes

sc_fact_sales_monthly.dtypes

date             datetime64[ns]
product_code             object
customer_code            object
sold_quantity             int64
fiscal_year               int64
dtype: object

In [None]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_sales_monthly['product_code'].nunique())
sc_fact_sales_monthly['product_code'].value_counts()

In [None]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_sales_monthly['customer_code'].nunique())
sc_fact_sales_monthly['customer_code'].value_counts()

In [None]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_fact_sales_monthly['sold_quantity'].min())
print(sc_fact_sales_monthly['sold_quantity'].max())

In [None]:
# change datatypes

sc_fact_sales_monthly['sold_quantity'] = sc_fact_sales_monthly['sold_quantity'].astype('int16')
sc_fact_sales_monthly['fiscal_year'] = sc_fact_sales_monthly['fiscal_year'].astype('int16')

In [None]:
# check memory usage by column

sc_fact_sales_monthly.memory_usage(deep=True)

### sc_freight_cost

In [None]:
sc_freight_cost.head(10)

In [None]:
# check memory usage by column

sc_freight_cost.memory_usage(deep=True)

In [None]:
# check datatypes

sc_freight_cost.dtypes

In [None]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_freight_cost['market'].nunique())
sc_freight_cost['market'].value_counts()

In [None]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_freight_cost['fiscal_year'].nunique())
sc_freight_cost['fiscal_year'].value_counts()

In [None]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_freight_cost['freight_pct'].min())
print(sc_freight_cost['freight_pct'].max())

In [None]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_freight_cost['other_cost_pct'].min())
print(sc_freight_cost['other_cost_pct'].max())

In [None]:
# change datatypes

sc_freight_cost['market'] = sc_freight_cost['market'].astype('category')
sc_freight_cost['fiscal_year'] = sc_freight_cost['fiscal_year'].astype('int16')
sc_freight_cost['freight_pct'] = sc_freight_cost['freight_pct'].astype('float64')
sc_freight_cost['other_cost_pct'] = sc_freight_cost['other_cost_pct'].astype('float64')

In [None]:
# check memory usage by column

sc_freight_cost.memory_usage(deep=True)

### sc_gross_price

In [None]:
sc_gross_price.head(10)

In [None]:
# check memory usage by column

sc_gross_price.memory_usage(deep=True)

In [None]:
# check datatypes

sc_gross_price.dtypes

In [None]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_gross_price['product_code'].nunique())
sc_gross_price['product_code'].value_counts()

In [None]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_gross_price['fiscal_year'].nunique())
sc_gross_price['fiscal_year'].value_counts()

In [None]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_gross_price['gross_price'].min())
print(sc_gross_price['gross_price'].max())

In [None]:
# change datatypes

sc_gross_price['fiscal_year'] = sc_gross_price['fiscal_year'].astype('int16')
sc_gross_price['gross_price'] = sc_gross_price['gross_price'].astype('float64')

In [None]:
# check memory usage by column

sc_gross_price.memory_usage(deep=True)

### sc_manufacturing_cost

In [None]:
sc_manufacturing_cost.head(10)

In [None]:
# check memory usage by column

sc_manufacturing_cost.memory_usage(deep=True)

In [None]:
# check datatypes

sc_manufacturing_cost.dtypes

In [None]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_manufacturing_cost['product_code'].nunique())
sc_manufacturing_cost['product_code'].value_counts()

In [None]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_manufacturing_cost['cost_year'].nunique())
sc_manufacturing_cost['cost_year'].value_counts()

In [None]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_manufacturing_cost['manufacturing_cost'].min())
print(sc_manufacturing_cost['manufacturing_cost'].max())

In [None]:
# change datatypes

sc_manufacturing_cost['cost_year'] = sc_manufacturing_cost['cost_year'].astype('int16')
sc_manufacturing_cost['manufacturing_cost'] = sc_manufacturing_cost['manufacturing_cost'].astype('float64')

In [None]:
# check memory usage by column

sc_manufacturing_cost.memory_usage(deep=True)

### sc_post_invoice_deductions

In [None]:
sc_post_invoice_deductions.head(10)

In [None]:
# check memory usage by column

sc_post_invoice_deductions.memory_usage(deep=True)

In [None]:
# check datatypes

sc_post_invoice_deductions.dtypes

In [None]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_post_invoice_deductions['customer_code'].nunique())
sc_post_invoice_deductions['customer_code'].value_counts()

In [None]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_post_invoice_deductions['product_code'].nunique())
sc_post_invoice_deductions['product_code'].value_counts()

In [None]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_post_invoice_deductions['discounts_pct'].min())
print(sc_post_invoice_deductions['discounts_pct'].max())

In [None]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_post_invoice_deductions['other_deductions_pct'].min())
print(sc_post_invoice_deductions['other_deductions_pct'].max())

In [None]:
# change datatypes

sc_post_invoice_deductions['date'] = sc_post_invoice_deductions['date'].astype('datetime64')
sc_post_invoice_deductions['discounts_pct'] = sc_post_invoice_deductions['discounts_pct'].astype('float64')
sc_post_invoice_deductions['other_deductions_pct'] = sc_post_invoice_deductions['other_deductions_pct'].astype('float64')
sc_post_invoice_deductions['fiscal_year'] = sc_post_invoice_deductions['fiscal_year'].astype('int16')

In [None]:
# check memory usage by column

sc_post_invoice_deductions.memory_usage(deep=True)

In [None]:
sc_post_invoice_deductions['date'].dt.year.value_counts()

In [None]:
# split dataframe into two to save on file size

print(sc_post_invoice_deductions[sc_post_invoice_deductions['date'].dt.year <= 2020]['date'].dt.year.value_counts().sum())
print(sc_post_invoice_deductions[sc_post_invoice_deductions['date'].dt.year > 2020]['date'].dt.year.value_counts().sum())

In [None]:
sc_post_invoice_deductions_18_20 = sc_post_invoice_deductions[sc_post_invoice_deductions['date'].dt.year <= 2020].reset_index().drop('index', axis=1)
sc_post_invoice_deductions_21_22 = sc_post_invoice_deductions[sc_post_invoice_deductions['date'].dt.year > 2020].reset_index().drop('index', axis=1)

In [None]:
# check memory usage by column

sc_post_invoice_deductions_18_20.memory_usage(deep=True)

In [None]:
# check memory usage by column

sc_post_invoice_deductions_21_22.memory_usage(deep=True)

### sc_pre_invoice_deductions

In [None]:
sc_pre_invoice_deductions.head(10)

In [None]:
# check memory usage by column

sc_pre_invoice_deductions.memory_usage(deep=True)

In [None]:
# check datatypes

sc_pre_invoice_deductions.dtypes

In [None]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_pre_invoice_deductions['pre_invoice_discount_pct'].min())
print(sc_pre_invoice_deductions['pre_invoice_discount_pct'].max())

In [None]:
# change datatypes

sc_pre_invoice_deductions['fiscal_year'] = sc_pre_invoice_deductions['fiscal_year'].astype('int16')
sc_pre_invoice_deductions['pre_invoice_discount_pct'] = sc_pre_invoice_deductions['pre_invoice_discount_pct'].astype('float64')

In [None]:
# check memory usage by column

sc_pre_invoice_deductions.memory_usage(deep=True)

## Save dataframes

In [None]:
# Save dtaframes into a new dictionary

sc_dataframes_cleaned = {
    'sc_dim_customer': sc_dim_customer,
    'sc_dim_market': sc_dim_market,
    'sc_dim_product': sc_dim_product,
    'sc_fact_forecast_monthly': sc_fact_forecast_monthly,
    'sc_fact_sales_monthly': sc_fact_sales_monthly,
    'sc_freight_cost': sc_freight_cost,
    'sc_gross_price': sc_gross_price,
    'sc_manufacturing_cost': sc_manufacturing_cost,
    'sc_post_invoice_deductions_18_20': sc_post_invoice_deductions_18_20,
    'sc_post_invoice_deductions_21_22': sc_post_invoice_deductions_21_22,
    'sc_pre_invoice_deductions': sc_pre_invoice_deductions,
}

%store sc_dataframes_cleaned