## import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FixedLocator, MaxNLocator

## settings

In [2]:
pd.set_option('display.max_rows', 20)

%matplotlib inline

## import data

In [3]:
%store -r sc_dataframes

In [4]:
sc_dataframes.keys()

dict_keys(['dim_customer', 'dim_market', 'dim_product', 'fact_forecast_monthly', 'fact_sales_monthly', 'freight_cost', 'gross_price', 'manufacturing_cost', 'post_invoice_deductions', 'pre_invoice_deductions'])

In [5]:
# Create dataframes from dictionary

sc_dim_customer = sc_dataframes['dim_customer'].copy()
sc_dim_market = sc_dataframes['dim_market'].copy()
sc_dim_product = sc_dataframes['dim_product'].copy()
sc_fact_forecast_monthly = sc_dataframes['fact_forecast_monthly'].copy()
sc_fact_sales_monthly = sc_dataframes['fact_sales_monthly'].copy()
sc_freight_cost = sc_dataframes['freight_cost'].copy()
sc_gross_price = sc_dataframes['gross_price'].copy()
sc_manufacturing_cost = sc_dataframes['manufacturing_cost'].copy()
sc_post_invoice_deductions = sc_dataframes['post_invoice_deductions'].copy()
sc_pre_invoice_deductions = sc_dataframes['pre_invoice_deductions'].copy()

## apply datatypes

### sc_dim_customer

In [6]:
sc_dim_customer.head(10)

Unnamed: 0,customer,market,platform,channel,customer_code
0,Electricalsocity,India,Brick & Mortar,Retailer,90002012
1,Electricalslytical,India,Brick & Mortar,Retailer,90002013
2,Ebay,India,E-Commerce,Retailer,90002010
3,Atliq Exclusive,India,Brick & Mortar,Retailer,90002011
4,Expression,India,Brick & Mortar,Retailer,90002014
5,AltiQ Exclusive,India,Brick & Mortar,Direct,70002017
6,Atliq e Store,India,E-Commerce,Direct,70002018
7,Propel,India,Brick & Mortar,Retailer,90002015
8,Amazon,India,E-Commerce,Retailer,90002016
9,Ezone,India,Brick & Mortar,Retailer,90002003


In [7]:
# check memory usage by column

sc_dim_customer.memory_usage(deep=True)

Index              132
customer         14138
market           13395
platform         14603
channel          13520
customer_code    13585
dtype: int64

In [8]:
# check datatypes

sc_dim_customer.dtypes

customer         object
market           object
platform         object
channel          object
customer_code    object
dtype: object

In [9]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_customer['customer'].nunique())
sc_dim_customer['customer'].value_counts()

76


Amazon             25
Atliq e Store      24
AltiQ Exclusive    16
Expert              5
Euronics            4
                   ..
Otto                1
Notebillig          1
BestBuy             1
Circuit City        1
Taobao              1
Name: customer, Length: 76, dtype: int64

In [10]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_customer['market'].nunique())
sc_dim_customer['market'].value_counts()

27


India             18
USA               15
Portugal          12
Spain             11
United Kingdom    11
                  ..
China              3
Mexico             2
Brazil             2
Chile              2
Columbia           1
Name: market, Length: 27, dtype: int64

In [11]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_customer['platform'].nunique())
sc_dim_customer['platform'].value_counts()

2


Brick & Mortar    150
E-Commerce         59
Name: platform, dtype: int64

In [12]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_customer['channel'].nunique())
sc_dim_customer['channel'].value_counts()

3


Retailer       164
Direct          40
Distributor      5
Name: channel, dtype: int64

In [13]:
# ensure datype 'int32' is apropriate by checking the range

print(sc_dim_customer['customer_code'].min())
print(sc_dim_customer['customer_code'].max())

70002017
90027207


In [14]:
# change datatypes

sc_dim_customer['customer'] = sc_dim_customer['platform'].astype('category')
sc_dim_customer['platform'] = sc_dim_customer['platform'].astype('category')
sc_dim_customer['market'] = sc_dim_customer['platform'].astype('category')
sc_dim_customer['channel'] = sc_dim_customer['channel'].astype('category')
sc_dim_customer['customer_code'] = sc_dim_customer['customer_code'].astype('int32')

In [15]:
# check memory usage by column

sc_dim_customer.memory_usage(deep=True)

Index            132
customer         455
market           455
platform         455
channel          513
customer_code    836
dtype: int64

### sc_dim_market

In [16]:
sc_dim_market.head(10)

Unnamed: 0,market,sub_zone,region
0,China,ROA,APAC
1,India,India,APAC
2,Indonesia,ROA,APAC
3,Japan,ROA,APAC
4,Pakistan,ROA,APAC
5,Philiphines,ROA,APAC
6,South Korea,ROA,APAC
7,Australia,ANZ,APAC
8,Newzealand,ANZ,APAC
9,Bangladesh,ROA,APAC


In [17]:
# check memory usage by column

sc_dim_market.memory_usage(deep=True)

Index        132
market      1737
sub_zone    1619
region      1627
dtype: int64

In [18]:
# check datatypes

sc_dim_market.dtypes

market      object
sub_zone    object
region      object
dtype: object

In [19]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_market['market'].nunique())
sc_dim_market['market'].value_counts()

27


China          1
Norway         1
Mexico         1
Columbia       1
Chile          1
              ..
Philiphines    1
Pakistan       1
Japan          1
Indonesia      1
Brazil         1
Name: market, Length: 27, dtype: int64

In [20]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_market['sub_zone'].nunique())
sc_dim_market['sub_zone'].value_counts()

7


ROA      7
NE       7
SE       4
LATAM    4
ANZ      2
nan      2
India    1
Name: sub_zone, dtype: int64

In [21]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_market['region'].nunique())
sc_dim_market['region'].value_counts()

4


EU       11
APAC     10
LATAM     4
nan       2
Name: region, dtype: int64

In [22]:
# change datatypes

sc_dim_market['market'] = sc_dim_market['market'].astype('category')
sc_dim_market['sub_zone'] = sc_dim_market['sub_zone'].astype('category')
sc_dim_market['region'] = sc_dim_market['region'].astype('category')

In [23]:
# check memory usage by column

sc_dim_market.memory_usage(deep=True)

Index        132
market      2836
sub_zone     749
region       441
dtype: int64

### sc_dim_product

In [24]:
sc_dim_product.head(10)

Unnamed: 0,product_code,division,segment,category,product,variant
0,A0118150101,P & A,Peripherals,Internal HDD,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,Standard
1,A0118150102,P & A,Peripherals,Internal HDD,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,Plus
2,A0118150103,P & A,Peripherals,Internal HDD,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,Premium
3,A0118150104,P & A,Peripherals,Internal HDD,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,Premium Plus
4,A0219150201,P & A,Peripherals,Internal HDD,AQ WereWolf NAS Internal Hard Drive HDD – 8.89 cm,Standard
5,A0219150202,P & A,Peripherals,Internal HDD,AQ WereWolf NAS Internal Hard Drive HDD – 8.89 cm,Plus
6,A0220150203,P & A,Peripherals,Internal HDD,AQ WereWolf NAS Internal Hard Drive HDD – 8.89 cm,Premium
7,A0320150301,P & A,Peripherals,Internal HDD,AQ Zion Saga,Standard
8,A0321150302,P & A,Peripherals,Internal HDD,AQ Zion Saga,Plus
9,A0321150303,P & A,Peripherals,Internal HDD,AQ Zion Saga,Premium


In [25]:
# check memory usage by column

sc_dim_product.memory_usage(deep=True)

Index             132
product_code    26996
division        24131
segment         26364
category        27459
product         28750
variant         26317
dtype: int64

In [26]:
# check datatypes

sc_dim_product.dtypes

product_code    object
division        object
segment         object
category        object
product         object
variant         object
dtype: object

In [27]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_product['division'].nunique())
sc_dim_product['division'].value_counts()

3


P & A    200
PC       161
N & S     36
Name: division, dtype: int64

In [28]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_product['segment'].nunique())
sc_dim_product['segment'].value_counts()

6


Notebook       129
Accessories    116
Peripherals     84
Desktop         32
Storage         27
Networking       9
Name: segment, dtype: int64

In [29]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_product['category'].nunique())
sc_dim_product['category'].value_counts()

14


Personal Laptop                61
Mouse                          48
Keyboard                       48
Business Laptop                44
Gaming Laptop                  40
Graphic Card                   36
MotherBoard                    20
Batteries                      20
Processors                     18
Personal Desktop               16
External Solid State Drives    15
USB Flash Drives               12
Internal HDD                   10
Wi fi extender                  9
Name: category, dtype: int64

In [30]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_product['product'].nunique())
sc_dim_product['product'].value_counts()

73


AQ Elite                                           8
AQ Gamer 3                                         8
AQ Gen Y                                           8
AQ Gen X                                           8
AQ Digit                                           8
                                                  ..
AQ 5000 Series Ultron 8 5900X Desktop Processor    3
AQ Electron 5 3600 Desktop Processor               3
AQ Electron 4 3600 Desktop Processor               3
AQ Electron 3 3600 Desktop Processor               3
AQ Wi Power Dx3                                    3
Name: product, Length: 73, dtype: int64

In [31]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_dim_product['variant'].nunique())
sc_dim_product['variant'].value_counts()

27


Plus 2            35
Standard          33
Premium           33
Plus 1            31
Standard 1        23
                  ..
Plus Firey Red     5
Plus Cool Blue     5
Plus Black         5
Plus 1             4
Premium Plus       1
Name: variant, Length: 27, dtype: int64

In [32]:
# change datatypes

sc_dim_product['division'] = sc_dim_product['division'].astype('category')
sc_dim_product['segment'] = sc_dim_product['segment'].astype('category')
sc_dim_product['category'] = sc_dim_product['category'].astype('category')
sc_dim_product['product'] = sc_dim_product['product'].astype('category')
sc_dim_product['variant'] = sc_dim_product['variant'].astype('category')

In [33]:
# check memory usage by column

sc_dim_product.memory_usage(deep=True)

Index             132
product_code    26996
division          688
segment           965
category         1934
product          8076
variant          3296
dtype: int64

### sc_fact_forecast_monthly

In [34]:
sc_fact_forecast_monthly.head(10)

Unnamed: 0,date,division,category,product_code,product,market,platform,channel,customer_code,customer_name,forecast_quantity
0,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,Brick & Mortar,Direct,70008169,AltiQ Exclusive,146
1,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,Brick & Mortar,Retailer,90008165,Forward Stores,120
2,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,Brick & Mortar,Retailer,90008166,Sound,216
3,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,Brick & Mortar,Retailer,90008167,Electricalsocity,141
4,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,E-Commerce,Direct,70008170,Atliq e Store,85
5,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Bangladesh,Brick & Mortar,Direct,70010047,AltiQ Exclusive,0
6,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Brazil,E-Commerce,Retailer,90027207,Amazon,14
7,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Canada,Brick & Mortar,Direct,70023031,AltiQ Exclusive,30
8,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Canada,Brick & Mortar,Retailer,90023022,Nomad Stores,8
9,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Canada,Brick & Mortar,Retailer,90023025,Premium Stores,25


In [35]:
# check memory usage by column

sc_fact_forecast_monthly.memory_usage(deep=True)

Index                      132
date                  15087528
division             114801051
category             129990264
product_code         128243988
product              139470262
market               120854350
platform             131763407
channel              122011369
customer_code        122586165
customer_name        127224927
forecast_quantity     15087528
dtype: int64

In [36]:
# check datatypes

sc_fact_forecast_monthly.dtypes

date                 datetime64[ns]
division                     object
category                     object
product_code                 object
product                      object
market                       object
platform                     object
channel                      object
customer_code                object
customer_name                object
forecast_quantity             int64
dtype: object

In [37]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_forecast_monthly['division'].nunique())
sc_fact_forecast_monthly['division'].value_counts()

3


P & A    1033285
PC        709097
N & S     143559
Name: division, dtype: int64

In [38]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_forecast_monthly['category'].nunique())
sc_fact_forecast_monthly['category'].value_counts()

14


Personal Laptop                283938
Keyboard                       253973
Mouse                          245104
Business Laptop                203007
Graphic Card                   184247
Gaming Laptop                  177644
Processors                     102404
MotherBoard                     98440
Batteries                       88528
External Solid State Drives     78426
Internal HDD                    60589
Wi fi extender                  46323
Personal Desktop                44508
USB Flash Drives                18810
Name: category, dtype: int64

In [39]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_forecast_monthly['product_code'].nunique())
sc_fact_forecast_monthly['product_code'].value_counts()

389


A2218150201    7982
A2118150103    7982
A2118150105    7982
A3019150206    7982
A3019150204    7982
               ... 
A5318110102    1638
A5318110101    1635
A6018110101    1462
A6018110102    1454
A3718150104    1248
Name: product_code, Length: 389, dtype: int64

In [40]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_forecast_monthly['product'].nunique())
sc_fact_forecast_monthly['product'].value_counts()

71


AQ Digit                                56068
AQ BZ Compact                           54125
AQ Mforce Gen Y                         50530
AQ Master wireless x1                   47883
AQ Elite                                46882
                                        ...  
AQ Lumina                               11286
AQ Marquee P4                            9405
AQ Electron 3 3600 Desktop Processor     9405
AQ Clx3                                  7524
AQ Pen Drive 2 IN 1                      5643
Name: product, Length: 71, dtype: int64

In [41]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_forecast_monthly['market'].nunique())
sc_fact_forecast_monthly['market']

27


0          Australia
1          Australia
2          Australia
3          Australia
4          Australia
             ...    
1885936          USA
1885937          USA
1885938          USA
1885939          USA
1885940          USA
Name: market, Length: 1885941, dtype: object

In [42]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_forecast_monthly['platform'].nunique())
sc_fact_forecast_monthly['platform'].value_counts()

2


Brick & Mortar    1351340
E-Commerce         534601
Name: platform, dtype: int64

In [43]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_forecast_monthly['channel'].nunique())
sc_fact_forecast_monthly['channel'].value_counts()

3


Retailer       1469663
Direct          364726
Distributor      51552
Name: channel, dtype: int64

In [44]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_forecast_monthly['customer_code'].nunique())
sc_fact_forecast_monthly['customer_code'].value_counts()

209


90002008    10402
80007195    10392
80007196    10392
90002009    10386
90002004    10375
            ...  
90020101     6686
90020097     6678
90024184     6574
90024183     6570
90025209     6258
Name: customer_code, Length: 209, dtype: int64

In [45]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_forecast_monthly['customer_name'].nunique())
sc_fact_forecast_monthly['customer_name'].value_counts()

75


Amazon             260583
Atliq e Store      215548
AltiQ Exclusive    149178
Expert              43830
Euronics            32565
                    ...  
Otto                 9013
Notebillig           9009
Saturn               9000
All-Out              7534
Nova                 6734
Name: customer_name, Length: 75, dtype: int64

In [46]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_fact_forecast_monthly['forecast_quantity'].min())
print(sc_fact_forecast_monthly['forecast_quantity'].max())

0
7554


In [47]:
# change datatypes

sc_fact_forecast_monthly['division'] = sc_fact_forecast_monthly['division'].astype('category')
sc_fact_forecast_monthly['category'] = sc_fact_forecast_monthly['category'].astype('category')
sc_fact_forecast_monthly['product'] = sc_fact_forecast_monthly['product'].astype('category')
sc_fact_forecast_monthly['market'] = sc_fact_forecast_monthly['market'].astype('category')
sc_fact_forecast_monthly['platform'] = sc_fact_forecast_monthly['platform'].astype('category')
sc_fact_forecast_monthly['channel'] = sc_fact_forecast_monthly['channel'].astype('category')
sc_fact_forecast_monthly['customer_name'] = sc_fact_forecast_monthly['customer_name'].astype('category')
sc_fact_forecast_monthly['forecast_quantity'] = sc_fact_forecast_monthly['forecast_quantity'].astype('int16')

In [48]:
# check memory usage by column

sc_fact_forecast_monthly.memory_usage(deep=True)

Index                      132
date                  15087528
division               1886232
category               1887478
product_code         128243988
product                1893489
market                 1888750
platform               1886187
channel                1886245
customer_code        122586165
customer_name          1893090
forecast_quantity      3771882
dtype: int64

### sc_fact_sales_monthly

In [49]:
sc_fact_sales_monthly.head(10)

Unnamed: 0,date,division,category,product_code,product,market,platform,channel,customer_code,customer_name,sold_quantity
0,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,Brick & Mortar,Direct,70008169,AltiQ Exclusive,81
1,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,Brick & Mortar,Retailer,90008165,Forward Stores,157
2,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,Brick & Mortar,Retailer,90008166,Sound,126
3,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,Brick & Mortar,Retailer,90008167,Electricalsocity,160
4,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,E-Commerce,Direct,70008170,Atliq e Store,120
5,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Brazil,E-Commerce,Retailer,90027207,Amazon,9
6,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Canada,Brick & Mortar,Direct,70023031,AltiQ Exclusive,9
7,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Canada,Brick & Mortar,Retailer,90023022,Nomad Stores,24
8,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Canada,Brick & Mortar,Retailer,90023025,Premium Stores,22
9,2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Canada,Brick & Mortar,Retailer,90023026,Relief,37


In [50]:
# check memory usage by column

sc_fact_sales_monthly.memory_usage(deep=True)

Index                  132
date              11405648
division          86867342
category          98199260
product_code      96948008
product          106581796
market            91345986
platform          99583322
channel           92237730
customer_code     92670890
customer_name     96099224
sold_quantity     11405648
dtype: int64

In [51]:
# check datatypes

sc_fact_sales_monthly.dtypes

date             datetime64[ns]
division                 object
category                 object
product_code             object
product                  object
market                   object
platform                 object
channel                  object
customer_code            object
customer_name            object
sold_quantity             int64
dtype: object

In [52]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_sales_monthly['division'].nunique())
sc_fact_sales_monthly['division'].value_counts()

3


P & A    802773
PC       508810
N & S    114123
Name: division, dtype: int64

In [53]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_sales_monthly['category'].nunique())
sc_fact_sales_monthly['category'].value_counts()

14


Personal Laptop                206710
Keyboard                       197198
Mouse                          187221
Graphic Card                   146147
Business Laptop                145901
Gaming Laptop                  129168
Processors                      83075
MotherBoard                     76631
Batteries                       62075
External Solid State Drives     61766
Internal HDD                    50426
Wi fi extender                  38576
Personal Desktop                27031
USB Flash Drives                13781
Name: category, dtype: int64

In [54]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_sales_monthly['product_code'].nunique())
sc_fact_sales_monthly['product_code'].value_counts()

389


A2118150103    6729
A2218150201    6728
A2219150203    6726
A2118150104    6725
A2118150105    6724
               ... 
A2821150804     627
A2821150805     627
A2821150806     627
A1521150603     627
A4821110808     627
Name: product_code, Length: 389, dtype: int64

In [55]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_sales_monthly['product'].nunique())
sc_fact_sales_monthly['product'].value_counts()

71


AQ Digit                 44641
AQ Mforce Gen Y          43745
AQ BZ Compact            41966
AQ Master wireless x1    40326
AQ Mforce Gen X          40060
                         ...  
AQ F16                    5016
AQ Marquee P4             4389
AQ Pen Drive 2 IN 1       4382
AQ Clx3                   3762
AQ Lumina                 3762
Name: product, Length: 71, dtype: int64

In [56]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_sales_monthly['market'].nunique())
sc_fact_sales_monthly['market'].value_counts()

27


India       147278
USA         121400
Canada       84873
Italy        80172
Portugal     77571
             ...  
China        23031
Brazil       14021
Mexico       12736
Chile         8755
Columbia      3782
Name: market, Length: 27, dtype: int64

In [57]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_sales_monthly['platform'].nunique())
sc_fact_sales_monthly['platform'].value_counts()

2


Brick & Mortar    1015255
E-Commerce         410451
Name: platform, dtype: int64

In [58]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_sales_monthly['channel'].nunique())
sc_fact_sales_monthly['channel'].value_counts()

3


Retailer       1106671
Direct          278053
Distributor      40982
Name: channel, dtype: int64

In [59]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_sales_monthly['customer_code'].nunique())
sc_fact_sales_monthly['customer_code'].value_counts()

209


80007196    8295
90002008    8289
80007195    8287
90002009    8265
70002018    8261
            ... 
90020097    4328
90020101    4322
90020099    4312
90024183    4287
90025209    3782
Name: customer_code, Length: 209, dtype: int64

In [60]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_fact_sales_monthly['customer_name'].nunique())
sc_fact_sales_monthly['customer_name'].value_counts()

75


Amazon             199382
Atliq e Store      165005
AltiQ Exclusive    113048
Expert              32732
Euronics            23340
                    ...  
Saturn               6991
Notebillig           6960
Billa                6911
All-Out              5192
Nova                 4355
Name: customer_name, Length: 75, dtype: int64

In [61]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_fact_sales_monthly['sold_quantity'].min())
print(sc_fact_sales_monthly['sold_quantity'].max())

0
5832


In [62]:
# change datatypes

sc_fact_sales_monthly['division'] = sc_fact_sales_monthly['division'].astype('category')
sc_fact_sales_monthly['category'] = sc_fact_sales_monthly['category'].astype('category')
sc_fact_sales_monthly['product'] = sc_fact_sales_monthly['product'].astype('category')
sc_fact_sales_monthly['market'] = sc_fact_sales_monthly['market'].astype('category')
sc_fact_sales_monthly['platform'] = sc_fact_sales_monthly['platform'].astype('category')
sc_fact_sales_monthly['channel'] = sc_fact_sales_monthly['channel'].astype('category')
sc_fact_sales_monthly['customer_name'] = sc_fact_sales_monthly['customer_name'].astype('category')
sc_fact_sales_monthly['sold_quantity'] = sc_fact_sales_monthly['sold_quantity'].astype('int16')

In [63]:
# check memory usage by column

sc_fact_sales_monthly.memory_usage(deep=True)

Index                 132
date             11405648
division          1425997
category          1427243
product_code     96948008
product           1433254
market            1428515
platform          1425952
channel           1426010
customer_code    92670890
customer_name     1432855
sold_quantity     2851412
dtype: int64

### sc_freight_cost

In [64]:
sc_freight_cost.head(10)

Unnamed: 0,market,fiscal_year,freight_pct,other_cost_pct
0,Australia,2018,0.0188,0.005
1,Austria,2018,0.0272,0.0053
2,Bangladesh,2018,0.0219,0.0058
3,Brazil,2018,0.0239,0.0033
4,Canada,2018,0.0264,0.0054
5,Chile,2018,0.0267,0.0022
6,China,2018,0.0204,0.0043
7,Columbia,2018,0.0216,0.0028
8,France,2018,0.019,0.0038
9,Germany,2018,0.0301,0.0061


In [65]:
# check memory usage by column

sc_freight_cost.memory_usage(deep=True)

Index               132
market             8685
fiscal_year        8235
freight_pct       15120
other_cost_pct    15120
dtype: int64

In [66]:
# check datatypes

sc_freight_cost.dtypes

market            object
fiscal_year       object
freight_pct       object
other_cost_pct    object
dtype: object

In [67]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_freight_cost['market'].nunique())
sc_freight_cost['market'].value_counts()

27


Australia         5
Mexico            5
United Kingdom    5
Sweden            5
Spain             5
                 ..
Chile             5
Canada            5
Brazil            5
Bangladesh        5
USA               5
Name: market, Length: 27, dtype: int64

In [68]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_freight_cost['fiscal_year'].nunique())
sc_freight_cost['fiscal_year'].value_counts()

5


2018    27
2019    27
2020    27
2021    27
2022    27
Name: fiscal_year, dtype: int64

In [69]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_freight_cost['freight_pct'].min())
print(sc_freight_cost['freight_pct'].max())

0.0187000000
0.0312000000


In [70]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_freight_cost['other_cost_pct'].min())
print(sc_freight_cost['other_cost_pct'].max())

0.0022000000
0.0061000000


In [71]:
# change datatypes

sc_freight_cost['market'] = sc_freight_cost['market'].astype('category')
sc_freight_cost['fiscal_year'] = sc_freight_cost['fiscal_year'].astype('int16')
sc_freight_cost['freight_pct'] = sc_freight_cost['freight_pct'].astype('float64')
sc_freight_cost['other_cost_pct'] = sc_freight_cost['other_cost_pct'].astype('float64')

In [72]:
# check memory usage by column

sc_freight_cost.memory_usage(deep=True)

Index              132
market            2944
fiscal_year        270
freight_pct       1080
other_cost_pct    1080
dtype: int64

### sc_gross_price

In [73]:
sc_gross_price.head(10)

Unnamed: 0,product_code,fiscal_year,gross_price
0,A0118150101,2018,15.3952
1,A0118150101,2019,14.4392
2,A0118150101,2020,16.2323
3,A0118150101,2021,19.0573
4,A0118150102,2018,19.5875
5,A0118150102,2019,18.5595
6,A0118150102,2020,19.8577
7,A0118150102,2021,21.4565
8,A0118150103,2018,19.363
9,A0118150103,2019,19.3442


In [74]:
# check memory usage by column

sc_gross_price.memory_usage(deep=True)

Index              132
product_code     81396
fiscal_year      73017
gross_price     134064
dtype: int64

In [75]:
# check datatypes

sc_gross_price.dtypes

product_code    object
fiscal_year     object
gross_price     object
dtype: object

In [76]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_gross_price['product_code'].nunique())
sc_gross_price['product_code'].value_counts()

397


A2918150105    5
A1219150301    5
A3718150102    5
A3718150103    5
A3718150105    5
              ..
A5820110105    1
A2821150802    1
A2821150803    1
A2821150804    1
A4821110808    1
Name: product_code, Length: 397, dtype: int64

In [77]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_gross_price['fiscal_year'].nunique())
sc_gross_price['fiscal_year'].value_counts()

5


2022    353
2021    338
2020    247
2019    172
2018     87
Name: fiscal_year, dtype: int64

In [78]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_gross_price['gross_price'].min())
print(sc_gross_price['gross_price'].max())

2.2831000000
890.1364000000


In [79]:
# change datatypes

sc_gross_price['fiscal_year'] = sc_gross_price['fiscal_year'].astype('int16')
sc_gross_price['gross_price'] = sc_gross_price['gross_price'].astype('float64')

In [80]:
# check memory usage by column

sc_gross_price.memory_usage(deep=True)

Index             132
product_code    81396
fiscal_year      2394
gross_price      9576
dtype: int64

### sc_manufacturing_cost

In [81]:
sc_manufacturing_cost.head(10)

Unnamed: 0,product_code,cost_year,manufacturing_cost
0,A0118150101,2018,4.619
1,A0118150101,2019,4.2033
2,A0118150101,2020,5.0207
3,A0118150101,2021,5.5172
4,A0118150102,2018,5.6036
5,A0118150102,2019,5.3235
6,A0118150102,2020,5.718
7,A0118150102,2021,6.2835
8,A0118150103,2018,5.9469
9,A0118150103,2019,5.5306


In [82]:
# check memory usage by column

sc_manufacturing_cost.memory_usage(deep=True)

Index                    132
product_code           81396
cost_year              73017
manufacturing_cost    134064
dtype: int64

In [83]:
# check datatypes

sc_manufacturing_cost.dtypes

product_code          object
cost_year             object
manufacturing_cost    object
dtype: object

In [84]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_manufacturing_cost['product_code'].nunique())
sc_manufacturing_cost['product_code'].value_counts()

397


A2918150105    5
A1219150301    5
A3718150102    5
A3718150103    5
A3718150105    5
              ..
A5820110105    1
A2821150802    1
A2821150803    1
A2821150804    1
A4821110808    1
Name: product_code, Length: 397, dtype: int64

In [85]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_manufacturing_cost['cost_year'].nunique())
sc_manufacturing_cost['cost_year'].value_counts()

5


2022    353
2021    338
2020    247
2019    172
2018     87
Name: cost_year, dtype: int64

In [86]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_manufacturing_cost['manufacturing_cost'].min())
print(sc_manufacturing_cost['manufacturing_cost'].max())

0.7064000000
263.4207000000


In [87]:
# change datatypes

sc_manufacturing_cost['cost_year'] = sc_manufacturing_cost['cost_year'].astype('int16')
sc_manufacturing_cost['manufacturing_cost'] = sc_manufacturing_cost['manufacturing_cost'].astype('float64')

In [88]:
# check memory usage by column

sc_manufacturing_cost.memory_usage(deep=True)

Index                   132
product_code          81396
cost_year              2394
manufacturing_cost     9576
dtype: int64

### sc_post_invoice_deductions

In [89]:
sc_post_invoice_deductions.head(10)

Unnamed: 0,customer_code,product_code,date,discounts_pct,other_deductions_pct
0,70002017,A0118150101,2017-09-01,0.2659568373,0.0718706275
1,70002017,A0118150101,2017-10-01,0.3089921425,0.0976271962
2,70002017,A0118150101,2017-11-01,0.3312678749,0.0752107404
3,70002017,A0118150101,2018-01-01,0.2957916324,0.0720356315
4,70002017,A0118150101,2018-02-01,0.3207865527,0.0793345762
5,70002017,A0118150101,2018-03-01,0.2634826863,0.1007454661
6,70002017,A0118150101,2018-05-01,0.223149173,0.081960474
7,70002017,A0118150101,2018-06-01,0.3019501683,0.0791333546
8,70002017,A0118150101,2018-07-01,0.312340645,0.092944873
9,70002017,A0118150102,2017-09-01,0.2302247453,0.086882152


In [90]:
# check memory usage by column

sc_post_invoice_deductions.memory_usage(deep=True)

Index                         132
customer_code           134099940
product_code            140289168
date                     82523040
discounts_pct           231064512
other_deductions_pct    231064512
dtype: int64

In [91]:
# check datatypes

sc_post_invoice_deductions.dtypes

customer_code           object
product_code            object
date                    object
discounts_pct           object
other_deductions_pct    object
dtype: object

In [92]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_post_invoice_deductions['customer_code'].nunique())
sc_post_invoice_deductions['customer_code'].value_counts()

209


70002017    10638
90011188    10638
90013123    10638
90013122    10638
90013121    10638
            ...  
90009129     8316
90004061     8316
70009133     8316
70004069     8316
90025209     8316
Name: customer_code, Length: 209, dtype: int64

In [93]:
# ensure datatype 'category' is apropriate by counting unique values

print(sc_post_invoice_deductions['product_code'].nunique())
sc_post_invoice_deductions['product_code'].value_counts()

389


A3019150205    8065
A3019150204    8065
A1219150301    8065
A0418150104    8065
A0418150105    8065
               ... 
A3621150803    1881
A3621150804    1881
A3621150805    1881
A3621150806    1881
A6621160503    1881
Name: product_code, Length: 389, dtype: int64

In [94]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_post_invoice_deductions['discounts_pct'].min())
print(sc_post_invoice_deductions['discounts_pct'].max())

0E-10
0.3597894498


In [95]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_post_invoice_deductions['other_deductions_pct'].min())
print(sc_post_invoice_deductions['other_deductions_pct'].max())

0E-10
0.2076698904


In [96]:
# change datatypes

sc_post_invoice_deductions['date'] = sc_post_invoice_deductions['date'].astype('datetime64')
sc_post_invoice_deductions['discounts_pct'] = sc_post_invoice_deductions['discounts_pct'].astype('float64')
sc_post_invoice_deductions['other_deductions_pct'] = sc_post_invoice_deductions['other_deductions_pct'].astype('float64')

In [97]:
# check memory usage by column

sc_post_invoice_deductions.memory_usage(deep=True)

Index                         132
customer_code           134099940
product_code            140289168
date                     16504608
discounts_pct            16504608
other_deductions_pct     16504608
dtype: int64

In [98]:
sc_post_invoice_deductions['date'].dt.year.value_counts()

2021    635151
2020    516648
2022    432630
2019    317775
2018    134280
2017     26592
Name: date, dtype: int64

In [99]:
# split dataframe into two to save on file size

print(sc_post_invoice_deductions[sc_post_invoice_deductions['date'].dt.year <= 2020]['date'].dt.year.value_counts().sum())
print(sc_post_invoice_deductions[sc_post_invoice_deductions['date'].dt.year > 2020]['date'].dt.year.value_counts().sum())

995295
1067781


In [100]:
sc_post_invoice_deductions_18_20 = sc_post_invoice_deductions[sc_post_invoice_deductions['date'].dt.year <= 2020].reset_index().drop('index', axis=1)
sc_post_invoice_deductions_21_22 = sc_post_invoice_deductions[sc_post_invoice_deductions['date'].dt.year > 2020].reset_index().drop('index', axis=1)

In [101]:
# check memory usage by column

sc_post_invoice_deductions_18_20.memory_usage(deep=True)

Index                        132
customer_code           64694175
product_code            67680060
date                     7962360
discounts_pct            7962360
other_deductions_pct     7962360
dtype: int64

In [102]:
# check memory usage by column

sc_post_invoice_deductions_21_22.memory_usage(deep=True)

Index                        132
customer_code           69405765
product_code            72609108
date                     8542248
discounts_pct            8542248
other_deductions_pct     8542248
dtype: int64

### sc_pre_invoice_deductions

In [103]:
sc_pre_invoice_deductions.head(10)

Unnamed: 0,customer_code,fiscal_year,pre_invoice_discount_pct
0,70002017,2018,0.0824421975353929
1,70002017,2019,0.0776586134860565
2,70002017,2020,0.0734578107313522
3,70002017,2021,0.0702694757190856
4,70002017,2022,0.105677829773947
5,70002018,2018,0.295567708460413
6,70002018,2019,0.257654803440745
7,70002018,2020,0.225480979087521
8,70002018,2021,0.206107123576155
9,70002018,2022,0.293092710386539


In [104]:
# check memory usage by column

sc_pre_invoice_deductions.memory_usage(deep=True)

Index                          132
customer_code                67925
fiscal_year                  63745
pre_invoice_discount_pct    117040
dtype: int64

In [105]:
# check datatypes

sc_pre_invoice_deductions.dtypes

customer_code               object
fiscal_year                 object
pre_invoice_discount_pct    object
dtype: object

In [106]:
# ensure datype 'int16' is apropriate by checking the range

print(sc_pre_invoice_deductions['pre_invoice_discount_pct'].min())
print(sc_pre_invoice_deductions['pre_invoice_discount_pct'].max())

0.05101586478809630000
0.30990802019830200000


In [107]:
# change datatypes

sc_pre_invoice_deductions['fiscal_year'] = sc_pre_invoice_deductions['fiscal_year'].astype('int16')
sc_pre_invoice_deductions['pre_invoice_discount_pct'] = sc_pre_invoice_deductions['pre_invoice_discount_pct'].astype('float64')

In [108]:
# check memory usage by column

sc_pre_invoice_deductions.memory_usage(deep=True)

Index                         132
customer_code               67925
fiscal_year                  2090
pre_invoice_discount_pct     8360
dtype: int64

## Save dataframes

In [109]:
# Save dtaframes into a new dictionary

sc_dataframes_cleaned = {
    'sc_dim_customer': sc_dim_customer,
    'sc_dim_market': sc_dim_market,
    'sc_dim_product': sc_dim_product,
    'sc_fact_forecast_monthly': sc_fact_forecast_monthly,
    'sc_fact_sales_monthly': sc_fact_sales_monthly,
    'sc_freight_cost': sc_freight_cost,
    'sc_gross_price': sc_gross_price,
    'sc_manufacturing_cost': sc_manufacturing_cost,
    'sc_post_invoice_deductions_18_20': sc_post_invoice_deductions_18_20,
    'sc_post_invoice_deductions_21_22': sc_post_invoice_deductions_21_22,
    'sc_pre_invoice_deductions': sc_pre_invoice_deductions,
}

%store sc_dataframes_cleaned

Stored 'sc_dataframes_cleaned' (dict)
