In [233]:
import pandas as pd
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict

## Connection to the database

**Note:** In the beginning I download the whole database (like SELECT *) because as we see later it is obviously a small portion of the full database. But further if appropriate I will also use more 'precise' SQL queries to align with the rules of the project

In [234]:
### connect to the database
con = sqlite3.connect('atliq_db.sqlite3')

### check all tables in the database
cursor = con.cursor()
table_names = cursor.execute("SELECT name FROM sqlite_master  WHERE type='table';").fetchall()
print(table_names)

[('dim_customer',), ('dim_product',), ('fact_pre_discount',), ('fact_manufacturing_cost',), ('fact_gross_price',), ('fact_sales_monthly',)]


In [235]:
# download all tables into the dictionary of DataFrames to quicly look through the data
data_types = {
    'customer_code': 'float64',
    'customer':'str',
    'platform':'category',
    'channel':'category',
    'market':'category',
    'sub_zone':'category',
    'region':'category',
    'product_code':'category',
    'division':'category',
    'segment':'category',
    'category':'category',
    'product':'str',
    'variant':'str',
    'fiscal_year':'float64',
    'pre_invoice_discount_pct':'float64',
    'cost_year':'int16',
    'manufacturing_cost':'float64',
    'gross_price':'float64',
    # 'date':,
    'sold_quantity':'float64',
}

data_dates = {
    'date' : '%Y-%m-%d'
}

data_dict = {}
for tbl in table_names:
    query_all = """Select * from """ + str(tbl[0])
    col_names = [desc[0] for desc in cursor.execute(query_all).description if desc[0] !='date']
    tbl_types = {}
    tbl_types = {col:data_types[col] for col in col_names}
    data_dict[tbl[0]] = pd.read_sql_query(query_all, con, dtype=tbl_types, parse_dates=data_dates)
    print(str(tbl[0]))
    data_dict[tbl[0]].info()
    print()

dim_customer
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   customer_code  209 non-null    float64 
 1   customer       209 non-null    object  
 2   platform       209 non-null    category
 3   channel        209 non-null    category
 4   market         209 non-null    category
 5   sub_zone       209 non-null    category
 6   region         209 non-null    category
dtypes: category(5), float64(1), object(1)
memory usage: 6.5+ KB

dim_product
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   product_code  397 non-null    category
 1   division      397 non-null    category
 2   segment       397 non-null    category
 3   category      397 non-null    category
 4   product       397 non-null 

**Note:** There is only one row with empty cells. It could be dropped.

In [236]:
# drop 1 NaN row
display(data_dict['fact_sales_monthly'].tail(1))
data_dict['fact_sales_monthly'].dropna(inplace=True)
data_dict['fact_sales_monthly'].isna().sum()

Unnamed: 0,date,product_code,customer_code,sold_quantity,fiscal_year
67250,2019-06-01,A0,,,


date             0
product_code     0
customer_code    0
sold_quantity    0
fiscal_year      0
dtype: int64

In [237]:
# now we can convert customer_code and fiscal_year to 'int'

data_dict['dim_customer']['customer_code'] = data_dict['dim_customer']['customer_code'].astype('int32')

data_dict['fact_pre_discount']['customer_code'] = data_dict['fact_pre_discount']['customer_code'].astype('int32')
data_dict['fact_pre_discount']['fiscal_year'] = data_dict['fact_pre_discount']['fiscal_year'].astype('int16')

data_dict['fact_gross_price']['fiscal_year'] = data_dict['fact_gross_price']['fiscal_year'].astype('int16')

data_dict['fact_sales_monthly']['customer_code'] = data_dict['fact_sales_monthly']['customer_code'].astype('int32')
data_dict['fact_sales_monthly']['fiscal_year'] = data_dict['fact_sales_monthly']['fiscal_year'].astype('int16')


In [238]:
# Let's take a look at data
for key, data in data_dict.items():
    print(key)
    display(data.sample(10))

dim_customer


Unnamed: 0,customer_code,customer,platform,channel,market,sub_zone,region
26,70015152,Atliq e Store,E-Commerce,Direct,Norway,NE,EU
68,90004064,All-Out,Brick & Mortar,Retailer,Japan,ROA,APAC
97,90011188,Elkjøp,Brick & Mortar,Retailer,France,SE,EU
106,90012037,Saturn,Brick & Mortar,Retailer,Germany,NE,EU
37,70023032,Atliq e Store,E-Commerce,Direct,Canada,,
135,90016172,Power,Brick & Mortar,Retailer,Poland,NE,EU
175,90021089,Atlas Stores,Brick & Mortar,Retailer,United Kingdom,NE,EU
133,90015150,Amazon,E-Commerce,Retailer,Norway,NE,EU
202,90023029,Staples,Brick & Mortar,Retailer,Canada,,
57,90002011,Atliq Exclusive,Brick & Mortar,Retailer,India,India,APAC


dim_product


Unnamed: 0,product_code,division,segment,category,product,variant
306,A5519110302,PC,Notebook,Gaming Laptop,AQ Gamer 3,Standard Cool Blue
243,A4620110607,PC,Notebook,Personal Laptop,AQ Gen Y,Premium Black
59,A1420150502,P & A,Peripherals,Processors,AQ Electron 4 3600 Desktop Processor,Plus
80,A2020150501,P & A,Peripherals,MotherBoard,AQ MB Lito 2,Standard
224,A4419110404,PC,Notebook,Personal Laptop,AQ Elite,Plus Grey
285,A5219110405,PC,Notebook,Business Laptop,AQ BZ Gen Z,Plus Blue
267,A5018110203,PC,Notebook,Business Laptop,AQ BZ Compact,Standard Red
41,A0821150504,P & A,Peripherals,Graphic Card,AQ Marquee P3,Premium
176,A3621150803,P & A,Accessories,Keyboard,AQ Lumina,Plus 1
325,A5721110505,PC,Notebook,Gaming Laptop,AQ Smash 2,Plus Cool Blue


fact_pre_discount


Unnamed: 0,customer_code,fiscal_year,pre_invoice_discount_pct
205,80006154,2018,0.2728
364,90004068,2022,0.2856
1024,90024183,2022,0.1809
269,90002007,2022,0.2599
747,90017058,2020,0.2742
640,90015145,2018,0.3052
734,90017055,2022,0.229
397,90007197,2020,0.2182
495,90011190,2018,0.2717
746,90017058,2019,0.2658


fact_manufacturing_cost


Unnamed: 0,product_code,cost_year,manufacturing_cost
522,A3420150602,2022,7.6559
455,A3019150206,2019,3.4446
491,A3220150403,2020,5.6897
4,A0118150102,2018,5.6036
1066,A6018110102,2020,215.3716
567,A3718150105,2019,4.3881
171,A1219150301,2021,41.6092
41,A0418150102,2020,5.2169
204,A1618150101,2020,5.5908
591,A3819150205,2022,7.3333


fact_gross_price


Unnamed: 0,product_code,fiscal_year,gross_price
31,A0320150301,2022,25.8169
1165,A7119160103,2021,28.7736
468,A3119150303,2020,14.8815
1011,A5621110405,2020,579.0268
590,A3819150205,2021,21.6066
554,A3718150102,2019,15.5216
1114,A6319160201,2019,14.557
598,A3920150303,2020,23.0298
608,A4020150401,2022,29.6179
933,A5419110202,2018,508.189


fact_sales_monthly


Unnamed: 0,date,product_code,customer_code,sold_quantity,fiscal_year
24608,2021-08-01,A0118150104,90017050,23.0,2021
41873,2020-02-01,A0320150301,70002017,36.0,2020
66131,2018-09-01,A0418150104,80006154,102.0,2019
44121,2021-04-01,A0320150301,90019202,8.0,2021
4388,2020-10-01,A0118150101,90017049,25.0,2021
22993,2020-10-01,A0118150104,90006153,76.0,2021
28711,2020-10-01,A0219150201,70027208,4.0,2021
32164,2019-06-01,A0219150202,90012037,14.0,2019
64228,2021-06-01,A0418150103,90017052,11.0,2021
6283,2017-12-01,A0118150102,90013119,1.0,2018


In [239]:
# describe()
for key, data in data_dict.items():
    print(key)
    display(data.describe().T)

dim_customer


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_code,209.0,85946460.0,7919384.0,70002017.0,90002006.0,90012035.0,90018110.0,90027207.0


dim_product


Unnamed: 0,count,unique,top,freq
product_code,397,397,A0118150101,1
division,397,3,P & A,200
segment,397,6,Notebook,129
category,397,14,Personal Laptop,61
product,397,73,AQ Elite,8
variant,397,27,Plus 2,35


fact_pre_discount


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_code,1045.0,85946460.0,7904198.0,70002020.0,90002010.0,90012040.0,90018110.0,90027210.0
fiscal_year,1045.0,2020.0,1.414891,2018.0,2019.0,2020.0,2021.0,2022.0
pre_invoice_discount_pct,1045.0,0.2338066,0.05807724,0.051,0.2048,0.2439,0.2767,0.3099


fact_manufacturing_cost


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cost_year,1182.0,2020.57445,1.249199,2018.0,2020.0,2021.0,2022.0,2022.0
manufacturing_cost,1182.0,63.000676,74.015524,0.8654,5.41925,11.4176,122.56035,263.4207


fact_gross_price


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fiscal_year,1182.0,2020.57445,1.249199,2018.0,2020.0,2021.0,2022.0,2022.0
gross_price,1182.0,211.086558,248.388384,2.8445,18.0776,38.3837,414.7115,890.1364


fact_sales_monthly


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_code,67250.0,85794600.0,7982553.0,70002017.0,90002003.0,90011191.0,90018107.0,90027207.0
sold_quantity,67250.0,56.25182,136.97,0.0,7.0,20.0,52.0,4127.0
fiscal_year,67250.0,2020.056,1.127549,2018.0,2019.0,2020.0,2021.0,2022.0


# Data cleaning and preprocessing

## Duplicates

In [240]:
# full duplicates
print('Number of full duplicates in the table:')
for key, data in data_dict.items():
    print(f'{(key + ":"):<25}{data.duplicated().sum():>5}')

Number of full duplicates in the table:
dim_customer:                0
dim_product:                 0
fact_pre_discount:           0
fact_manufacturing_cost:     0
fact_gross_price:            0
fact_sales_monthly:          0


In [241]:
# other duplicates ## dim_customer

print('Number of duplicates in dim_customer')
print(f'{"customer_code:":<40}{data_dict["dim_customer"]["customer_code"].duplicated().sum()}')
print(f'{"customer identification:":<40}{data_dict["dim_customer"].loc[:, ["customer", "platform","channel", "market"]].duplicated().sum()}')

# # the same product duplicated in various other analytics
display(data_dict['dim_customer'].groupby(['market'], as_index=False).filter(lambda x: x['sub_zone'].nunique()>1))
display(data_dict['dim_customer'].groupby(['market'], as_index=False).filter(lambda x: x['region'].nunique()>1))
display(data_dict['dim_customer'].groupby(['sub_zone'], as_index=False).filter(lambda x: x['region'].nunique()>1))
display(data_dict['dim_customer'].groupby(['customer'], as_index=False).filter(lambda x: x['platform'].nunique()>1))


Number of duplicates in dim_customer
customer_code:                          0
customer identification:                0


Unnamed: 0,customer_code,customer,platform,channel,market,sub_zone,region


Unnamed: 0,customer_code,customer,platform,channel,market,sub_zone,region


Unnamed: 0,customer_code,customer,platform,channel,market,sub_zone,region


Unnamed: 0,customer_code,customer,platform,channel,market,sub_zone,region


**Conclusion:** No problems with dim_customer

In [242]:
# other duplicates ## dim_product

print('Number of duplicates in dim_product')
print(f'{"product_code:":<20}{data_dict["dim_product"]["product_code"].duplicated().sum()}')
print(f'{"product + variant:":<20}{data_dict["dim_product"].loc[:, ["product_code","variant"]].duplicated().sum()}')

# the same product duplicated in various other analytics
display(data_dict['dim_product'].groupby(['product', 'variant'], as_index=False).filter(lambda x: x['product_code'].nunique()>1))
display(data_dict['dim_product'].groupby(['product'], as_index=False).filter(lambda x: x['division'].nunique()>1))
display(data_dict['dim_product'].groupby(['product'], as_index=False).filter(lambda x: x['segment'].nunique()>1))
display(data_dict['dim_product'].groupby(['product'], as_index=False).filter(lambda x: x['category'].nunique()>1))
display(data_dict['dim_product'].groupby(['category'], as_index=False).filter(lambda x: x['segment'].nunique()>1))
display(data_dict['dim_product'].groupby(['category'], as_index=False).filter(lambda x: x['division'].nunique()>1))
display(data_dict['dim_product'].groupby(['segment'], as_index=False).filter(lambda x: x['division'].nunique()>1))


Number of duplicates in dim_product
product_code:       0
product + variant:  0


Unnamed: 0,product_code,division,segment,category,product,variant


Unnamed: 0,product_code,division,segment,category,product,variant


Unnamed: 0,product_code,division,segment,category,product,variant


Unnamed: 0,product_code,division,segment,category,product,variant


Unnamed: 0,product_code,division,segment,category,product,variant
261,A4918110101,PC,Notebook,Business Laptop,AQ BZ 101,Standard Grey
262,A4918110102,PC,Notebook,Business Laptop,AQ BZ 101,Standard Blue
263,A4918110103,PC,Notebook,Business Laptop,AQ BZ 101,Premium Black
264,A4918110104,PC,Notebook,Business Laptop,AQ BZ 101,Premium Misty Green
265,A5018110201,PC,Notebook,Business Laptop,AQ BZ Compact,Standard Grey
266,A5018110202,PC,Notebook,Business Laptop,AQ BZ Compact,Standard Blue
267,A5018110203,PC,Notebook,Business Laptop,AQ BZ Compact,Standard Red
268,A5018110204,PC,Notebook,Business Laptop,AQ BZ Compact,Plus Grey
269,A5018110205,PC,Notebook,Business Laptop,AQ BZ Compact,Plus Blue
270,A5018110206,PC,Notebook,Business Laptop,AQ BZ Compact,Plus Red


Unnamed: 0,product_code,division,segment,category,product,variant


Unnamed: 0,product_code,division,segment,category,product,variant


**Conclusion:** we have category 'Business Laptop' that falls both into 'desktop' and 'notebook'. Let's correct 'desktop' segment which seems to be an error for Laptop category. (It is our assumption, in reality we would check it with the data provider)

In [243]:
# replace segment for 'Business Laptop'
data_dict['dim_product'].loc[data_dict['dim_product']['category'] == 'Business Laptop', 'segment'] = 'Notebook'
data_dict['dim_product'][data_dict['dim_product']['category'] == 'Business Laptop']

Unnamed: 0,product_code,division,segment,category,product,variant
261,A4918110101,PC,Notebook,Business Laptop,AQ BZ 101,Standard Grey
262,A4918110102,PC,Notebook,Business Laptop,AQ BZ 101,Standard Blue
263,A4918110103,PC,Notebook,Business Laptop,AQ BZ 101,Premium Black
264,A4918110104,PC,Notebook,Business Laptop,AQ BZ 101,Premium Misty Green
265,A5018110201,PC,Notebook,Business Laptop,AQ BZ Compact,Standard Grey
266,A5018110202,PC,Notebook,Business Laptop,AQ BZ Compact,Standard Blue
267,A5018110203,PC,Notebook,Business Laptop,AQ BZ Compact,Standard Red
268,A5018110204,PC,Notebook,Business Laptop,AQ BZ Compact,Plus Grey
269,A5018110205,PC,Notebook,Business Laptop,AQ BZ Compact,Plus Blue
270,A5018110206,PC,Notebook,Business Laptop,AQ BZ Compact,Plus Red


In [244]:
# other duplicates ## Other data where duplication might cause interference
print('Number of duplicates')
print(f'{"customer_code + fiscal_year:":<40}{data_dict["fact_pre_discount"].loc[:, ["customer_code","fiscal_year"]].duplicated().sum()}')
print(f'{"product_code + cost_year:":<40}{data_dict["fact_manufacturing_cost"].loc[:, ["product_code","cost_year"]].duplicated().sum()}')
print(f'{"product_code + fiscal_year:":<40}{data_dict["fact_gross_price"].loc[:, ["product_code","fiscal_year"]].duplicated().sum()}')


Number of duplicates
customer_code + fiscal_year:            0
product_code + cost_year:               0
product_code + fiscal_year:             0


## Logical checks

In [245]:
# check time range of 'fiscal_year'

data_dict['fact_sales_monthly'].groupby('fiscal_year').agg(start_date=('date', 'min'), end_date = ('date', 'max'))

Unnamed: 0_level_0,start_date,end_date
fiscal_year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,2017-09-01,2018-08-01
2019,2018-09-01,2019-08-01
2020,2019-09-01,2020-08-01
2021,2020-09-01,2021-08-01
2022,2021-09-01,2021-12-01


**Conlusion:** as we can see 'fiscal year' lasts from September to August. And transactions are dated on monthly basis. 
Important: Fiscal year of 2022 is not full

As no other data are provided, let's assume that 'cost year' corresponds to calendar year (otherwise why differentiate between them?) of the transaction date.

In [246]:
# create cost_year column is sales table

data_dict['fact_sales_monthly']['cost_year'] = data_dict['fact_sales_monthly']['date'].dt.year

In [247]:
# do all customer codes exist in dim_customer and in 
set(data_dict['fact_sales_monthly']['customer_code']) - set(data_dict['dim_customer']['customer_code'])

set()

In [248]:
# do all customer codes exist in fact_pre_discount
set(data_dict['fact_sales_monthly']['customer_code']) - set(data_dict['fact_pre_discount']['customer_code'])

set()

In [249]:
# do all pairs customer_code + fiscal_year exist in fact_pre_discount
set(
    data_dict['fact_sales_monthly'].groupby(['customer_code', 'fiscal_year']).groups.keys()
) - set(
    data_dict['fact_pre_discount'].groupby(['customer_code', 'fiscal_year']).groups.keys()
)

set()

In [250]:
# do all product_codes exist in fact_manufacturing_cost
set(data_dict['fact_sales_monthly']['product_code']) - set(data_dict['fact_manufacturing_cost']['product_code'])

set()

In [251]:
# do all pairs product_code + cost_year exist in fact_manufacturing_cost
set(
    data_dict['fact_sales_monthly'].groupby(['product_code', 'cost_year']).groups.keys()
) - set(
    data_dict['fact_manufacturing_cost'].groupby(['product_code', 'cost_year']).groups.keys()
)

{('A0118150101', 2017),
 ('A0118150102', 2017),
 ('A0118150103', 2017),
 ('A0118150104', 2017),
 ('A0219150201', 2018),
 ('A0219150202', 2018),
 ('A0220150203', 2019),
 ('A0320150301', 2019),
 ('A0321150302', 2020),
 ('A0321150303', 2020),
 ('A0418150101', 2017),
 ('A0418150102', 2017),
 ('A0418150103', 2017),
 ('A0418150104', 2017)}

**Conclusion:** There several cases when manufacturing cost are not determined for the specific product in the specific year. Let's restore this as the average costs for that product.

In [252]:
# add missing product costs
missing_costs = set(
    data_dict['fact_sales_monthly'].groupby(['product_code', 'cost_year']).groups.keys()
) - set(
    data_dict['fact_manufacturing_cost'].groupby(['product_code', 'cost_year']).groups.keys()
)

missing_costs_dict = defaultdict(list)
for product, year in missing_costs:
    temp_avg_cost = \
        data_dict['fact_manufacturing_cost'][data_dict['fact_manufacturing_cost']['product_code'] == product]['manufacturing_cost'].mean()
    missing_costs_dict['product_code'].append(product)
    missing_costs_dict['cost_year'].append(year)
    missing_costs_dict['manufacturing_cost'].append(temp_avg_cost)



In [253]:
# add missing product costs
data_dict['fact_manufacturing_cost'] = pd.concat([data_dict['fact_manufacturing_cost'],pd.DataFrame.from_dict(missing_costs_dict)], ignore_index=True)

data_dict['fact_manufacturing_cost'].tail(len(missing_costs))

Unnamed: 0,product_code,cost_year,manufacturing_cost
1182,A0219150201,2018,6.700975
1183,A0118150103,2017,6.3154
1184,A0321150303,2020,8.8084
1185,A0320150301,2019,7.307267
1186,A0118150104,2017,6.39686
1187,A0418150103,2017,5.43664
1188,A0418150102,2017,4.9609
1189,A0220150203,2019,7.282767
1190,A0418150101,2017,4.7456
1191,A0321150302,2020,8.12875


In [254]:
# double check
set(
    data_dict['fact_sales_monthly'].groupby(['product_code', 'cost_year']).groups.keys()
) - set(
    data_dict['fact_manufacturing_cost'].groupby(['product_code', 'cost_year']).groups.keys()
)

set()

In [255]:
# do all pairs product_code + fiscal year exist in fact_gross_price
set(
    data_dict['fact_sales_monthly'].groupby(['product_code', 'fiscal_year']).groups.keys()
) - set(
    data_dict['fact_gross_price'].groupby(['product_code', 'fiscal_year']).groups.keys()
)

set()

## Merge all data into single DataFrame
**Note:** we can do so as it is a small data set. In reality, use specific queries.

In [257]:
data = data_dict['fact_sales_monthly'].merge(data_dict['dim_customer'], on='customer_code', how='left')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67250 entries, 0 to 67249
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           67250 non-null  datetime64[ns]
 1   product_code   67250 non-null  category      
 2   customer_code  67250 non-null  int32         
 3   sold_quantity  67250 non-null  float64       
 4   fiscal_year    67250 non-null  int16         
 5   cost_year      67250 non-null  int64         
 6   customer       67250 non-null  object        
 7   platform       67250 non-null  category      
 8   channel        67250 non-null  category      
 9   market         67250 non-null  category      
 10  sub_zone       67250 non-null  category      
 11  region         67250 non-null  category      
dtypes: category(6), datetime64[ns](1), float64(1), int16(1), int32(1), int64(1), object(1)
memory usage: 3.3+ MB


In [258]:
data = data.merge(data_dict['dim_product'], on='product_code', how='left')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67250 entries, 0 to 67249
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           67250 non-null  datetime64[ns]
 1   product_code   67250 non-null  object        
 2   customer_code  67250 non-null  int32         
 3   sold_quantity  67250 non-null  float64       
 4   fiscal_year    67250 non-null  int16         
 5   cost_year      67250 non-null  int64         
 6   customer       67250 non-null  object        
 7   platform       67250 non-null  category      
 8   channel        67250 non-null  category      
 9   market         67250 non-null  category      
 10  sub_zone       67250 non-null  category      
 11  region         67250 non-null  category      
 12  division       67250 non-null  category      
 13  segment        67250 non-null  category      
 14  category       67250 non-null  category      
 15  product        6725

In [259]:
data = data.merge(data_dict['fact_pre_discount'], on=['customer_code', 'fiscal_year'] , how='left')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67250 entries, 0 to 67249
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      67250 non-null  datetime64[ns]
 1   product_code              67250 non-null  object        
 2   customer_code             67250 non-null  int32         
 3   sold_quantity             67250 non-null  float64       
 4   fiscal_year               67250 non-null  int16         
 5   cost_year                 67250 non-null  int64         
 6   customer                  67250 non-null  object        
 7   platform                  67250 non-null  category      
 8   channel                   67250 non-null  category      
 9   market                    67250 non-null  category      
 10  sub_zone                  67250 non-null  category      
 11  region                    67250 non-null  category      
 12  division          

In [260]:
data = data.merge(data_dict['fact_manufacturing_cost'], on=['product_code', 'cost_year'] , how='left')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67250 entries, 0 to 67249
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      67250 non-null  datetime64[ns]
 1   product_code              67250 non-null  object        
 2   customer_code             67250 non-null  int32         
 3   sold_quantity             67250 non-null  float64       
 4   fiscal_year               67250 non-null  int16         
 5   cost_year                 67250 non-null  int64         
 6   customer                  67250 non-null  object        
 7   platform                  67250 non-null  category      
 8   channel                   67250 non-null  category      
 9   market                    67250 non-null  category      
 10  sub_zone                  67250 non-null  category      
 11  region                    67250 non-null  category      
 12  division          

In [261]:
data = data.merge(data_dict['fact_gross_price'], on=['product_code', 'fiscal_year'] , how='left')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67250 entries, 0 to 67249
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      67250 non-null  datetime64[ns]
 1   product_code              67250 non-null  object        
 2   customer_code             67250 non-null  int32         
 3   sold_quantity             67250 non-null  float64       
 4   fiscal_year               67250 non-null  int16         
 5   cost_year                 67250 non-null  int64         
 6   customer                  67250 non-null  object        
 7   platform                  67250 non-null  category      
 8   channel                   67250 non-null  category      
 9   market                    67250 non-null  category      
 10  sub_zone                  67250 non-null  category      
 11  region                    67250 non-null  category      
 12  division          

In [262]:
# adjust datatypes

data['product_code'] = data['product_code'].astype('category')
data['customer'] = data['customer'].astype('category')
data['product'] = data['product'].astype('category')
data['variant'] = data['variant'].astype('category')
data['fiscal_year'] = data['fiscal_year'].astype('category')
data['cost_year'] = data['cost_year'].astype('category')
data['customer_code'] = data['customer_code'].astype('category')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67250 entries, 0 to 67249
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      67250 non-null  datetime64[ns]
 1   product_code              67250 non-null  category      
 2   customer_code             67250 non-null  category      
 3   sold_quantity             67250 non-null  float64       
 4   fiscal_year               67250 non-null  category      
 5   cost_year                 67250 non-null  category      
 6   customer                  67250 non-null  category      
 7   platform                  67250 non-null  category      
 8   channel                   67250 non-null  category      
 9   market                    67250 non-null  category      
 10  sub_zone                  67250 non-null  category      
 11  region                    67250 non-null  category      
 12  division          

## Add revenue and margin

In [None]:
# calculate new columns
data['gross_revenue'] = data['gross_price'] * data['sold_quantity']
data['discount'] = data['gross_revenue'] * data['pre_invoice_discount_pct']
data['net_revenue'] = data['gross_revenue'] - data['discount']
data['costs'] = data['manufacturing_cost'] * data['sold_quantity']
data['margin'] = data['net_revenue'] - data['costs']


In [264]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67250 entries, 0 to 67249
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      67250 non-null  datetime64[ns]
 1   product_code              67250 non-null  category      
 2   customer_code             67250 non-null  category      
 3   sold_quantity             67250 non-null  float64       
 4   fiscal_year               67250 non-null  category      
 5   cost_year                 67250 non-null  category      
 6   customer                  67250 non-null  category      
 7   platform                  67250 non-null  category      
 8   channel                   67250 non-null  category      
 9   market                    67250 non-null  category      
 10  sub_zone                  67250 non-null  category      
 11  region                    67250 non-null  category      
 12  division          

In [275]:
data.pivot_table(
    values='net_revenue',
    index=['platform', 'region'],
    columns=['category','fiscal_year'],
    aggfunc='sum',
    observed=True,
    fill_value=0
).T.style.format('{:,.0f}')


Unnamed: 0_level_0,platform,Brick & Mortar,Brick & Mortar,Brick & Mortar,Brick & Mortar,E-Commerce,E-Commerce,E-Commerce,E-Commerce
Unnamed: 0_level_1,region,APAC,EU,NA,LATAM,APAC,EU,NA,LATAM
category,fiscal_year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Graphic Card,2018,637060,56848,158483,0,244222,13342,84675,8595
Graphic Card,2019,686039,201058,214864,0,280751,49060,112775,11511
Graphic Card,2020,648460,275690,222537,504,392667,103207,172192,15403
Graphic Card,2021,460133,219237,154953,766,181129,55222,88422,3481
Graphic Card,2022,1464333,642193,465877,3086,520705,171041,240161,9994
Internal HDD,2018,544728,49380,145125,0,213357,11592,68078,7699
Internal HDD,2019,1584675,462521,500115,0,608946,112236,281861,28811
Internal HDD,2020,2910430,1238728,990912,1935,1765943,480614,823395,70075
Internal HDD,2021,6898928,3291283,2502788,12402,2717845,879077,1333379,54318
Internal HDD,2022,10556177,4898869,3784610,19264,3988851,1270269,1860086,72399


In [279]:
data.pivot_table(
    values='margin',
    index=['platform', 'region'],
    columns=['category','fiscal_year'],
    aggfunc='sum',
    observed=True,
    fill_value=0
).T.style.format('{:,.0f}').background_gradient(cmap='viridis')

Unnamed: 0_level_0,platform,Brick & Mortar,Brick & Mortar,Brick & Mortar,Brick & Mortar,E-Commerce,E-Commerce,E-Commerce,E-Commerce
Unnamed: 0_level_1,region,APAC,EU,NA,LATAM,APAC,EU,NA,LATAM
category,fiscal_year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Graphic Card,2018,388092,34931,97203,0,149158,8128,50301,5258
Graphic Card,2019,412349,119622,127312,0,169313,29515,66257,6852
Graphic Card,2020,411328,173764,140977,305,246879,64388,105753,9453
Graphic Card,2021,288373,137457,97840,479,113320,34389,55061,2137
Graphic Card,2022,944037,414125,304441,2017,329557,109185,154363,6322
Internal HDD,2018,325848,29739,87685,0,128233,6940,39680,4640
Internal HDD,2019,958939,277081,299302,0,370638,68025,166637,17340
Internal HDD,2020,1839148,776961,623905,1160,1108048,299842,506490,43008
Internal HDD,2021,4252878,2027429,1551859,7673,1663586,538071,819888,32604
Internal HDD,2022,6707208,3112695,2430382,12399,2486426,798448,1181321,44983
