In [15]:
import pandas as pd
import sqlite3

## Connection to the database

**Note:** At the start I load every table in full (i.e. `SELECT *`) because, as we will see later, this database is clearly only a small portion of the full one. Further on, where appropriate, I will also use more precise SQL queries to comply with the rules of the project.

In [14]:
# Open the SQLite database file.
con = sqlite3.connect('atliq_db.sqlite3')

# List every table registered in SQLite's schema catalogue.
# Each fetched row is a 1-tuple holding the table name.
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master  WHERE type='table';")
table_names = cursor.fetchall()
print(table_names)

[('dim_customer',), ('dim_product',), ('fact_pre_discount',), ('fact_manufacturing_cost',), ('fact_gross_price',), ('fact_sales_monthly',)]


In [44]:
# Load every table into a dictionary of DataFrames to quickly look through the data.

# Target dtype for each known column. Some code/year/quantity columns are read
# as float64 rather than an integer type: a plain integer dtype cannot hold
# NaN, so a table with an incomplete row would fail to load otherwise.
data_types = {
    'customer_code': 'float64',
    'customer':'str',
    'platform':'category',
    'channel':'category',
    'market':'category',
    'sub_zone':'category',
    'region':'category',
    'product_code':'category',
    'division':'category',
    'segment':'category',
    'category':'category',
    'product':'str',
    'variant':'str',
    'fiscal_year':'float64',
    'pre_invoice_discount_pct':'float64',
    'cost_year':'int16',
    'manufacturing_cost':'float64',
    'gross_price':'float64',
    # 'date' is intentionally absent: it is parsed via `data_dates` below.
    'sold_quantity':'float64',
}

# Datetime columns and the format used to parse them (passed as `parse_dates`,
# not `dtype`).
data_dates = {
    'date' : '%Y-%m-%d'
}

data_dict = {}
for tbl in table_names:
    # Each entry of `table_names` is a 1-tuple: (name,).
    table_name = str(tbl[0])

    # Identifiers cannot be bound as SQL parameters, so quote the table name
    # (doubling any embedded double quote) instead of concatenating it raw.
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    query_all = "SELECT * FROM " + quoted_name

    # Column names come from the cursor metadata of the same query.
    col_names = [desc[0] for desc in cursor.execute(query_all).description]

    # Only request a dtype for columns that have a mapping and are not parsed
    # as dates; an unmapped column falls back to pandas' inference instead of
    # raising KeyError.
    tbl_types = {
        col: data_types[col]
        for col in col_names
        if col in data_types and col not in data_dates
    }

    data_dict[table_name] = pd.read_sql_query(
        query_all, con, dtype=tbl_types, parse_dates=data_dates
    )

    # Print a per-table summary: name, then column dtypes and non-null counts.
    print(table_name)
    data_dict[table_name].info()
    print()

dim_customer
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   customer_code  209 non-null    float64 
 1   customer       209 non-null    object  
 2   platform       209 non-null    category
 3   channel        209 non-null    category
 4   market         209 non-null    category
 5   sub_zone       209 non-null    category
 6   region         209 non-null    category
dtypes: category(5), float64(1), object(1)
memory usage: 6.5+ KB

dim_product
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   product_code  397 non-null    category
 1   division      397 non-null    category
 2   segment       397 non-null    category
 3   category      397 non-null    category
 4   product       397 non-null 

**Note:** Only one row contains empty cells (the last row of `fact_sales_monthly`, shown below), so it can safely be dropped.

In [56]:
# Show the last row of the sales table before cleaning.
display(data_dict['fact_sales_monthly'].tail(1))

# Drop rows with missing values by rebinding the dictionary entry to a new
# frame rather than mutating in place: `inplace=True` brings no benefit and
# silently changes an object that earlier cells have already displayed.
data_dict['fact_sales_monthly'] = data_dict['fact_sales_monthly'].dropna()

# Remaining missing values per column.
data_dict['fact_sales_monthly'].isna().sum()

Unnamed: 0,date,product_code,customer_code,sold_quantity,fiscal_year
67250,2019-06-01,A0,,,


date             0
product_code     0
customer_code    0
sold_quantity    0
fiscal_year      0
dtype: int64

In [59]:
# The code and year columns can now be converted to integer types.
# Mapping: table -> {column -> integer dtype}.
int_casts = {
    'dim_customer': {'customer_code': 'int32'},
    'fact_pre_discount': {'customer_code': 'int32', 'fiscal_year': 'int16'},
    'fact_gross_price': {'fiscal_year': 'int16'},
    'fact_sales_monthly': {'customer_code': 'int32', 'fiscal_year': 'int16'},
}

for cast_table, cast_columns in int_casts.items():
    for cast_column, int_dtype in cast_columns.items():
        data_dict[cast_table][cast_column] = data_dict[cast_table][cast_column].astype(int_dtype)


In [60]:
# Let's take a look at the data: a random sample of up to 10 rows per table.
# A fixed seed keeps the displayed rows identical on every Restart & Run All.
SAMPLE_SEED = 42
SAMPLE_ROWS = 10

for key, data in data_dict.items():
    print(key)
    # Cap the sample size at the table length so a table with fewer rows than
    # SAMPLE_ROWS does not raise ValueError.
    display(data.sample(n=min(SAMPLE_ROWS, len(data)), random_state=SAMPLE_SEED))

dim_customer


Unnamed: 0,customer_code,customer,platform,channel,market,sub_zone,region
160,90018114,Amazon,E-Commerce,Retailer,Spain,SE,EU
95,90011186,Elite,Brick & Mortar,Retailer,France,SE,EU
115,90013120,Coolblue,Brick & Mortar,Retailer,Italy,SE,EU
195,90023022,Nomad Stores,Brick & Mortar,Retailer,Canada,,
96,90011187,Chip 7,Brick & Mortar,Retailer,France,SE,EU
8,70006158,Atliq e Store,E-Commerce,Direct,Philiphines,ROA,APAC
183,90022072,Path,Brick & Mortar,Retailer,USA,,
56,90002010,Ebay,E-Commerce,Retailer,India,India,APAC
105,90012036,Billa,Brick & Mortar,Retailer,Germany,NE,EU
32,70020104,Atliq e Store,E-Commerce,Direct,Austria,NE,EU


dim_product


Unnamed: 0,product_code,division,segment,category,product,variant
315,A5621110403,PC,Notebook,Gaming Laptop,AQ Smash 1,Standard Black
253,A4821110801,PC,Notebook,Personal Laptop,AQ F16,Standard Grey
278,A5119110306,PC,Notebook,Business Laptop,AQ BZ Gen Y,Plus Red
23,A0519150206,P & A,Peripherals,Graphic Card,AQ Mforce Gen Y,Plus 3
280,A5119110308,PC,Notebook,Business Laptop,AQ BZ Gen Y,Premium Misty Green
310,A5520110306,PC,Notebook,Gaming Laptop,AQ Gamer 3,Plus Black
71,A1718150204,P & A,Peripherals,MotherBoard,AQ MB Crossx,Premium
307,A5519110303,PC,Notebook,Gaming Laptop,AQ Gamer 3,Standard Black
85,A2118150102,P & A,Accessories,Mouse,AQ Master wired x1 Ms,Standard 2
376,A6718160102,N & S,Storage,USB Flash Drives,AQ Pen Drive 2 IN 1,Plus


fact_pre_discount


Unnamed: 0,customer_code,fiscal_year,pre_invoice_discount_pct
608,90014136,2021,0.2034
228,90001020,2021,0.1965
792,90018112,2020,0.2826
28,70004070,2021,0.2612
333,90004062,2021,0.2863
102,70012043,2020,0.1973
307,90002015,2020,0.2227
229,90001020,2022,0.2729
745,90017058,2018,0.2919
664,90015149,2022,0.2674


fact_manufacturing_cost


Unnamed: 0,product_code,cost_year,manufacturing_cost
49,A0418150104,2020,5.649
98,A0619150302,2021,9.9617
1041,A5820110102,2022,219.1019
100,A0620150303,2020,10.3318
1120,A6319160202,2021,5.364
677,A4218110207,2022,76.7172
497,A3220150405,2020,5.649
840,A5018110207,2021,140.5364
369,A2520150506,2021,4.4444
189,A1320150403,2022,49.9034


fact_gross_price


Unnamed: 0,product_code,fiscal_year,gross_price
572,A3818150201,2019,13.8166
408,A2918150102,2020,7.7183
360,A2520150501,2022,12.4056
900,A5318110103,2020,514.3688
826,A5018110204,2019,364.5204
168,A1219150301,2018,123.3768
793,A4918110101,2021,419.1936
953,A5419110206,2019,514.3609
240,A1819150302,2021,31.9525
798,A4918110102,2021,451.4881


fact_sales_monthly


Unnamed: 0,date,product_code,customer_code,sold_quantity,fiscal_year
5615,2021-06-01,A0118150101,90012036,27.0,2021
52736,2019-09-01,A0418150101,70002018,40.0,2020
2972,2020-01-01,A0118150101,90002006,170.0,2020
26929,2019-10-01,A0219150201,90008166,13.0,2020
35811,2021-06-01,A0219150202,90020102,3.0,2021
7467,2019-01-01,A0118150102,90022075,28.0,2019
56250,2019-04-01,A0418150102,70007199,10.0,2019
27523,2020-02-01,A0219150201,90002004,163.0,2020
65310,2017-10-01,A0418150104,70016177,1.0,2018
39684,2021-03-01,A0220150203,90026205,10.0,2021


In [61]:
# Summary statistics for every table, transposed so each described column
# becomes a row.
for key in data_dict:
    data = data_dict[key]
    print(key)
    display(data.describe().transpose())

dim_customer


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_code,209.0,85946460.0,7919384.0,70002017.0,90002006.0,90012035.0,90018110.0,90027207.0


dim_product


Unnamed: 0,count,unique,top,freq
product_code,397,397,A0118150101,1
division,397,3,P & A,200
segment,397,6,Notebook,129
category,397,14,Personal Laptop,61
product,397,73,AQ Elite,8
variant,397,27,Plus 2,35


fact_pre_discount


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_code,1045.0,85946460.0,7904198.0,70002020.0,90002010.0,90012040.0,90018110.0,90027210.0
fiscal_year,1045.0,2020.0,1.414891,2018.0,2019.0,2020.0,2021.0,2022.0
pre_invoice_discount_pct,1045.0,0.2338066,0.05807724,0.051,0.2048,0.2439,0.2767,0.3099


fact_manufacturing_cost


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cost_year,1182.0,2020.57445,1.249199,2018.0,2020.0,2021.0,2022.0,2022.0
manufacturing_cost,1182.0,63.000676,74.015524,0.8654,5.41925,11.4176,122.56035,263.4207


fact_gross_price


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fiscal_year,1182.0,2020.57445,1.249199,2018.0,2020.0,2021.0,2022.0,2022.0
gross_price,1182.0,211.086558,248.388384,2.8445,18.0776,38.3837,414.7115,890.1364


fact_sales_monthly


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_code,67250.0,85794600.0,7982553.0,70002017.0,90002003.0,90011191.0,90018107.0,90027207.0
sold_quantity,67250.0,56.25182,136.97,0.0,7.0,20.0,52.0,4127.0
fiscal_year,67250.0,2020.056,1.127549,2018.0,2019.0,2020.0,2021.0,2022.0
