# Project setup

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the result of every expression statement in a cell,
# not only the last one (see the IPython docs for `ast_node_interactivity`).
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from importlib import reload
import src.modules.data_cleaning as dc

# Re-execute the data_cleaning module so that edits made to data_cleaning.py since it was
# first imported by this kernel are picked up without restarting the kernel.
reload(dc)

<module 'src.modules.data_cleaning' from '/Users/ds/data_science/demand_prediction_data/src/modules/data_cleaning.py'>

In [3]:
import os

# Project folder layout, relative to the notebook's working directory.
# The values stay plain strings with a trailing slash because later cells build
# file paths by string concatenation (e.g. DATA_ORIGINAL_PATH + 'items.csv').
# No `global` statement is needed: names assigned at the top level of a cell
# are already module-level globals.
DATA_FOLDER = "data/"
SRC_FOLDER = "src/"
MODULES_FOLDER = "src/modules/"
TESTS_FOLDER = "src/tests/"
OUTPUT_FOLDER = "output/"
FIGURES_FOLDER = "output/figures/"

folders = [
    DATA_FOLDER,
    SRC_FOLDER,
    MODULES_FOLDER,
    TESTS_FOLDER,
    OUTPUT_FOLDER,
    FIGURES_FOLDER,
]

# Create every folder that is missing and report what was done for each one.
for folder in folders:
    if os.path.isdir(folder):
        print(f"Folder already exists: {folder}")
    else:
        # exist_ok=True tolerates the folder appearing between the check and the
        # call. If the path exists but is a regular file, makedirs raises instead
        # of the cell wrongly reporting that the folder already exists.
        os.makedirs(folder, exist_ok=True)
        print(f"Created folder: {folder}")


Folder already exists: data/
Folder already exists: src/
Folder already exists: src/modules/
Folder already exists: src/tests/
Folder already exists: output/
Folder already exists: output/figures/


In [4]:
# Path aliases used by the rest of the notebook, derived from the folder constants.
# No `global` statement is needed: top-level assignments in a cell are already globals.
DATA_PATH = DATA_FOLDER
DATA_ORIGINAL_PATH = DATA_FOLDER        # raw input CSVs are read from here
DATA_GENERATED_PATH = OUTPUT_FOLDER     # generated data files are written here
IMAGE_GENERATED_PATH = FIGURES_FOLDER

# ETL

## Extract

In [5]:
import pandas as pd

# Load the two raw input tables from DATA_ORIGINAL_PATH; both files use ';' as the field separator.

items = pd.read_csv(DATA_ORIGINAL_PATH + 'items.csv', sep=';')
orders = pd.read_csv(DATA_ORIGINAL_PATH + 'orders.csv', sep=';')

In [6]:

# Report how many rows of the raw orders table have unit_cogs == 0 and how many of them
# could be filled from other rows that share the same item_code.
dc.evaluate_zero_value_fixability(orders, 'unit_cogs', 'item_code')

The number of 0 values in unit_cogs is:
795159
The percentage of 0 values in unit_cogs is:
37.357295477993205 %
By using the unit_cogs values for other rows with the same item_code, we can fix:
17234
The percentage of fixable rows is:
2.1673652690845477 %
The percentage of unfixable rows as a percentage of all rows is:
36.547626430333885 %


In [7]:
# Same report for unit_rrp_vat_excl on the raw orders table: count of zero values and how many
# could be filled from other rows that share the same item_code.
dc.evaluate_zero_value_fixability(orders, 'unit_rrp_vat_excl', 'item_code')

The number of 0 values in unit_rrp_vat_excl is:
7092
The percentage of 0 values in unit_rrp_vat_excl is:
0.33318863212254124 %
By using the unit_rrp_vat_excl values for other rows with the same item_code, we can fix:
6530
The percentage of fixable rows is:
92.07557811618724 %
The percentage of unfixable rows as a percentage of all rows is:
0.026403272878295005 %


## Transform

### Merge

In [8]:
import pandas as pd

# Left-join the item attributes onto every order line, keyed on item_code.
# how='left' keeps all rows of orders, including those without a matching item.
df = pd.merge(orders, items, on='item_code', how='left')

# A left merge repeats an order row once per matching row in items, so duplicate
# item_code values in items would silently inflate the data. Fail fast instead.
assert len(df) == len(orders), (
    "merge changed the number of order rows: items contains duplicate item_code values"
)


In [9]:
# Show the merged frame's columns and sample rows as a text table.
# NOTE(review): presumably also saves a snapshot under data_generated_path, tagged with
# name_extension — confirm in src/modules/data_cleaning.py.
dc.print_df_with_dtypes(
    df,
    max_rows=200,
    print_on=True,
    data_generated_path=DATA_GENERATED_PATH,
    name_extension='original',
)

+-----+------------+---------------------+------------+-------------+------------+-----------------+-----------------------+-------------+---------------------+--------------+-----------+----------+-----------------+---------------------+-----------------------------------------+----------------+------------+-------------------+-------------+------------+---------------+----------------+---------------+-------------------------+---------------+---------------+---------------+---------------+----------------+
|     |   order_id | date                | payment    | item_code   |   quantity |   gift_quantity |   unit_price_vat_excl |   unit_cogs |   unit_rrp_vat_excl | department   | channel   | owner    | site            | CreatedAt           | item_name                               | style          |   brand_id | name              |   group0_id | group0     |     group1_id | group1         |     group2_id | group2                  | category      | gender        | age           | color

1. **unit_price_vat_excl**: This likely stands for the "unit price excluding Value Added Tax (VAT)". It represents the selling price of a single item before VAT is added. In many countries, VAT is applied to products and services, and this column represents the base price of a product without that tax.

2. **unit_cogs**: This likely stands for "unit Cost of Goods Sold (COGS)". COGS represents the total cost of producing a single item, including raw materials, labor, and other expenses directly related to the production of that item. This column contains the cost information for a single unit of a product.

3. **unit_rrp_vat_excl**: This likely stands for "unit Recommended Retail Price (RRP) excluding VAT". The RRP is the price that a manufacturer suggests retailers sell a product for, which is usually higher than the wholesale price. This column represents the suggested selling price for a single item, without including the VAT imposed by the government.

In summary, these columns provide information about the costs and recommended pricing for individual products without considering the Value Added Tax.


In [10]:
# Run the project's cleaning routine on the merged frame and keep the result under the same name.
# According to the log it prints, it drops duplicate rows, parses date/CreatedAt as datetimes,
# sets and sorts by the date index, fills missing values and renames `name` to `brand_name`.
# NOTE(review): df is overwritten here, so re-running this cell feeds already-cleaned data
# back into data_clean — re-run the merge cell first.
df = dc.data_clean(df)

The number of rows in the merged dataframe is:  2128524
The number of rows in the merged dataframe after dropping duplicates is:  2127925
The number of rows dropped is:  599
The percentage of rows dropped is:  0.02814156664430375 %
The data types of the date and CreatedAt columns have been converted to datetime.
The date column has been set as the index.
The DataFrame has been sorted by the index.
The NaN values in the gift_quantity column have been replaced with 0.
The datatype of the gift_quantity column has been changed to int64.
The "- žádný výrobce -" values in the name column have been replaced with "no_manufacturer".
The missing values in the payment, group0_id, group1_id, and group2_id columns have been filled with "unspecified" and converted to string.
The name column has been renamed to brand_name.
The order_id, brand_id, group0_id, group1_id, and group2_id columns have been converted to object.


In [11]:
# Show the cleaned frame (date is now the index) before the zero-value fix is applied.
# NOTE(review): presumably also saves a snapshot under data_generated_path, tagged with
# name_extension — confirm in src/modules/data_cleaning.py.
dc.print_df_with_dtypes(
    df,
    max_rows=200,
    print_on=True,
    data_generated_path=DATA_GENERATED_PATH,
    name_extension='before_fixing_zero_values',
)

+---------------------+------------+---------------+-------------+------------+-----------------+-----------------------+-------------+---------------------+--------------+-----------+----------+-----------------+---------------------+----------------------------------------+------------------+------------+------------------+-------------+-----------+-------------+-----------------+-------------+-------------------------+-------------+-------------+-------------+-------------+-------------+
| date                |   order_id | payment       | item_code   |   quantity |   gift_quantity |   unit_price_vat_excl |   unit_cogs |   unit_rrp_vat_excl | department   | channel   | owner    | site            | CreatedAt           | item_name                              | style            |   brand_id | brand_name       |   group0_id | group0    |   group1_id | group1          |   group2_id | group2                  | category    | gender      | age         | color       | size        |
|       

In [12]:
# Re-evaluate the zero unit_rrp_vat_excl values on the cleaned, merged frame before fixing them.
dc.evaluate_zero_value_fixability(df, 'unit_rrp_vat_excl', 'item_code')

The number of 0 values in unit_rrp_vat_excl is:
7190
The percentage of 0 values in unit_rrp_vat_excl is:
0.33788784849090076 %
By using the unit_rrp_vat_excl values for other rows with the same item_code, we can fix:
6541
The percentage of fixable rows is:
90.97357440890124 %
The percentage of unfixable rows as a percentage of all rows is:
0.03049919522539563 %


In [13]:

# Fill zero unit_rrp_vat_excl values and keep the result as df.
# NOTE(review): judging by the helper's name and the before/after reports, values are taken
# from other rows with the same item_code — confirm in src/modules/data_cleaning.py.
df = dc.fix_zero_value_by_cross_reference(df, 'unit_rrp_vat_excl', 'item_code')

In [14]:
# Run the report again after the fix to check that no fixable zero values remain.
dc.evaluate_zero_value_fixability(df, 'unit_rrp_vat_excl', 'item_code')

The number of 0 values in unit_rrp_vat_excl is:
649
The percentage of 0 values in unit_rrp_vat_excl is:
0.03049919522539563 %
By using the unit_rrp_vat_excl values for other rows with the same item_code, we can fix:
0
The percentage of fixable rows is:
0.0 %
The percentage of unfixable rows as a percentage of all rows is:
0.03049919522539563 %


In [15]:
# Show the final cleaned frame after the zero-value fix.
# NOTE(review): presumably also saves a snapshot under data_generated_path, tagged with
# name_extension — confirm in src/modules/data_cleaning.py.
dc.print_df_with_dtypes(
    df,
    max_rows=200,
    print_on=True,
    data_generated_path=DATA_GENERATED_PATH,
    name_extension='cleaned',
)

+---------------------+------------+---------------+-------------+------------+-----------------+-----------------------+-------------+---------------------+--------------+-----------+----------+-----------------+---------------------+----------------------------------------+------------------+------------+------------------+-------------+-----------+-------------+-----------------+-------------+-------------------------+-------------+-------------+-------------+-------------+-------------+
| date                |   order_id | payment       | item_code   |   quantity |   gift_quantity |   unit_price_vat_excl |   unit_cogs |   unit_rrp_vat_excl | department   | channel   | owner    | site            | CreatedAt           | item_name                              | style            |   brand_id | brand_name       |   group0_id | group0    |   group1_id | group1          |   group2_id | group2                  | category    | gender      | age         | color       | size        |
|       

## Load to Database

In [16]:

# Persist the cleaned frame as a Parquet file in the generated-data folder.
parquet_file = DATA_GENERATED_PATH + 'ETL_finished.parquet'
df.to_parquet(parquet_file)

# Reached only if the write above did not raise.
# NOTE(review): the message mentions a database, but this cell only writes a Parquet file.
print('The load to database is successful')

The load to database is successful


In [17]:
# Also export the same frame as CSV into the generated-data folder.
csv_file = DATA_GENERATED_PATH + 'ETL_finished.csv'
df.to_csv(csv_file)
