### import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FixedLocator, MaxNLocator

### settings

In [2]:
%matplotlib inline

pd.set_option('display.max_rows', None)

### import data

In [3]:
%store -r sc_dataframes_cleaned

In [4]:
sc_dataframes_cleaned.keys()

dict_keys(['sc_dim_customer', 'sc_dim_market', 'sc_dim_product', 'sc_fact_forecast_monthly', 'sc_fact_sales_monthly', 'sc_freight_cost', 'sc_gross_price', 'sc_manufacturing_cost', 'sc_post_invoice_deductions_18_20', 'sc_post_invoice_deductions_21_22', 'sc_pre_invoice_deductions'])

In [5]:
# Create copies of each dataframe

sc_dim_customer = sc_dataframes_cleaned['sc_dim_customer'].copy()
sc_dim_market = sc_dataframes_cleaned['sc_dim_market'].copy()
sc_dim_product = sc_dataframes_cleaned['sc_dim_product'].copy()
sc_fact_forecast_monthly = sc_dataframes_cleaned['sc_fact_forecast_monthly'].copy()
sc_fact_sales_monthly = sc_dataframes_cleaned['sc_fact_sales_monthly'].copy()
sc_freight_cost = sc_dataframes_cleaned['sc_freight_cost'].copy()
sc_gross_price = sc_dataframes_cleaned['sc_gross_price'].copy()
sc_manufacturing_cost = sc_dataframes_cleaned['sc_manufacturing_cost'].copy()
sc_post_invoice_deductions_18_20 = sc_dataframes_cleaned['sc_post_invoice_deductions_18_20'].copy()
sc_post_invoice_deductions_21_22 = sc_dataframes_cleaned['sc_post_invoice_deductions_21_22'].copy()
sc_pre_invoice_deductions = sc_dataframes_cleaned['sc_pre_invoice_deductions'].copy()

### Analysis

In [6]:
# Get all column names to input into large language model

# Initialize an empty set to store unique column names
all_column_names = set()

# Iterate over each key-value pair in the dictionary
for df_name, df in sc_dataframes_cleaned.items():
    # Get the column names of the current DataFrame and add them to the set
    all_column_names.update(df.columns)
    
# List all column names
list(all_column_names)

['freight_pct',
 'customer_code',
 'forecast_quantity',
 'region',
 'fiscal_year',
 'date',
 'category',
 'other_cost_pct',
 'cost_year',
 'market',
 'sold_quantity',
 'product',
 'segment',
 'channel',
 'platform',
 'customer',
 'variant',
 'gross_price',
 'sub_zone',
 'other_deductions_pct',
 'division',
 'pre_invoice_discount_pct',
 'discounts_pct',
 'manufacturing_cost',
 'product_code']

In [7]:
sc_dim_customer.head(5)

Unnamed: 0,customer,market,platform,channel,customer_code
0,Brick & Mortar,Brick & Mortar,Brick & Mortar,Retailer,90002012
1,Brick & Mortar,Brick & Mortar,Brick & Mortar,Retailer,90002013
2,E-Commerce,E-Commerce,E-Commerce,Retailer,90002010
3,Brick & Mortar,Brick & Mortar,Brick & Mortar,Retailer,90002011
4,Brick & Mortar,Brick & Mortar,Brick & Mortar,Retailer,90002014


In [8]:
sc_dim_market.head(5)

Unnamed: 0,market,sub_zone,region
0,China,ROA,APAC
1,India,India,APAC
2,Indonesia,ROA,APAC
3,Japan,ROA,APAC
4,Pakistan,ROA,APAC


In [9]:
sc_dim_product.head(5)

Unnamed: 0,product_code,division,segment,category,product,variant
0,A0118150101,P & A,Peripherals,Internal HDD,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,Standard
1,A0118150102,P & A,Peripherals,Internal HDD,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,Plus
2,A0118150103,P & A,Peripherals,Internal HDD,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,Premium
3,A0118150104,P & A,Peripherals,Internal HDD,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,Premium Plus
4,A0219150201,P & A,Peripherals,Internal HDD,AQ WereWolf NAS Internal Hard Drive HDD – 8.89 cm,Standard


In [10]:
sc_fact_forecast_monthly.head(5)

Unnamed: 0,date,product_code,customer_code,forecast_quantity,fiscal_year
0,2017-09-01,A6218160101,70008169,146,2018
1,2017-09-01,A6218160101,90008165,120,2018
2,2017-09-01,A6218160101,90008166,216,2018
3,2017-09-01,A6218160101,90008167,141,2018
4,2017-09-01,A6218160101,70008170,85,2018


In [11]:
sc_fact_sales_monthly.head(5)

Unnamed: 0,date,product_code,customer_code,sold_quantity,fiscal_year
0,2017-09-01,A6218160101,70008169,81,2018
1,2017-09-01,A6218160101,90008165,157,2018
2,2017-09-01,A6218160101,90008166,126,2018
3,2017-09-01,A6218160101,90008167,160,2018
4,2017-09-01,A6218160101,70008170,120,2018


In [12]:
sc_freight_cost.head(5)

Unnamed: 0,market,fiscal_year,freight_pct,other_cost_pct
0,Australia,2018,0.0188,0.005
1,Austria,2018,0.0272,0.0053
2,Bangladesh,2018,0.0219,0.0058
3,Brazil,2018,0.0239,0.0033
4,Canada,2018,0.0264,0.0054


In [13]:
sc_gross_price.head(5)

Unnamed: 0,product_code,fiscal_year,gross_price
0,A0118150101,2018,15.3952
1,A0118150101,2019,14.4392
2,A0118150101,2020,16.2323
3,A0118150101,2021,19.0573
4,A0118150102,2018,19.5875


In [14]:
sc_manufacturing_cost.head(5)

Unnamed: 0,product_code,cost_year,manufacturing_cost
0,A0118150101,2018,4.619
1,A0118150101,2019,4.2033
2,A0118150101,2020,5.0207
3,A0118150101,2021,5.5172
4,A0118150102,2018,5.6036


In [15]:
sc_post_invoice_deductions_18_20.head(5)

Unnamed: 0,customer_code,product_code,date,discounts_pct,other_deductions_pct,fiscal_year
0,70002017,A0118150101,2017-09-01,0.265957,0.071871,2018
1,70002017,A0118150101,2017-10-01,0.308992,0.097627,2018
2,70002017,A0118150101,2017-11-01,0.331268,0.075211,2018
3,70002017,A0118150101,2018-01-01,0.295792,0.072036,2018
4,70002017,A0118150101,2018-02-01,0.320787,0.079335,2018


In [16]:
sc_post_invoice_deductions_21_22.head(5)

Unnamed: 0,customer_code,product_code,date,discounts_pct,other_deductions_pct,fiscal_year
0,70002017,A0118150101,2021-01-01,0.265561,0.135387,2021
1,70002017,A0118150101,2021-02-01,0.231823,0.125004,2021
2,70002017,A0118150101,2021-03-01,0.265881,0.140936,2021
3,70002017,A0118150101,2021-05-01,0.245106,0.138077,2021
4,70002017,A0118150101,2021-06-01,0.253668,0.130621,2021


In [17]:
sc_pre_invoice_deductions.head(5)

Unnamed: 0,customer_code,fiscal_year,pre_invoice_discount_pct
0,70002017,2018,0.082442
1,70002017,2019,0.077659
2,70002017,2020,0.073458
3,70002017,2021,0.070269
4,70002017,2022,0.105678


# Request 1
## Can you identify the top three product categories by gross revenue for each fiscal year?

# Request 2
## Analyze the distribution of manufacturing costs across different product divisions and determine which division has the highest average manufacturing cost?

# Request 3
## Analyze the relationship between the 'forecast_quantity' and 'sold_quantity' of products. Considering different product variants, identify whether there's a consistent pattern of over-forecasting or under-forecasting across various product variants.

## Investigate if certain customer segments or regions tend to have higher discrepancies between forecasted and actual sales quantities

## Propose potential strategies to optimize forecasting accuracy based on these findings."