### import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FixedLocator, MaxNLocator

### settings

In [2]:
%matplotlib inline

pd.set_option('display.max_rows', None)

### import data

In [3]:
%store -r sc_dataframes_cleaned

In [4]:
sc_dataframes_cleaned.keys()

dict_keys(['sc_dim_customer', 'sc_dim_market', 'sc_dim_product', 'sc_fact_forecast_monthly', 'sc_fact_sales_monthly', 'sc_freight_cost', 'sc_gross_price', 'sc_manufacturing_cost', 'sc_post_invoice_deductions_18_20', 'sc_post_invoice_deductions_21_22', 'sc_pre_invoice_deductions'])

In [5]:
# Create copies of each dataframe

sc_dim_customer = sc_dataframes_cleaned['sc_dim_customer'].copy()
sc_dim_market = sc_dataframes_cleaned['sc_dim_market'].copy()
sc_dim_product = sc_dataframes_cleaned['sc_dim_product'].copy()
sc_fact_forecast_monthly = sc_dataframes_cleaned['sc_fact_forecast_monthly'].copy()
sc_fact_sales_monthly = sc_dataframes_cleaned['sc_fact_sales_monthly'].copy()
sc_freight_cost = sc_dataframes_cleaned['sc_freight_cost'].copy()
sc_gross_price = sc_dataframes_cleaned['sc_gross_price'].copy()
sc_manufacturing_cost = sc_dataframes_cleaned['sc_manufacturing_cost'].copy()
sc_post_invoice_deductions_18_20 = sc_dataframes_cleaned['sc_post_invoice_deductions_18_20'].copy()
sc_post_invoice_deductions_21_22 = sc_dataframes_cleaned['sc_post_invoice_deductions_21_22'].copy()
sc_pre_invoice_deductions = sc_dataframes_cleaned['sc_pre_invoice_deductions'].copy()

### Analysis

In [6]:
# Get all column names to input into large language model

# Initialize an empty set to store unique column names
all_column_names = set()

# Iterate over each key-value pair in the dictionary
for df_name, df in sc_dataframes_cleaned.items():
    # Get the column names of the current DataFrame and add them to the set
    all_column_names.update(df.columns)
    
# List all column names
list(all_column_names)

['cost_of_goods',
 'region',
 'product',
 'gross_margin_pct',
 'category',
 'segment',
 'variant',
 'net_sales',
 'customer',
 'net_profit',
 'net_invoice_sales',
 'platform',
 'gross_price',
 'other_cost_pct',
 'fiscal_year',
 'cost_year',
 'sub_zone',
 'discounts_pct',
 'date',
 'market',
 'product_code',
 'sold_quantity',
 'customer_code',
 'channel',
 'pre_invoice_discount_pct',
 'freight_pct',
 'manufacturing_cost',
 'gross_sales',
 'forecast_quantity',
 'other_deductions_pct',
 'division']

In [7]:
sc_dim_customer.head(5)

Unnamed: 0,customer_code,customer,market,platform,channel
0,90002012,Electricalsocity,India,Brick & Mortar,Retailer
1,90002013,Electricalslytical,India,Brick & Mortar,Retailer
2,90002010,Ebay,India,E-Commerce,Retailer
3,90002011,Atliq Exclusive,India,Brick & Mortar,Retailer
4,90002014,Expression,India,Brick & Mortar,Retailer


In [8]:
sc_dim_market.head(5)

Unnamed: 0,market,sub_zone,region
0,China,ROA,APAC
1,India,India,APAC
2,Indonesia,ROA,APAC
3,Japan,ROA,APAC
4,Pakistan,ROA,APAC


In [9]:
sc_dim_product.head(5)

Unnamed: 0,product_code,product,division,segment,category,variant
0,A0118150101,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,P & A,Peripherals,Internal HDD,Standard
1,A0118150102,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,P & A,Peripherals,Internal HDD,Plus
2,A0118150103,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,P & A,Peripherals,Internal HDD,Premium
3,A0118150104,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,P & A,Peripherals,Internal HDD,Premium Plus
4,A0219150201,AQ WereWolf NAS Internal Hard Drive HDD – 8.89 cm,P & A,Peripherals,Internal HDD,Standard


In [10]:
sc_fact_forecast_monthly.head(5)

Unnamed: 0,date,fiscal_year,product_code,customer_code,forecast_quantity
0,2017-09-01,2018,A6218160101,70008169,146
1,2017-09-01,2018,A6218160101,90008165,120
2,2017-09-01,2018,A6218160101,90008166,216
3,2017-09-01,2018,A6218160101,90008167,141
4,2017-09-01,2018,A6218160101,70008170,85


In [11]:
sc_fact_sales_monthly.head(5)

Unnamed: 0,date,fiscal_year,product_code,customer_code,sold_quantity,gross_sales,net_invoice_sales,net_sales,cost_of_goods,gross_margin_pct,net_profit
0,2017-09-01,2018,A6218160101,70008169,81,1069.2729,967.319139,673.989914,340.676395,49.453784,333.313519
1,2017-09-01,2018,A6218160101,90008165,157,2072.5413,1482.119149,870.389878,660.323383,24.134759,210.066495
2,2017-09-01,2018,A6218160101,90008166,126,1663.3134,1183.649231,647.680403,529.941059,18.178618,117.739344
3,2017-09-01,2018,A6218160101,90008167,160,2112.144,1700.914928,996.808779,672.941027,32.490459,323.867752
4,2017-09-01,2018,A6218160101,70008170,120,1584.108,1283.693887,789.431187,504.70577,36.067161,284.725417


In [12]:
sc_freight_cost.head(5)

Unnamed: 0,market,fiscal_year,freight_pct,other_cost_pct
0,Australia,2018,0.0188,0.005
1,Austria,2018,0.0272,0.0053
2,Bangladesh,2018,0.0219,0.0058
3,Brazil,2018,0.0239,0.0033
4,Canada,2018,0.0264,0.0054


In [13]:
sc_gross_price.head(5)

Unnamed: 0,product_code,fiscal_year,gross_price
0,A0118150101,2018,15.3952
1,A0118150101,2019,14.4392
2,A0118150101,2020,16.2323
3,A0118150101,2021,19.0573
4,A0118150102,2018,19.5875


In [14]:
sc_manufacturing_cost.head(5)

Unnamed: 0,product_code,cost_year,manufacturing_cost
0,A0118150101,2018,4.619
1,A0118150101,2019,4.2033
2,A0118150101,2020,5.0207
3,A0118150101,2021,5.5172
4,A0118150102,2018,5.6036


In [15]:
sc_post_invoice_deductions_18_20.head(5)

Unnamed: 0,date,fiscal_year,customer_code,product_code,discounts_pct,other_deductions_pct
0,2017-09-01,2018,70002017,A0118150101,0.265957,0.071871
1,2017-10-01,2018,70002017,A0118150101,0.308992,0.097627
2,2017-11-01,2018,70002017,A0118150101,0.331268,0.075211
3,2018-01-01,2018,70002017,A0118150101,0.295792,0.072036
4,2018-02-01,2018,70002017,A0118150101,0.320787,0.079335


In [16]:
sc_post_invoice_deductions_21_22.head(5)

Unnamed: 0,date,fiscal_year,customer_code,product_code,discounts_pct,other_deductions_pct
0,2021-01-01,2021,70002017,A0118150101,0.265561,0.135387
1,2021-02-01,2021,70002017,A0118150101,0.231823,0.125004
2,2021-03-01,2021,70002017,A0118150101,0.265881,0.140936
3,2021-05-01,2021,70002017,A0118150101,0.245106,0.138077
4,2021-06-01,2021,70002017,A0118150101,0.253668,0.130621


In [17]:
sc_pre_invoice_deductions.head(5)

Unnamed: 0,customer_code,fiscal_year,pre_invoice_discount_pct
0,70002017,2018,0.082442
1,70002017,2019,0.077659
2,70002017,2020,0.073458
3,70002017,2021,0.070269
4,70002017,2022,0.105678


# Request 1
## Can you identify the top three product categories by gross revenue for each fiscal year?

In [18]:
sc_fact_sales_monthly.head(5)

Unnamed: 0,date,fiscal_year,product_code,customer_code,sold_quantity,gross_sales,net_invoice_sales,net_sales,cost_of_goods,gross_margin_pct,net_profit
0,2017-09-01,2018,A6218160101,70008169,81,1069.2729,967.319139,673.989914,340.676395,49.453784,333.313519
1,2017-09-01,2018,A6218160101,90008165,157,2072.5413,1482.119149,870.389878,660.323383,24.134759,210.066495
2,2017-09-01,2018,A6218160101,90008166,126,1663.3134,1183.649231,647.680403,529.941059,18.178618,117.739344
3,2017-09-01,2018,A6218160101,90008167,160,2112.144,1700.914928,996.808779,672.941027,32.490459,323.867752
4,2017-09-01,2018,A6218160101,70008170,120,1584.108,1283.693887,789.431187,504.70577,36.067161,284.725417


In [29]:
sc_dim_product.head(5)

Unnamed: 0,product_code,product,division,segment,category,variant
0,A0118150101,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,P & A,Peripherals,Internal HDD,Standard
1,A0118150102,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,P & A,Peripherals,Internal HDD,Plus
2,A0118150103,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,P & A,Peripherals,Internal HDD,Premium
3,A0118150104,AQ Dracula HDD – 3.5 Inch SATA 6 Gb/s 5400 RPM...,P & A,Peripherals,Internal HDD,Premium Plus
4,A0219150201,AQ WereWolf NAS Internal Hard Drive HDD – 8.89 cm,P & A,Peripherals,Internal HDD,Standard


In [31]:
# Merge necessary tables

merge = pd.merge(sc_fact_sales_monthly, sc_dim_product, how='left', on='product_code')
merge.head(5)

Unnamed: 0,date,fiscal_year,product_code,customer_code,sold_quantity,gross_sales,net_invoice_sales,net_sales,cost_of_goods,gross_margin_pct,net_profit,product,division,segment,category,variant
0,2017-09-01,2018,A6218160101,70008169,81,1069.2729,967.319139,673.989914,340.676395,49.453784,333.313519,AQ Digit SSD,N & S,Storage,External Solid State Drives,Standard
1,2017-09-01,2018,A6218160101,90008165,157,2072.5413,1482.119149,870.389878,660.323383,24.134759,210.066495,AQ Digit SSD,N & S,Storage,External Solid State Drives,Standard
2,2017-09-01,2018,A6218160101,90008166,126,1663.3134,1183.649231,647.680403,529.941059,18.178618,117.739344,AQ Digit SSD,N & S,Storage,External Solid State Drives,Standard
3,2017-09-01,2018,A6218160101,90008167,160,2112.144,1700.914928,996.808779,672.941027,32.490459,323.867752,AQ Digit SSD,N & S,Storage,External Solid State Drives,Standard
4,2017-09-01,2018,A6218160101,70008170,120,1584.108,1283.693887,789.431187,504.70577,36.067161,284.725417,AQ Digit SSD,N & S,Storage,External Solid State Drives,Standard


In [34]:
# Group by fiscal_year and product category

sales_fiscal_year = merge.groupby(['fiscal_year', 'category'])['gross_sales'].sum().reset_index().rename(columns={'gross_sales':'gross_sales_millions'})

# Round to get rid of scientific notation

sales_fiscal_year['gross_sales_millions'] = (sales_fiscal_year['gross_sales_millions'] / 1000000).round(4)

sales_fiscal_year.head(5)

Unnamed: 0,fiscal_year,category,gross_sales_millions
0,2018,Batteries,0.0
1,2018,Business Laptop,7.8633
2,2018,External Solid State Drives,3.7766
3,2018,Gaming Laptop,0.5389
4,2018,Graphic Card,1.9779


In [41]:
# Define a function to get the top 3 categories

def top_3_categories(group):
    return group.nlargest(3, 'gross_sales_millions')

# Apply the function to the grouped data

top_categories = sales_fiscal_year.groupby('fiscal_year').apply(top_3_categories)

# Reset the index to flatten the DataFrame

top_categories.reset_index(drop=True, inplace=True)

# Add rank column
top_categories['rank'] = (top_categories.groupby('fiscal_year')['gross_sales_millions'].rank(ascending=False, method='dense')).astype('int8')

# Reoganize columns
top_categories = top_categories[['fiscal_year', 'rank', 'category', 'gross_sales_millions']]

top_categories

Unnamed: 0,fiscal_year,rank,category,gross_sales_millions
0,2018,1,Keyboard,11.9281
1,2018,2,Processors,11.8002
2,2018,3,Business Laptop,7.8633
3,2019,1,Keyboard,32.8224
4,2019,2,Wi fi extender,32.6239
5,2019,3,External Solid State Drives,25.9507
6,2020,1,Keyboard,82.758
7,2020,2,Business Laptop,77.0953
8,2020,3,Processors,74.6237
9,2021,1,Keyboard,271.063


# Request 2
## Analyze the distribution of manufacturing costs across different product divisions and determine which division has the highest average manufacturing cost.

# Request 3
## Analyze the relationship between the 'forecast_quantity' and 'sold_quantity' of products. Considering different product variants, identify whether there's a consistent pattern of over-forecasting or under-forecasting across various product variants.

## Investigate if certain customer segments or regions tend to have higher discrepancies between forecasted and actual sales quantities

## Propose potential strategies to optimize forecasting accuracy based on these findings.