# 0. Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import matplotlib.gridspec as gridspec

from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from kneed import KneeLocator
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from collections import Counter
from tabulate import tabulate

# II. Exploratory Data Analysis

In [2]:
dfsales = pd.read_csv("data/cleaned_data.csv", index_col=0)

## 1. Data Overview and Descriptive Statistics

### Overview

In [3]:
dfsales.head()

Unnamed: 0_level_0,DimGenderId,DimItemId,ItemNo2,StyleID,Style,Color,Size,Gender,Category,SUB Category,...,Quality,Origin Country,WHSSalesPriceDKK,RRSalesPriceDKK,OrderHeaderNumber,OrderLineNumber,Quantity,Amount,Discount,Cost
DimPostingDateId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-03-12,2,347900,F15404308_CLR000021,F15404308,Hoys pants 6528,Black,XL,WOMENSWEAR,Trousers,Trousers,...,Woven,China,315.0,800.0,103099,20001,1.0,294.63,17.71,152.91
2022-03-12,2,347900,F15404308_CLR000021,F15404308,Hoys pants 6528,Black,XL,WOMENSWEAR,Trousers,Trousers,...,Woven,China,315.0,800.0,103106,20002,2.0,589.26,35.34,305.8
2022-03-12,1,778,M00012003_CLR000508,M00012003,Kronos o-n ss 273,White mel,S,MENSWEAR,Tops,Crew neck,...,Jersey,Turkey,90.0,250.0,103035,20002,1.0,78.12,0.0,42.41
2022-03-12,2,348547,F19123672_CLR000021,F19123672,Majan ss shirt 9942,Black,L,WOMENSWEAR,Shirts,Shirts,...,Woven,China,195.0,500.0,IN0001122,60002,2.0,379.45,0.0,147.84
2022-03-12,2,348547,F19123672_CLR000021,F19123672,Majan ss shirt 9942,Black,L,WOMENSWEAR,Shirts,Shirts,...,Woven,China,195.0,500.0,103113,40003,2.0,379.45,0.0,162.08


### Volumetry 

In [4]:
print("Dataset size: " + str(dfsales.shape))
# We have 3 ladder of items: The style, it is the design of an item. The style with a specific color is called a product. The style x color x size is called a SKU.
print("Number of styles: " + str(dfsales['StyleID'].nunique()))
print("Number of products: " + str(dfsales['ItemNo2'].nunique()))
print("Number of SKUs: " + str(dfsales['DimItemId'].nunique()))

Dataset size: (818375, 21)
Number of styles: 322
Number of products: 700
Number of SKUs: 4359


### Data Types

In [5]:
dfsales.dtypes

DimGenderId            int64
DimItemId              int64
ItemNo2               object
StyleID               object
Style                 object
Color                 object
Size                  object
Gender                object
Category              object
SUB Category          object
SUB Category2         object
Quality               object
Origin Country        object
WHSSalesPriceDKK     float64
RRSalesPriceDKK      float64
OrderHeaderNumber     object
OrderLineNumber        int64
Quantity             float64
Amount               float64
Discount             float64
Cost                 float64
dtype: object

In [9]:
dfsales.index = pd.to_datetime(dfsales.index, format="%Y-%m-%d")

### Description

In [10]:
dfsales.describe(include="float")

Unnamed: 0,WHSSalesPriceDKK,RRSalesPriceDKK,Quantity,Amount,Discount,Cost
count,818375.0,818375.0,818375.0,818375.0,818375.0,818375.0
mean,234.737225,603.471367,1.872136,441.772587,50.959565,178.263226
std,144.077196,360.547892,4.430286,815.981699,259.153296,339.761034
min,0.0,0.0,-1.0,-477.38,-477.38,-453.36
25%,110.0,300.0,1.0,176.09,0.0,61.7
50%,195.0,500.0,1.0,307.95,13.57,123.83
75%,320.0,800.0,2.0,500.0,36.56,187.91
max,1480.0,3700.0,1500.0,215758.56,52976.0,75360.0


In [11]:
dfsales.describe(include="object")

Unnamed: 0,ItemNo2,StyleID,Style,Color,Size,Gender,Category,SUB Category,SUB Category2,Quality,Origin Country,OrderHeaderNumber
count,818375,818375,818375,818375,818375,818375,818375,818375,818375,818375,818375,818375
unique,700,322,323,176,68,3,18,30,5,7,7,108656
top,M00012003_CLR000023,M00012003,Kronos o-n ss 273,Black,M,WOMENSWEAR,Tops,Crew neck,Short sleeve,Jersey,China,IN0149693
freq,18564,76833,76833,229953,182660,416289,377830,335886,274536,432931,397313,716


### Report

In [12]:
# Generate the report
profile = ProfileReport(dfsales, title="Sales of Basic Product")

# Save the report to .html
profile.to_file("BasicSales.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'cannot reindex on an axis with duplicate labels')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## 2. Feature Assessment and Visualization

#### Sales Pattern

In [13]:
sns.set(rc={'figure.figsize': (20,8)})
ax = sns.lineplot(data=dfsales,y="Quantity", x=dfsales.index.strftime("%Y-%W"), estimator="sum")
ax.tick_params(axis="x", rotation=90)

In [14]:
sns.set(rc={'figure.figsize': (20,8)})
ax = sns.lineplot(data=dfsales,y="Quantity", x=dfsales.index.month, estimator="sum", hue=dfsales.index.year)
ax.tick_params(axis="x", rotation=0)


In [15]:
sns.set(rc={'figure.figsize': (20,8)})
ax = sns.lineplot(data=dfsales,y="Quantity", x=dfsales.index.strftime("%W"),estimator="sum", hue=dfsales.index.year)
ax.tick_params(axis="x", rotation=0)

#### Product categories

In [16]:
unique_product_cat = dfsales[['Category','SUB Category','SUB Category2']].drop_duplicates()
unique_product_cat.sort_values(by=['Category', 'SUB Category', 'SUB Category2']).reset_index(drop=True)

Unnamed: 0,Category,SUB Category,SUB Category2
0,Accessories,Accessories,Default
1,Bags,Tote,Default
2,Blazers,Blazers,Default
3,Blazers,Blazers,Long sleeve
4,Dresses,Maxi,Default
5,Dresses,Maxi,Long sleeve
6,Dresses,Maxi,Sleeveless
7,Dresses,Midi,Default
8,Dresses,Midi,Long sleeve
9,Dresses,Midi,Sleeveless


In [17]:
# Bar plot of quantity sold for each categories
temp = dfsales.groupby(["Category"])["Quantity"].sum().sort_values(ascending=False)
ax = sns.barplot(data=dfsales, y="Category", x="Quantity", estimator="sum", hue="Category", errorbar=None, order=temp.index)
for i in ax.containers:
    ax.bar_label(i,)

#### Sales Distribution in Each Category

In [18]:
fig, axs = plt.subplots(nrows=dfsales["Category"].nunique(), figsize=(15,100))
j=0
for i in dfsales["Category"].unique():
    temp = dfsales[dfsales["Category"]==i]
    sns.lineplot(data=temp, y="Quantity", x=temp.index.month, estimator="sum", hue=temp.index.year, ax=axs[j]).set(title=f"Sales distribution of {i}", xlabel="Month")
    j+=1