# This file is temporary (will create python file for visualization and outcomes)

In this notebook we will take a dive into the costs and profits of all our orders. We will see, which products are the most profitable and which ones should, perhaps, be taken of the list.   
Among others, we will see:
- which product categories are ordered most often;
- which product categories are most often cancelled or possibly fraudulant;
- which products in each category bring in the most and the least profit;
- how do the customers types differ in the things they order;
- on which categories and which products is the company losing the most;
- what are the most and least expensive products;
- which categories are the most and least expensive, and averages, distribution of prices, kurtosis;
- which categories are the most discounted ones;
- how many sales are made per customer and sales total per customer;
- do small customer clustering via their buying patterns and PCA;
- if there are any trends in the order time and dates;
- if there is any weekly pattern in the shipement;
- many others ;)

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import statsmodels.api as sm
from scipy.stats import kurtosis, skew
from datetime import datetime

pd.set_option('display.max_columns', 500)

In [14]:
df = pd.read_parquet('data/SupplyChainDataset_cleaned.parquet')

df.head()

Unnamed: 0,Type,Benefit per order,Sales per customer,Category Id,Customer City,Customer Country,Customer Id,Customer Segment,Customer State,Department Id,Market,Order Country,Order date,Order Id,Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Product Price,Order Item Profit Ratio,Order Item Quantity,Sales,Order Item Total,Order Profit Per Order,Order Region,Order Status,Product Card Id,Product Price,Shipping date,Shipping Mode,Shipping time,Shipping day,Order time,Order day,Target shipping days
0,DEBIT,91.25,314.64,73,Caguas,Puerto Rico,20755,Consumer,PR,2,Pacific Asia,Indonesia,2018-01-31 22:56:00,77202,13.11,0.04,180517,327.75,0.29,1,327.75,314.64,91.25,Southeast Asia,COMPLETE,1360,327.75,2018-03-02 22:56:00,Standard Class,22:56,Friday,22:56,Wednesday,-1
1,TRANSFER,-249.09,311.36,73,Caguas,Puerto Rico,19492,Consumer,PR,2,Pacific Asia,India,2018-01-13 12:27:00,75939,16.39,0.05,179254,327.75,-0.8,1,327.75,311.36,-249.09,South Asia,PENDING,1360,327.75,2018-01-18 12:27:00,Standard Class,12:27,Thursday,12:27,Saturday,1
2,CASH,-247.78,309.72,73,San Jose,EE. UU.,19491,Consumer,CA,2,Pacific Asia,India,2018-01-13 12:06:00,75938,18.03,0.06,179253,327.75,-0.8,1,327.75,309.72,-247.78,South Asia,CLOSED,1360,327.75,2018-01-17 12:06:00,Standard Class,12:06,Wednesday,12:06,Saturday,0
3,DEBIT,22.86,304.81,73,Los Angeles,EE. UU.,19490,Home Office,CA,2,Pacific Asia,Australia,2018-01-13 11:45:00,75937,22.94,0.07,179252,327.75,0.08,1,327.75,304.81,22.86,Oceania,COMPLETE,1360,327.75,2018-01-16 11:45:00,Standard Class,11:45,Tuesday,11:45,Saturday,-1
4,PAYMENT,134.21,298.25,73,Caguas,Puerto Rico,19489,Corporate,PR,2,Pacific Asia,Australia,2018-01-13 11:24:00,75936,29.5,0.09,179251,327.75,0.45,1,327.75,298.25,134.21,Oceania,PENDING_PAYMENT,1360,327.75,2018-01-15 11:24:00,Standard Class,11:24,Monday,11:24,Saturday,-2


In [15]:
df.describe(include='all')

  df.describe(include='all')
  df.describe(include='all')


Unnamed: 0,Type,Benefit per order,Sales per customer,Category Id,Customer City,Customer Country,Customer Id,Customer Segment,Customer State,Department Id,Market,Order Country,Order date,Order Id,Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Product Price,Order Item Profit Ratio,Order Item Quantity,Sales,Order Item Total,Order Profit Per Order,Order Region,Order Status,Product Card Id,Product Price,Shipping date,Shipping Mode,Shipping time,Shipping day,Order time,Order day,Target shipping days
count,180519,180519.0,180519.0,180519.0,180519,180519,180519.0,180519,180519,180519.0,180519,180519,180519,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0,180519,180519,180519.0,180519.0,180519,180519,180519,180519,180519,180519,180519.0
unique,4,,,,563,2,,3,46,,5,164,65752,,,,,,,,,,,23,9,,,63701,4,1440,7,1440,7,
top,DEBIT,,,,Caguas,EE. UU.,,Consumer,PR,,LATAM,Estados Unidos,2016-12-14 12:29:00,,,,,,,,,,,Central America,COMPLETE,,,2016-05-01 05:58:00,Standard Class,00:07,Monday,01:55,Thursday,
freq,69295,,,,66770,111146,,93504,69373,,51594,24840,5,,,,,,,,,,,28341,59491,,,10,107752,169,25989,166,26060,
first,,,,,,,,,,,,,2015-01-01 00:00:00,,,,,,,,,,,,,,,2015-01-02 00:08:00,,,,,,
last,,,,,,,,,,,,,2018-12-01 23:51:00,,,,,,,,,,,,,,,2018-12-01 23:51:00,,,,,,
mean,,21.974989,183.107607,31.851451,,,6691.379495,,,5.44346,,,,36221.894903,20.664741,0.101668,90260.0,141.23255,0.120647,2.127638,203.772096,183.107607,21.974989,,,692.509764,141.23255,,,,,,,0.565807
std,,104.433526,120.043668,15.640064,,,4162.918106,,,1.629246,,,,21045.379569,21.800901,0.070415,52111.490959,139.732492,0.466796,1.453451,132.273077,120.043668,104.433526,,,336.446807,139.732492,,,,,,,1.490966
min,,-4274.98,7.49,2.0,,,1.0,,,2.0,,,,1.0,0.0,0.0,1.0,9.99,-2.75,1.0,9.99,7.49,-4274.98,,,19.0,9.99,,,,,,,-2.0
25%,,7.0,104.38,18.0,,,3258.5,,,4.0,,,,18057.0,5.4,0.04,45130.5,50.0,0.08,1.0,119.980003,104.38,7.0,,,403.0,50.0,,,,,,,0.0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 34 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   Type                      180519 non-null  object        
 1   Benefit per order         180519 non-null  float64       
 2   Sales per customer        180519 non-null  float64       
 3   Category Id               180519 non-null  int64         
 4   Customer City             180519 non-null  object        
 5   Customer Country          180519 non-null  object        
 6   Customer Id               180519 non-null  int64         
 7   Customer Segment          180519 non-null  object        
 8   Customer State            180519 non-null  object        
 9   Department Id             180519 non-null  int64         
 10  Market                    180519 non-null  object        
 11  Order Country             180519 non-null  object        
 12  Or

In [17]:
df['Shipping date'] = df['Shipping date'].dt.date
df['Order date'] = df['Order date'].dt.date

In [23]:
df.isnull().sum().sum() # no missing values

0

In [22]:
df.duplicated().sum()

0

In [43]:
id_values = ['Category Id', 'Customer Id','Department Id', 'Order Id', 'Order Item Id', 'Product Card Id']
df[id_values] = df[id_values].astype('category')

In [101]:
for a in numeric_columns:
    print(sum(df[a]))

3966902.9700005623
33054402.04000918
3730378.4000016865
18353.040090266222
25495158.680586748
21779.009992063595
384079
36784735.01340458
33054402.04000918
3966902.9700005623
25495158.680586748
102139


In [44]:
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
numeric_columns

Index(['Benefit per order', 'Sales per customer', 'Order Item Discount',
       'Order Item Discount Rate', 'Order Item Product Price',
       'Order Item Profit Ratio', 'Order Item Quantity', 'Sales',
       'Order Item Total', 'Order Profit Per Order', 'Product Price',
       'Target shipping days'],
      dtype='object')

In [102]:
def extended_statistics(dataframe, columns):
    mean = dataframe[columns].mean()
    stand_dev = dataframe[columns].std()
    median = dataframe[columns].median()
    summary = [dataframe[c].sum() for c in columns]
    skewness = [skew(dataframe[c]) for c in columns]
    kurtos = [kurtosis(dataframe[c]) for c in columns]
    outlier = [len(outliers(dataframe, c)) for c in columns]
    
    es = list(zip(mean, stand_dev, median, summary, skewness, kurtos, outlier))
    es_df = pd.DataFrame(es, columns=[ 'Mean', 'Standard Deviation', 'Median', 'Summary', 
                                      'Skewness', 'Kurtosis', 'Outliers Count'],)
    es_df.index = columns
    es_df = es_df.round(4)
    
    return es_df

# def extended_statistics(dataframe, columns):
#     es = pd.DataFrame({'Mean': dataframe[columns].mean(),
#                        'Standard deviation': dataframe[columns].std(),
#                        'Median': dataframe[columns].median(),
#                        'Sum': [dataframe[c].sum() for c in columns],
#                        'Skewness': [skew(dataframe[c]) for c in columns],
#                        'Kurtosis': [kurtosis(dataframe[c]) for c in columns],
#                        'Outliers count': [len(outliers(dataframe, c)) for c in columns]})

#     return es

In [54]:
def outliers(dataframe, column):
    outliers = []
    q1 = np.percentile(sorted(dataframe[column]), 25)
    q3 = np.percentile(sorted(dataframe[column]), 75)
    IQR = q3 - q1
    lwr_bound = q1 - (1.5 * IQR)
    upr_bound = q3 + (1.5 * IQR)
    for i in sorted(dataframe[column]): 
        if (i < lwr_bound or i > upr_bound):
            outliers.append(i)
    return outliers


In [103]:
extended_statistics(df, numeric_columns)

AttributeError: 'list' object has no attribute 'round'

In [68]:
len(outliers(df, 'Benefit per order'))

18942

In [70]:
c = [len(outliers(df, c)) for c in numeric_columns]
c

[18942, 1943, 7537, 0, 2048, 17300, 0, 488, 1943, 18942, 2048, 35701]

In [72]:
k = [kurtosis(df[c]) for c in numeric_columns]
k

[71.37524861020434,
 23.91966767743895,
 25.230535166833157,
 -0.9011651400087111,
 23.31231853044674,
 10.156909953713836,
 -0.7537139387454612,
 23.935865040496115,
 23.91966767743895,
 71.37524861020434,
 23.31231853044674,
 -0.2922621130873413]