# supply chain analysis

## import data from sql server

In [1]:
import sqlalchemy as sa
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sqlalchemy.engine import URL
from sqlalchemy import create_engine
from matplotlib.ticker import FixedLocator, MaxNLocator

%matplotlib inline
%store -r sc_conn_string

In [2]:
pd.set_option('display.max_rows', 20)

In [3]:
sc_conn_url = URL.create("mssql+pyodbc", query={"odbc_connect": sc_conn_string})
engine = create_engine(sc_conn_url)

In [4]:
customers_query = 'SELECT * FROM customers'
products_query = 'SELECT * FROM products'
categories_departments_query = 'SELECT * FROM categories_departments'
orders_query = 'SELECT * FROM orders'
orders_ratio_query = 'SELECT * FROM orders_ratio'
orders_demographic_query = 'SELECT * FROM orders_demographic'
shipping_query = 'SELECT * FROM shipping'

In [5]:
with engine.begin() as connection:
    og_customers = pd.read_sql_query(customers_query, connection)
    og_products = pd.read_sql_query(products_query, connection)
    og_categories_departments = pd.read_sql_query(categories_departments_query, connection)
    og_orders = pd.read_sql_query(orders_query, connection)
    og_orders_ratio = pd.read_sql_query(orders_ratio_query, connection)
    og_orders_demographic = pd.read_sql_query(orders_demographic_query, connection)
    og_shipping = pd.read_sql_query(shipping_query, connection)

In [6]:
customers = og_customers
products = og_products
categories_departments = og_categories_departments
orders = og_orders
orders_ratio = og_orders_ratio
orders_demographic = og_orders_demographic
shipping = og_shipping

## correct dataframe datatypes

### customers table

In [7]:
customers['customer_id'].drop_duplicates().sort_values(ascending=False)

2090     20757
2142     20756
4132     20755
16820    20754
241      20753
         ...  
349          5
17259        4
3857         3
6155         2
9519         1
Name: customer_id, Length: 20652, dtype: int64

In [8]:
customers['customer_id'] = customers['customer_id'].astype('int32')

In [9]:
customers['zipcode'].drop_duplicates().sort_values(ascending=False)

565     99205
255     98632
5124    98390
2549    98226
1131    98208
        ...  
385       685
4047      680
586       674
2496      612
1639      603
Name: zipcode, Length: 995, dtype: int64

In [10]:
customers['zipcode'] = customers['zipcode'].astype('int32')

In [11]:
customers.dtypes

customer_id     int32
first_name     object
last_name      object
segment        object
country        object
state          object
city           object
street         object
zipcode         int32
dtype: object

In [12]:
customers

Unnamed: 0,customer_id,first_name,last_name,segment,country,state,city,street,zipcode
0,11036,Bobby,Smith,Consumer,Puerto Rico,PR,Caguas,4464 Rocky Horse Harbour,725
1,11595,Zachary,Jones,Consumer,EE. UU.,CA,Modesto,777 Blue Hills Court,95355
2,10139,Mary,Gould,Consumer,Puerto Rico,PR,Caguas,6398 Indian Brook Valley,725
3,7277,Justin,Moore,Corporate,EE. UU.,CO,Littleton,7002 Blue Place,80126
4,10893,Lisa,Smith,Corporate,EE. UU.,CA,Riverside,3763 Noble Prairie Heights,92503
...,...,...,...,...,...,...,...,...,...
20647,16382,Rowan,Robbins,Home Office,Puerto Rico,PR,Caguas,1898 Velvet Villas,725
20648,18492,Hedy,Rivas,Consumer,Puerto Rico,PR,Caguas,1173 Silver Isle,725
20649,14086,Nelle,Hyde,Home Office,EE. UU.,NJ,Princeton,2407 Thunder Mountain,8540
20650,15424,Callie,Rodriguez,Home Office,EE. UU.,CA,Folsom,1836 Cozy View Orchard,95630


### products table

In [13]:
products['product_id'].drop_duplicates().sort_values(ascending=False)

14     1363
103    1362
56     1361
97     1360
105    1359
       ... 
4        44
88       37
78       35
61       24
20       19
Name: product_id, Length: 118, dtype: int64

In [14]:
products['product_id'] = products['product_id'].astype('int16')

In [15]:
products['category_id'].drop_duplicates().sort_values(ascending=False)

14     76
103    75
56     74
97     73
105    72
       ..
111     6
31      5
45      4
4       3
20      2
Name: category_id, Length: 51, dtype: int64

In [16]:
products['category_id'] = products['category_id'].astype('int16')

In [17]:
products['department_id'].drop_duplicates().sort_values(ascending=False)

22     12
105    11
0      10
7       9
69      8
2       7
3       6
12      5
1       4
21      3
4       2
Name: department_id, dtype: int64

In [18]:
products['department_id'] = products['department_id'].astype('int8')

In [19]:
products['price'].drop_duplicates().sort_values(ascending=False)

64     1999.99
0      1500.00
112     999.99
102     599.99
85      532.58
        ...   
19       15.99
29       14.99
56       11.54
52       11.29
98        9.99
Name: price, Length: 75, dtype: float64

In [20]:
products['price'] = products['price'].astype('float32')

In [21]:
products.dtypes

product_id         int16
product_name      object
category_id        int16
department_id       int8
price            float32
product_image     object
dtype: object

In [22]:
products

Unnamed: 0,product_id,product_name,category_id,department_id,price,product_image
0,1351,Dell Laptop,64,10,1500.000000,http://images.acmesports.sports/Dell+Laptop
1,1350,Children's heaters,63,4,357.100006,http://images.acmesports.sports/Children+heaters
2,1004,Field & Stream Sportsman 16 Gun Fire Safe,45,7,399.980011,http://images.acmesports.sports/Field+%26+Stre...
3,835,Bridgestone e6 Straight Distance NFL Carolina,37,6,31.990000,http://images.acmesports.sports/Bridgestone+e6...
4,44,adidas Men's F10 Messi TRX FG Soccer Cleat,3,2,59.990002,http://images.acmesports.sports/adidas+Men%27s...
...,...,...,...,...,...,...
113,226,Bowflex SelectTech 1090 Dumbbells,11,3,599.989990,http://images.acmesports.sports/Bowflex+Select...
114,677,TaylorMade White Smoke IN-12 Putter,31,6,99.989998,http://images.acmesports.sports/TaylorMade+Whi...
115,777,Bag Boy M330 Push Cart,35,6,79.989998,http://images.acmesports.sports/Bag+Boy+M330+P...
116,303,Garmin Forerunner 910XT GPS Watch,38,6,399.989990,http://images.acmesports.sports/Garmin+Forerun...


### categories_departments table

In [23]:
categories_departments['category_id'].drop_duplicates().sort_values(ascending=False)

9     76
47    75
46    74
31    73
0     72
      ..
45     6
13     5
11     4
41     3
20     2
Name: category_id, Length: 51, dtype: int64

In [24]:
categories_departments['category_id'] = categories_departments['category_id'].astype('int16')

In [25]:
categories_departments['department_id'].drop_duplicates().sort_values(ascending=False)

18    12
0     11
3     10
10     9
5      8
12     7
6      6
14     5
2      4
1      3
8      2
Name: department_id, dtype: int64

In [26]:
categories_departments['department_id'] = categories_departments['department_id'].astype('int8')

In [27]:
categories_departments.dtypes

category_id       int16
category         object
department_id      int8
department       object
dtype: object

In [28]:
categories_departments

Unnamed: 0,category_id,category,department_id,department
0,72,Pet Supplies,11,Pet Shop
1,16,As Seen on TV!,3,Footwear
2,60,Baby,4,Apparel
3,65,Consumer Electronics,10,Technology
4,70,Men's Clothing,4,Apparel
...,...,...,...,...
46,74,Toys,7,Fan Shop
47,75,Video Games,9,Discs Shop
48,9,Cardio Equipment,3,Footwear
49,62,Cameras,10,Technology


### orders table

In [29]:
orders['order_id'].drop_duplicates().sort_values(ascending=False)

49740     77204
84559     77203
88093     77202
22678     77201
131160    77200
          ...  
33042         7
24993         5
10993         4
28926         2
105188        1
Name: order_id, Length: 65752, dtype: int64

In [30]:
orders['order_id'] = orders['order_id'].astype('int32')

In [31]:
orders['transaction_id'].drop_duplicates().sort_values(ascending=False)

49740     180519
84559     180518
88093     180517
22678     180516
131160    180515
           ...  
10993          5
58160          4
28926          3
169389         2
105188         1
Name: transaction_id, Length: 180519, dtype: int64

In [32]:
orders['transaction_id'] = orders['transaction_id'].astype('int32')

In [33]:
orders['customer_id'].drop_duplicates().sort_values(ascending=False)

49740     20757
84559     20756
88093     20755
22678     20754
131160    20753
          ...  
8659          5
3359          4
469           3
2144          2
162421        1
Name: customer_id, Length: 20652, dtype: int64

In [34]:
orders['customer_id'] = orders['customer_id'].astype('int32')

In [35]:
orders['product_id'].drop_duplicates().sort_values(ascending=False)

98      1363
189     1362
364     1361
30      1360
834     1359
        ... 
475       44
68        37
3109      35
3416      24
1353      19
Name: product_id, Length: 118, dtype: int64

In [36]:
orders['product_id'] = orders['product_id'].astype('int16')

In [37]:
orders['price'].drop_duplicates().sort_values(ascending=False)

15427    1999.99
750      1500.00
61632     999.99
21324     599.99
122       532.58
          ...   
602        15.99
48         14.99
364        11.54
264        11.29
139         9.99
Name: price, Length: 75, dtype: float64

In [38]:
orders['price'] = orders['price'].astype('float32')

In [39]:
orders['quantity'].drop_duplicates().sort_values(ascending=False)

9    5
6    4
7    3
4    2
0    1
Name: quantity, dtype: object

In [40]:
orders['quantity'] = orders['quantity'].astype('int16')

In [41]:
orders['gross_sale'].drop_duplicates().sort_values(ascending=False)

15427    1999.99
750      1500.00
61632     999.99
21324     599.99
122       532.58
          ...   
602        15.99
48         14.99
364        11.54
264        11.29
1781        9.99
Name: gross_sale, Length: 193, dtype: float64

In [42]:
orders['gross_sale'] = orders['gross_sale'].astype('float32')

In [43]:
orders['discount_pct'].drop_duplicates().sort_values(ascending=False)

4             0.25
9      0.200000003
26     0.180000007
25     0.170000002
41     0.159999996
1      0.150000006
19     0.129999995
18     0.119999997
0      0.100000001
75     0.090000004
40            0.07
7      0.055013753
6      0.050000001
22     0.039999999
13     0.029999999
11            0.02
114           0.01
23               0
Name: discount_pct, dtype: object

In [44]:
orders['discount_pct'] = orders['discount_pct'].astype('float32')

In [45]:
orders['discount'].drop_duplicates().sort_values(ascending=False)

116065    500.00
54761     400.00
9008      375.00
55183     360.00
97849     340.00
           ...  
2386        0.15
12721       0.12
3214        0.11
148080      0.10
23          0.00
Name: discount, Length: 1017, dtype: float64

In [46]:
orders['discount'] = orders['discount'].astype('float32')

In [47]:
orders['net_sale'].drop_duplicates().sort_values(ascending=False)

53637     1939.99
36817     1919.99
44264     1899.99
32885     1889.99
127983    1859.99
           ...   
46277        8.39
23328        8.29
51262        8.19
27514        7.99
19974        7.49
Name: net_sale, Length: 2931, dtype: float64

In [48]:
orders['net_sale'] = orders['net_sale'].astype('float32')

In [49]:
orders.dtypes

order_id                   int32
transaction_id             int32
customer_id                int32
order_date        datetime64[ns]
payment_type              object
product_id                 int16
price                    float32
quantity                   int16
gross_sale               float32
discount_pct             float32
discount                 float16
net_sale                 float32
dtype: object

In [50]:
orders

Unnamed: 0,order_id,transaction_id,customer_id,order_date,payment_type,product_id,price,quantity,gross_sale,discount_pct,discount,net_sale
0,5853,14565,9485,2015-03-27 10:12:00,PAYMENT,1004,399.980011,1,399.980011,0.10,40.000000,359.980011
1,65537,163801,968,2017-08-14 16:10:00,DEBIT,1073,199.990005,1,199.990005,0.15,30.000000,169.990005
2,22961,57465,3860,2015-12-02 03:54:00,PAYMENT,403,129.990005,1,129.990005,0.10,13.000000,116.989998
3,14430,36094,11937,2015-07-30 15:07:00,PAYMENT,191,99.989998,1,99.989998,0.15,15.000000,84.989998
4,49042,122621,10713,2016-12-16 21:14:00,TRANSFER,365,59.990002,2,119.980003,0.25,30.000000,89.980003
...,...,...,...,...,...,...,...,...,...,...,...,...
180514,65752,164333,5049,2017-08-17 19:30:00,DEBIT,1004,399.980011,1,399.980011,0.07,28.000000,371.980011
180515,1044,2603,2444,2015-01-16 05:24:00,PAYMENT,1014,49.980000,5,249.899994,0.13,32.500000,217.410004
180516,4049,10066,7956,2015-03-01 02:11:00,PAYMENT,1014,49.980000,1,49.980000,0.00,0.000000,49.980000
180517,8546,21325,12192,2015-05-05 17:41:00,PAYMENT,403,129.990005,1,129.990005,0.12,15.601562,114.389999


### orders_ratio table

In [51]:
orders['net_sale'].drop_duplicates().sort_values(ascending=False)

53637     1939.98999
36817     1919.98999
44264     1899.98999
32885     1889.98999
127983    1859.98999
             ...    
46277        8.39000
23328        8.29000
51262        8.19000
27514        7.99000
19974        7.49000
Name: net_sale, Length: 2931, dtype: float32

In [52]:
orders_ratio['order_id'] = orders_ratio['order_id'].astype('int32')
orders_ratio['transaction_id'] = orders_ratio['transaction_id'].astype('int32')
orders_ratio['customer_id'] = orders_ratio['customer_id'].astype('int32')
orders_ratio['net_sale'] = orders_ratio['net_sale'].astype('float32')
orders_ratio['item_profit_ratio'] = orders_ratio['item_profit_ratio'].astype('float32')
orders_ratio['profit_per_order'] = orders_ratio['profit_per_order'].astype('float32')
orders_ratio['benefit_per_order'] = orders_ratio['benefit_per_order'].astype('float32')
orders_ratio['sales_per_customer'] = orders_ratio['sales_per_customer'].astype('float32')

In [53]:
orders_ratio.dtypes

order_id                       int32
transaction_id                 int32
customer_id                    int32
order_date            datetime64[ns]
net_sale                     float32
item_profit_ratio            float32
profit_per_order             float32
benefit_per_order            float32
sales_per_customer           float32
dtype: object

In [54]:
orders_ratio

Unnamed: 0,order_id,transaction_id,customer_id,order_date,net_sale,item_profit_ratio,profit_per_order,benefit_per_order,sales_per_customer
0,20886,52187,3074,2015-11-01 20:56:00,107.889999,-0.16,-16.940001,-16.940001,107.889999
1,16431,41020,1148,2015-08-28 20:09:00,172.770004,0.09,15.200000,15.200000,172.770004
2,9124,22767,5421,2015-05-14 04:11:00,83.650002,-0.43,-36.220001,-36.220001,83.650002
3,51532,128780,83,2017-01-22 05:36:00,245.960007,0.33,81.169998,81.169998,245.960007
4,57518,143889,6977,2017-04-19 14:46:00,286.000000,0.28,80.080002,80.080002,286.000000
...,...,...,...,...,...,...,...,...,...
180514,52466,131135,6531,2017-02-04 20:49:00,163.770004,0.45,73.699997,73.699997,163.770004
180515,2522,6308,10999,2015-02-06 19:13:00,185.949997,0.08,13.950000,13.950000,185.949997
180516,19818,49531,12217,2015-10-17 06:46:00,219.910004,0.49,107.760002,107.760002,219.910004
180517,26558,66497,5585,2016-01-23 16:06:00,113.089996,0.31,35.060001,35.060001,113.089996


### orders_demographic table

In [55]:
orders_demographic['order_id'] = orders_demographic['order_id'].astype('int32')
orders_demographic['transaction_id'] = orders_demographic['transaction_id'].astype('int32')
orders_demographic['customer_id'] = orders_demographic['customer_id'].astype('int32')
orders_demographic['latitiude'] = orders_demographic['latitiude'].astype('float32')
orders_demographic['longitude'] = orders_demographic['longitude'].astype('float32')

In [56]:
orders_demographic.dtypes

order_id                   int32
transaction_id             int32
order_date        datetime64[ns]
type                      object
customer_id                int32
latitiude                float32
longitude                float32
market                    object
region                    object
country                   object
state                     object
city                      object
zipcode                   object
order_status              object
dtype: object

In [60]:
orders_demographic['zipcode'] = orders_demographic['zipcode'].replace({None: float('nan')})

In [61]:
orders_demographic

Unnamed: 0,order_id,transaction_id,order_date,type,customer_id,latitiude,longitude,market,region,country,state,city,zipcode,order_status
0,11211,28025,2015-06-13 15:21:00,DEBIT,7537,29.455990,-98.528091,Europe,Western Europe,Austria,Viena,Viena,,COMPLETE
1,15614,39036,2015-08-16 21:55:00,CASH,1521,33.912216,-118.352570,Europe,Western Europe,Francia,Languedoc-Rosellón-Mediodía-Pirineos,Tournefeuille,,CLOSED
2,32801,82056,2016-04-23 19:18:00,PAYMENT,5849,38.453041,-90.306450,USCA,West of USA,Estados Unidos,California,Los Angeles,90004,PENDING_PAYMENT
3,64222,160566,2017-07-26 11:28:00,TRANSFER,4848,36.933971,-121.743462,Europe,Southern Europe,Italia,Piamonte,Turin,,PENDING
4,59324,148514,2017-05-15 23:29:00,TRANSFER,4515,18.260149,-66.370552,LATAM,Central America,México,Durango,Gómez Palacio,,PROCESSING
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180514,23666,59213,2015-12-12 10:54:00,PAYMENT,4038,42.365345,-87.835899,Pacific Asia,South Asia,India,Rajastán,Ganganagar,,PENDING_PAYMENT
180515,20487,51133,2015-10-27 01:09:00,DEBIT,698,18.227573,-66.043716,Pacific Asia,South Asia,Nepal,Central,Kathmandu,,COMPLETE
180516,57434,143671,2017-04-18 09:20:00,PAYMENT,290,18.224083,-66.047104,LATAM,Central America,El Salvador,San Salvador,Apopa,,PENDING_PAYMENT
180517,55055,137687,2017-03-14 15:52:00,PAYMENT,11978,18.203072,-66.370514,LATAM,South America,Colombia,Tolima,Ibagué,,PAYMENT_REVIEW


### shipping table

In [62]:
shipping.dtypes

order_id                            int64
transaction_id                      int64
order_order_date           datetime64[ns]
customer_id                         int64
product_id                          int64
shipping_date              datetime64[ns]
mode                               object
days_shipping_scheduled            object
days_shipping_real                 object
delivery_status                    object
risk                               object
dtype: object

In [63]:
shipping

Unnamed: 0,order_id,transaction_id,order_order_date,customer_id,product_id,shipping_date,mode,days_shipping_scheduled,days_shipping_real,delivery_status,risk
0,44371,110769,2016-10-09 16:47:00,9540,627,2016-10-10 04:47:00,Same Day,0,1,Late delivery,1
1,18065,45146,2015-09-21 16:37:00,5684,1014,2015-09-27 16:37:00,Standard Class,4,6,Late delivery,1
2,33158,82891,2016-04-29 00:22:00,224,502,2016-05-01 00:22:00,First Class,1,2,Late delivery,1
3,73677,176992,2017-12-11 11:59:00,17230,1362,2017-12-14 11:59:00,Standard Class,4,3,Advance shipping,0
4,5834,14520,2015-03-27 03:33:00,3293,1004,2015-04-01 03:33:00,Standard Class,4,5,Late delivery,1
...,...,...,...,...,...,...,...,...,...,...,...
180514,8364,20897,2015-05-03 01:55:00,3411,1004,2015-05-08 01:55:00,Second Class,2,5,Late delivery,1
180515,14594,36501,2015-08-02 00:34:00,1502,502,2015-08-07 00:34:00,Standard Class,4,5,Late delivery,1
180516,38897,97110,2016-07-21 19:00:00,5099,403,2016-07-24 19:00:00,Second Class,2,3,Late delivery,1
180517,66417,166028,2017-08-27 12:28:00,11303,957,2017-08-30 12:28:00,Standard Class,4,3,Advance shipping,0


In [58]:
all_column_name = (
    customers.columns.tolist() +
    products.columns.tolist() +
    categories_departments.columns.tolist() +
    orders.columns.tolist() +
    orders_demographic.columns.tolist() +
    shipping.columns.tolist()
    )
all_column_name

['customer_id',
 'first_name',
 'last_name',
 'segment',
 'country',
 'state',
 'city',
 'street',
 'zipcode',
 'product_id',
 'product_name',
 'category_id',
 'department_id',
 'price',
 'product_image',
 'category_id',
 'category',
 'department_id',
 'department',
 'order_id',
 'transaction_id',
 'customer_id',
 'order_date',
 'payment_type',
 'product_id',
 'price',
 'quantity',
 'gross_sale',
 'discount_pct',
 'discount',
 'net_sale',
 'order_id',
 'transaction_id',
 'order_date',
 'type',
 'customer_id',
 'latitiude',
 'longitude',
 'market',
 'region',
 'country',
 'state',
 'city',
 'zipcode',
 'order_status',
 'order_id',
 'transaction_id',
 'order_order_date',
 'customer_id',
 'product_id',
 'shipping_date',
 'mode',
 'days_shipping_scheduled',
 'days_shipping_real',
 'delivery_status',
 'risk']

## Request 1
Analyze the relationship between the 'days_shipping_real' and the 'risk.'

Investigate whether orders with longer shipping durations are more likely to be at risk of late delivery.

Additionally, break down the analysis by different product categories to see if the relationship varies across product types.

Provide insights into potential factors contributing to late deliveries based on the shipping duration and product categories.

In [59]:
shipping[['days_shipping_real','risk']]

Unnamed: 0,days_shipping_real,risk
0,1,1
1,6,1
2,2,1
3,3,0
4,5,1
...,...,...
180514,5,1
180515,5,1
180516,3,1
180517,3,0
