In [116]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import statsmodels.api as sm
from datetime import datetime

#there are more columns than default setting
pd.set_option('display.max_columns', 500)

In [117]:
# let's load in the data
df = pd.read_csv('data/DataCoSupplyChainDataset.csv', sep=';')
df.head()

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,Customer Country,Customer Email,Customer Fname,Customer Id,Customer Lname,Customer Password,Customer Segment,Customer State,Customer Street,Customer Zipcode,Department Id,Department Name,Latitude,Longitude,Market,Order City,Order Country,Order Customer Id,order date (DateOrders),Order Id,Order Item Cardprod Id,Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Product Price,Order Item Profit Ratio,Order Item Quantity,Sales,Order Item Total,Order Profit Per Order,Order Region,Order State,Order Status,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,Caguas,Puerto Rico,XXXXXXXXX,Cally,20755,Holloway,XXXXXXXXX,Consumer,PR,5365 Noble Nectar Island,725.0,2,Fitness,18.251453,-66.037056,Pacific Asia,Bekasi,Indonesia,20755,1/31/2018 22:56,77202,1360,13.11,0.04,180517,327.75,0.29,1,327.75,314.640015,91.25,Southeast Asia,Java Occidental,COMPLETE,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,Caguas,Puerto Rico,XXXXXXXXX,Irene,19492,Luna,XXXXXXXXX,Consumer,PR,2679 Rustic Loop,725.0,2,Fitness,18.279451,-66.037064,Pacific Asia,Bikaner,India,19492,1/13/2018 12:27,75939,1360,16.389999,0.05,179254,327.75,-0.8,1,327.75,311.359985,-249.089996,South Asia,Rajastán,PENDING,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,San Jose,EE. UU.,XXXXXXXXX,Gillian,19491,Maldonado,XXXXXXXXX,Consumer,CA,8510 Round Bear Gate,95125.0,2,Fitness,37.292233,-121.881279,Pacific Asia,Bikaner,India,19491,1/13/2018 12:06,75938,1360,18.030001,0.06,179253,327.75,-0.8,1,327.75,309.720001,-247.779999,South Asia,Rajastán,CLOSED,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,Los Angeles,EE. UU.,XXXXXXXXX,Tana,19490,Tate,XXXXXXXXX,Home Office,CA,3200 Amber Bend,90027.0,2,Fitness,34.125946,-118.291016,Pacific Asia,Townsville,Australia,19490,1/13/2018 11:45,75937,1360,22.940001,0.07,179252,327.75,0.08,1,327.75,304.809998,22.860001,Oceania,Queensland,COMPLETE,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class
4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,Caguas,Puerto Rico,XXXXXXXXX,Orli,19489,Hendricks,XXXXXXXXX,Corporate,PR,8671 Iron Anchor Corners,725.0,2,Fitness,18.253769,-66.037048,Pacific Asia,Townsville,Australia,19489,1/13/2018 11:24,75936,1360,29.5,0.09,179251,327.75,0.45,1,327.75,298.25,134.210007,Oceania,Queensland,PENDING_PAYMENT,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class


Why deleting each column:  
Customer Email - no data, lots of unique, personal, if needed can be accessed by id  
Customer Fname - lots of unique, personal, if needed can be accessed by id, not nessesary  
Customer Lname - lots of unique, personal, if needed can be accessed by id, not nessesary  
Customer Password - no data, lots of unique, personal, if needed can be accessed by id  
Customer Street - lots of unique, personal, if needed can be accessed by id  
Customer Zipcode - lots of unique, personal, if needed can be accessed by id  
Order Zipcode - no data in here  
Product Description - most data missing, not nessesary    
Product Image - not nessesary, could be access by id  
Product Category Id - is the same as Category Id  
Category Name - is already encoded in Category Id (dictionary needs to be created)  
Department Name - is already encoded in Department Id (dictionary needs to be created)  
Order Customer Id - is the same as Customer Id (need to check that)  
Order Item Cardprod Id - same as Product Card Id  
Product Name - is encoded in Product Name (dictionary needs to be created)  

In [118]:
def create_dictionary(df, column1, column2):
    dict_name = {}
    for i, j in zip(df[column1].unique(), df[column2].unique()):
        dict_name[i] = j
    return dict_name

In [119]:
product_name = create_dictionary(df, 'Product Card Id', 'Product Name')
department_name = create_dictionary(df, 'Department Id', 'Department Name')
category_name = create_dictionary(df, 'Category Id', 'Category Name')

In [120]:
df = df.drop(['Customer Email', 'Customer Fname', 'Customer Lname',
              'Customer Password', 'Customer Street', 'Customer Zipcode',
              'Order Zipcode', 'Product Description', 'Product Image',
              'Product Category Id', 'Category Name', 'Department Name',
              'Order Customer Id','Order Item Cardprod Id', 'Product Name',
             'Order Profit Per Order', 'Customer City', 'Order Item Product Price',
             'Order State', 'Order City', ], axis=1)

In [121]:
df.shape

(180519, 38)

In [122]:
# basic descriptions for all data
df.describe(include='all')

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Customer City,Customer Country,Customer Id,Customer Segment,Customer State,Department Id,Latitude,Longitude,Market,Order City,Order Country,order date (DateOrders),Order Id,Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Product Price,Order Item Profit Ratio,Order Item Quantity,Sales,Order Item Total,Order Profit Per Order,Order Region,Order State,Order Status,Product Card Id,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
count,180519,180519.0,180519.0,180519.0,180519.0,180519,180519.0,180519.0,180519,180519,180519.0,180519,180519,180519.0,180519.0,180519.0,180519,180519,180519,180519,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0,180519,180519,180519,180519.0,180519.0,180519.0,180519,180519
unique,4,,,,,4,,,563,2,,3,46,,,,5,3597,164,65752,,,,,,,,,,,23,1089,9,,,,63701,4
top,DEBIT,,,,,Late delivery,,,Caguas,EE. UU.,,Consumer,PR,,,,LATAM,Santo Domingo,Estados Unidos,12/14/2016 12:29,,,,,,,,,,,Central America,Inglaterra,COMPLETE,,,,1/5/2016 5:58,Standard Class
freq,69295,,,,,98977,,,66770,111146,,93504,69373,,,,51594,2211,24840,5,,,,,,,,,,,28341,6722,59491,,,,10,107752
mean,,3.497654,2.931847,21.974989,183.107609,,0.548291,31.851451,,,6691.379495,,,5.44346,29.719955,-84.915675,,,,,36221.894903,20.664741,0.101668,90260.0,141.23255,0.120647,2.127638,203.772096,183.107609,21.974989,,,,692.509764,141.23255,0.0,,
std,,1.623722,1.374449,104.433526,120.04367,,0.497664,15.640064,,,4162.918106,,,1.629246,9.813646,21.433241,,,,,21045.379569,21.800901,0.070415,52111.490959,139.732492,0.466796,1.453451,132.273077,120.04367,104.433526,,,,336.446807,139.732492,0.0,,
min,,0.0,0.0,-4274.97998,7.49,,0.0,2.0,,,1.0,,,2.0,-33.937553,-158.025986,,,,,1.0,0.0,0.0,1.0,9.99,-2.75,1.0,9.99,7.49,-4274.97998,,,,19.0,9.99,0.0,,
25%,,2.0,2.0,7.0,104.379997,,0.0,18.0,,,3258.5,,,4.0,18.265432,-98.446312,,,,,18057.0,5.4,0.04,45130.5,50.0,0.08,1.0,119.980003,104.379997,7.0,,,,403.0,50.0,0.0,,
50%,,3.0,4.0,31.52,163.990005,,1.0,29.0,,,6457.0,,,5.0,33.144863,-76.847908,,,,,36140.0,14.0,0.1,90260.0,59.990002,0.27,1.0,199.919998,163.990005,31.52,,,,627.0,59.990002,0.0,,
75%,,5.0,4.0,64.800003,247.399994,,1.0,45.0,,,9779.0,,,7.0,39.279617,-66.370583,,,,,54144.0,29.99,0.16,135389.5,199.990005,0.36,3.0,299.950012,247.399994,64.800003,,,,1004.0,199.990005,0.0,,


In [123]:
df = df.rename(columns={'shipping date (DateOrders)': 'Shipping date',
                       'order date (DateOrders)': 'Order date'})

In [124]:
df['Shipping date'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 180519 entries, 0 to 180518
Series name: Shipping date
Non-Null Count   Dtype 
--------------   ----- 
180519 non-null  object
dtypes: object(1)
memory usage: 1.4+ MB


In [125]:
df['Shipping date'] = pd.to_datetime(df['Shipping date'],errors = 'coerce', dayfirst=True)
df['Order date'] = pd.to_datetime(df['Order date'],errors = 'coerce', dayfirst=True)

In [126]:
df['Shipping time'] = df['Shipping date'].dt.strftime('%H:%M')
df['Shipping time'] = df['Shipping time'].apply(lambda x: datetime.strptime(x, '%H:%M').time())
df['Shipping date'] = pd.to_datetime(df['Shipping date'].dt.date,errors = 'coerce', dayfirst=True)
df['Shipping day'] = df['Shipping date'].dt.day_name()
df['Order time'] = df['Order date'].dt.strftime('%H:%M')
df['Order time'] = df['Order time'].apply(lambda x: datetime.strptime(x, '%H:%M').time())
df['Order date'] = pd.to_datetime(df['Order date'].dt.date,errors = 'coerce', dayfirst=True)
df['Order day'] = df['Order date'].dt.day_name()

In [127]:
df['Target shipping days'] = df['Days for shipping (real)'] - df['Days for shipment (scheduled)']
df.head()

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Customer City,Customer Country,Customer Id,Customer Segment,Customer State,Department Id,Latitude,Longitude,Market,Order City,Order Country,Order date,Order Id,Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Product Price,Order Item Profit Ratio,Order Item Quantity,Sales,Order Item Total,Order Profit Per Order,Order Region,Order State,Order Status,Product Card Id,Product Price,Product Status,Shipping date,Shipping Mode,Shipping time,Shipping day,Order time,Order day,Target shipping days
0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Caguas,Puerto Rico,20755,Consumer,PR,2,18.251453,-66.037056,Pacific Asia,Bekasi,Indonesia,2018-01-31,77202,13.11,0.04,180517,327.75,0.29,1,327.75,314.640015,91.25,Southeast Asia,Java Occidental,COMPLETE,1360,327.75,0,2018-03-02,Standard Class,22:56:00,Friday,22:56:00,Wednesday,-1
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Caguas,Puerto Rico,19492,Consumer,PR,2,18.279451,-66.037064,Pacific Asia,Bikaner,India,2018-01-13,75939,16.389999,0.05,179254,327.75,-0.8,1,327.75,311.359985,-249.089996,South Asia,Rajastán,PENDING,1360,327.75,0,2018-01-18,Standard Class,12:27:00,Thursday,12:27:00,Saturday,1
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,San Jose,EE. UU.,19491,Consumer,CA,2,37.292233,-121.881279,Pacific Asia,Bikaner,India,2018-01-13,75938,18.030001,0.06,179253,327.75,-0.8,1,327.75,309.720001,-247.779999,South Asia,Rajastán,CLOSED,1360,327.75,0,2018-01-17,Standard Class,12:06:00,Wednesday,12:06:00,Saturday,0
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Los Angeles,EE. UU.,19490,Home Office,CA,2,34.125946,-118.291016,Pacific Asia,Townsville,Australia,2018-01-13,75937,22.940001,0.07,179252,327.75,0.08,1,327.75,304.809998,22.860001,Oceania,Queensland,COMPLETE,1360,327.75,0,2018-01-16,Standard Class,11:45:00,Tuesday,11:45:00,Saturday,-1
4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Caguas,Puerto Rico,19489,Corporate,PR,2,18.253769,-66.037048,Pacific Asia,Townsville,Australia,2018-01-13,75936,29.5,0.09,179251,327.75,0.45,1,327.75,298.25,134.210007,Oceania,Queensland,PENDING_PAYMENT,1360,327.75,0,2018-01-15,Standard Class,11:24:00,Monday,11:24:00,Saturday,-2


In [128]:
df['Delivery Status'].unique()

array(['Advance shipping', 'Late delivery', 'Shipping on time',
       'Shipping canceled'], dtype=object)

In [135]:
df[df['Market'] != df['Order Region']]

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Customer City,Customer Country,Customer Id,Customer Segment,Customer State,Department Id,Latitude,Longitude,Market,Order City,Order Country,Order date,Order Id,Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Product Price,Order Item Profit Ratio,Order Item Quantity,Sales,Order Item Total,Order Profit Per Order,Order Region,Order State,Order Status,Product Card Id,Product Price,Product Status,Shipping date,Shipping Mode,Shipping time,Shipping day,Order time,Order day,Target shipping days
0,DEBIT,3,4,91.250000,314.640015,Advance shipping,0,73,Caguas,Puerto Rico,20755,Consumer,PR,2,18.251453,-66.037056,Pacific Asia,Bekasi,Indonesia,2018-01-31,77202,13.110000,0.04,180517,327.750000,0.29,1,327.750000,314.640015,91.250000,Southeast Asia,Java Occidental,COMPLETE,1360,327.750000,0,2018-03-02,Standard Class,22:56:00,Friday,22:56:00,Wednesday,-1
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Caguas,Puerto Rico,19492,Consumer,PR,2,18.279451,-66.037064,Pacific Asia,Bikaner,India,2018-01-13,75939,16.389999,0.05,179254,327.750000,-0.80,1,327.750000,311.359985,-249.089996,South Asia,Rajastán,PENDING,1360,327.750000,0,2018-01-18,Standard Class,12:27:00,Thursday,12:27:00,Saturday,1
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,San Jose,EE. UU.,19491,Consumer,CA,2,37.292233,-121.881279,Pacific Asia,Bikaner,India,2018-01-13,75938,18.030001,0.06,179253,327.750000,-0.80,1,327.750000,309.720001,-247.779999,South Asia,Rajastán,CLOSED,1360,327.750000,0,2018-01-17,Standard Class,12:06:00,Wednesday,12:06:00,Saturday,0
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Los Angeles,EE. UU.,19490,Home Office,CA,2,34.125946,-118.291016,Pacific Asia,Townsville,Australia,2018-01-13,75937,22.940001,0.07,179252,327.750000,0.08,1,327.750000,304.809998,22.860001,Oceania,Queensland,COMPLETE,1360,327.750000,0,2018-01-16,Standard Class,11:45:00,Tuesday,11:45:00,Saturday,-1
4,PAYMENT,2,4,134.210007,298.250000,Advance shipping,0,73,Caguas,Puerto Rico,19489,Corporate,PR,2,18.253769,-66.037048,Pacific Asia,Townsville,Australia,2018-01-13,75936,29.500000,0.09,179251,327.750000,0.45,1,327.750000,298.250000,134.210007,Oceania,Queensland,PENDING_PAYMENT,1360,327.750000,0,2018-01-15,Standard Class,11:24:00,Monday,11:24:00,Saturday,-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180514,CASH,4,4,40.000000,399.980011,Shipping on time,0,45,Brooklyn,EE. UU.,1005,Home Office,NY,7,40.640930,-73.942711,Pacific Asia,Shanghái,China,2016-01-16,26043,0.000000,0.00,65177,399.980011,0.10,1,399.980011,399.980011,40.000000,Eastern Asia,Shanghái,CLOSED,1004,399.980011,0,2016-01-20,Standard Class,03:40:00,Wednesday,03:40:00,Saturday,0
180515,DEBIT,3,2,-613.770019,395.980011,Late delivery,1,45,Bakersfield,EE. UU.,9141,Corporate,CA,7,35.362545,-119.018700,Pacific Asia,Hirakata,Japón,2016-01-16,26037,4.000000,0.01,65161,399.980011,-1.55,1,399.980011,395.980011,-613.770019,Eastern Asia,Osaka,COMPLETE,1004,399.980011,0,2016-01-19,Second Class,01:34:00,Tuesday,01:34:00,Saturday,1
180516,TRANSFER,5,4,141.110001,391.980011,Late delivery,1,45,Bristol,EE. UU.,291,Corporate,CT,7,41.629959,-72.967155,Pacific Asia,Adelaide,Australia,2016-01-15,26024,8.000000,0.02,65129,399.980011,0.36,1,399.980011,391.980011,141.110001,Oceania,Australia del Sur,PENDING,1004,399.980011,0,2016-01-20,Standard Class,21:00:00,Wednesday,21:00:00,Friday,1
180517,PAYMENT,3,4,186.229996,387.980011,Advance shipping,0,45,Caguas,Puerto Rico,2813,Consumer,PR,7,18.213350,-66.370575,Pacific Asia,Adelaide,Australia,2016-01-15,26022,12.000000,0.03,65126,399.980011,0.48,1,399.980011,387.980011,186.229996,Oceania,Australia del Sur,PENDING_PAYMENT,1004,399.980011,0,2016-01-18,Standard Class,20:18:00,Monday,20:18:00,Friday,-1


In [138]:
store_locations_df = pd.DataFrame({'Latitude': df['Latitude'],
                                   'Longitude': df['Longitude']})

In [140]:
store_locations_df = store_locations_df.drop_duplicates()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11835 entries, 0 to 179271
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Latitude   11835 non-null  float64
 1   Longitude  11835 non-null  float64
dtypes: float64(2)
memory usage: 277.4 KB
