In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import statsmodels.api as sm
from datetime import datetime

#there are more columns than default setting
pd.set_option('display.max_columns', 500)

In [15]:
df = pd.read_csv('data/DataCoSupplyChainDataset.csv', sep=';')

df.head()

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,Customer Country,Customer Email,Customer Fname,Customer Id,Customer Lname,Customer Password,Customer Segment,Customer State,Customer Street,Customer Zipcode,Department Id,Department Name,Latitude,Longitude,Market,Order City,Order Country,Order Customer Id,order date (DateOrders),Order Id,Order Item Cardprod Id,Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Product Price,Order Item Profit Ratio,Order Item Quantity,Sales,Order Item Total,Order Profit Per Order,Order Region,Order State,Order Status,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,Caguas,Puerto Rico,XXXXXXXXX,Cally,20755,Holloway,XXXXXXXXX,Consumer,PR,5365 Noble Nectar Island,725.0,2,Fitness,18.251453,-66.037056,Pacific Asia,Bekasi,Indonesia,20755,1/31/2018 22:56,77202,1360,13.11,0.04,180517,327.75,0.29,1,327.75,314.640015,91.25,Southeast Asia,Java Occidental,COMPLETE,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,Caguas,Puerto Rico,XXXXXXXXX,Irene,19492,Luna,XXXXXXXXX,Consumer,PR,2679 Rustic Loop,725.0,2,Fitness,18.279451,-66.037064,Pacific Asia,Bikaner,India,19492,1/13/2018 12:27,75939,1360,16.389999,0.05,179254,327.75,-0.8,1,327.75,311.359985,-249.089996,South Asia,Rajastán,PENDING,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,San Jose,EE. UU.,XXXXXXXXX,Gillian,19491,Maldonado,XXXXXXXXX,Consumer,CA,8510 Round Bear Gate,95125.0,2,Fitness,37.292233,-121.881279,Pacific Asia,Bikaner,India,19491,1/13/2018 12:06,75938,1360,18.030001,0.06,179253,327.75,-0.8,1,327.75,309.720001,-247.779999,South Asia,Rajastán,CLOSED,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,Los Angeles,EE. UU.,XXXXXXXXX,Tana,19490,Tate,XXXXXXXXX,Home Office,CA,3200 Amber Bend,90027.0,2,Fitness,34.125946,-118.291016,Pacific Asia,Townsville,Australia,19490,1/13/2018 11:45,75937,1360,22.940001,0.07,179252,327.75,0.08,1,327.75,304.809998,22.860001,Oceania,Queensland,COMPLETE,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class
4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,Caguas,Puerto Rico,XXXXXXXXX,Orli,19489,Hendricks,XXXXXXXXX,Corporate,PR,8671 Iron Anchor Corners,725.0,2,Fitness,18.253769,-66.037048,Pacific Asia,Townsville,Australia,19489,1/13/2018 11:24,75936,1360,29.5,0.09,179251,327.75,0.45,1,327.75,298.25,134.210007,Oceania,Queensland,PENDING_PAYMENT,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class


Why deleting each column:  
Customer Email - no data, lots of unique, personal, if needed can be accessed by id  
Customer Fname - lots of unique, personal, if needed can be accessed by id, not nessesary  
Customer Lname - lots of unique, personal, if needed can be accessed by id, not nessesary  
Customer Password - no data, lots of unique, personal, if needed can be accessed by id  
Customer Street - lots of unique, personal, if needed can be accessed by id  
Customer Zipcode - lots of unique, personal, if needed can be accessed by id  
Order Zipcode - no data in here  
Product Description - most data missing, not nessesary    
Product Image - not nessesary, could be access by id  
Product Category Id - is the same as Category Id  
Category Name - is already encoded in Category Id (dictionary needs to be created)  
Department Name - is already encoded in Department Id (dictionary needs to be created)  
Order Customer Id - is the same as Customer Id (need to check that)  
Order Item Cardprod Id - same as Product Card Id  
Product Name - is encoded in Product Name (dictionary needs to be created)  

In [17]:
def create_dictionary(df, column1, column2):
    dict_name = {}
    for i, j in zip(df[column1].unique(), df[column2].unique()):
        dict_name[i] = j
    return dict_name

In [18]:
product_name = create_dictionary(df, 'Product Card Id', 'Product Name')
department_name = create_dictionary(df, 'Department Id', 'Department Name')
category_name = create_dictionary(df, 'Category Id', 'Category Name')

In [16]:
df = df.drop(['Customer Email', 'Customer Fname', 'Customer Lname',
              'Customer Password', 'Customer Street', 'Customer Zipcode',
              'Order Zipcode', 'Product Description', 'Product Image',
              'Product Category Id', 'Category Name', 'Department Name',
              'Order Customer Id','Order Item Cardprod Id', 'Product Name',
             'Latitude', 'Longitude', 'Delivery Status', 'Late_delivery_risk',
             'Order City', 'Sales', 'Product Status', 'Customer City',
             'Order State', 'Market'], axis=1)

In [11]:
df.head()

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Category Id,Customer City,Customer Country,Customer Id,Customer Segment,Customer State,Department Id,Market,Order Country,order date (DateOrders),Order Id,Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Product Price,Order Item Profit Ratio,Order Item Quantity,Order Item Total,Order Profit Per Order,Order Region,Order State,Order Status,Product Card Id,Product Price,shipping date (DateOrders),Shipping Mode
0,DEBIT,3,4,91.25,314.640015,73,Caguas,Puerto Rico,20755,Consumer,PR,2,Pacific Asia,Indonesia,1/31/2018 22:56,77202,13.11,0.04,180517,327.75,0.29,1,314.640015,91.25,Southeast Asia,Java Occidental,COMPLETE,1360,327.75,2/3/2018 22:56,Standard Class
1,TRANSFER,5,4,-249.089996,311.359985,73,Caguas,Puerto Rico,19492,Consumer,PR,2,Pacific Asia,India,1/13/2018 12:27,75939,16.389999,0.05,179254,327.75,-0.8,1,311.359985,-249.089996,South Asia,Rajastán,PENDING,1360,327.75,1/18/2018 12:27,Standard Class
2,CASH,4,4,-247.779999,309.720001,73,San Jose,EE. UU.,19491,Consumer,CA,2,Pacific Asia,India,1/13/2018 12:06,75938,18.030001,0.06,179253,327.75,-0.8,1,309.720001,-247.779999,South Asia,Rajastán,CLOSED,1360,327.75,1/17/2018 12:06,Standard Class
3,DEBIT,3,4,22.860001,304.809998,73,Los Angeles,EE. UU.,19490,Home Office,CA,2,Pacific Asia,Australia,1/13/2018 11:45,75937,22.940001,0.07,179252,327.75,0.08,1,304.809998,22.860001,Oceania,Queensland,COMPLETE,1360,327.75,1/16/2018 11:45,Standard Class
4,PAYMENT,2,4,134.210007,298.25,73,Caguas,Puerto Rico,19489,Corporate,PR,2,Pacific Asia,Australia,1/13/2018 11:24,75936,29.5,0.09,179251,327.75,0.45,1,298.25,134.210007,Oceania,Queensland,PENDING_PAYMENT,1360,327.75,1/15/2018 11:24,Standard Class


In [20]:
memory_usage = df.memory_usage(deep=True)

# Sumowanie zużycia pamięci
total_memory_usage = memory_usage.sum()

print(f"Rozmiar DataFrame'u: {total_memory_usage / (1024 ** 2):.2f} MB")

Rozmiar DataFrame'u: 140.72 MB


In [19]:
decimals = pd.Series([3, 3, 3, 3, 3], index=['Benefit per order', 'Sales per customer', 'Order Item Discount', 'Order Item Total',
            'Order Profit Per Order'])
df = df.round(decimals)

In [22]:
df = df.rename(columns={'shipping date (DateOrders)': 'Shipping date',
                       'order date (DateOrders)': 'Order date'})

In [124]:
df['Shipping date'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 180519 entries, 0 to 180518
Series name: Shipping date
Non-Null Count   Dtype 
--------------   ----- 
180519 non-null  object
dtypes: object(1)
memory usage: 1.4+ MB


In [125]:
df['Shipping date'] = pd.to_datetime(df['Shipping date'],errors = 'coerce', dayfirst=True)
df['Order date'] = pd.to_datetime(df['Order date'],errors = 'coerce', dayfirst=True)

In [126]:
df['Shipping time'] = df['Shipping date'].dt.strftime('%H:%M')
df['Shipping time'] = df['Shipping time'].apply(lambda x: datetime.strptime(x, '%H:%M').time())
df['Shipping date'] = pd.to_datetime(df['Shipping date'].dt.date,errors = 'coerce', dayfirst=True)
df['Shipping day'] = df['Shipping date'].dt.day_name()
df['Order time'] = df['Order date'].dt.strftime('%H:%M')
df['Order time'] = df['Order time'].apply(lambda x: datetime.strptime(x, '%H:%M').time())
df['Order date'] = pd.to_datetime(df['Order date'].dt.date,errors = 'coerce', dayfirst=True)
df['Order day'] = df['Order date'].dt.day_name()

In [127]:
df['Target shipping days'] = df['Days for shipping (real)'] - df['Days for shipment (scheduled)']
df.head()

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Customer City,Customer Country,Customer Id,Customer Segment,Customer State,Department Id,Latitude,Longitude,Market,Order City,Order Country,Order date,Order Id,Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Product Price,Order Item Profit Ratio,Order Item Quantity,Sales,Order Item Total,Order Profit Per Order,Order Region,Order State,Order Status,Product Card Id,Product Price,Product Status,Shipping date,Shipping Mode,Shipping time,Shipping day,Order time,Order day,Target shipping days
0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Caguas,Puerto Rico,20755,Consumer,PR,2,18.251453,-66.037056,Pacific Asia,Bekasi,Indonesia,2018-01-31,77202,13.11,0.04,180517,327.75,0.29,1,327.75,314.640015,91.25,Southeast Asia,Java Occidental,COMPLETE,1360,327.75,0,2018-03-02,Standard Class,22:56:00,Friday,22:56:00,Wednesday,-1
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Caguas,Puerto Rico,19492,Consumer,PR,2,18.279451,-66.037064,Pacific Asia,Bikaner,India,2018-01-13,75939,16.389999,0.05,179254,327.75,-0.8,1,327.75,311.359985,-249.089996,South Asia,Rajastán,PENDING,1360,327.75,0,2018-01-18,Standard Class,12:27:00,Thursday,12:27:00,Saturday,1
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,San Jose,EE. UU.,19491,Consumer,CA,2,37.292233,-121.881279,Pacific Asia,Bikaner,India,2018-01-13,75938,18.030001,0.06,179253,327.75,-0.8,1,327.75,309.720001,-247.779999,South Asia,Rajastán,CLOSED,1360,327.75,0,2018-01-17,Standard Class,12:06:00,Wednesday,12:06:00,Saturday,0
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Los Angeles,EE. UU.,19490,Home Office,CA,2,34.125946,-118.291016,Pacific Asia,Townsville,Australia,2018-01-13,75937,22.940001,0.07,179252,327.75,0.08,1,327.75,304.809998,22.860001,Oceania,Queensland,COMPLETE,1360,327.75,0,2018-01-16,Standard Class,11:45:00,Tuesday,11:45:00,Saturday,-1
4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Caguas,Puerto Rico,19489,Corporate,PR,2,18.253769,-66.037048,Pacific Asia,Townsville,Australia,2018-01-13,75936,29.5,0.09,179251,327.75,0.45,1,327.75,298.25,134.210007,Oceania,Queensland,PENDING_PAYMENT,1360,327.75,0,2018-01-15,Standard Class,11:24:00,Monday,11:24:00,Saturday,-2


In [128]:
df['Delivery Status'].unique()

array(['Advance shipping', 'Late delivery', 'Shipping on time',
       'Shipping canceled'], dtype=object)

In [24]:
df[df['Sales'] != df['Product Price']]

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Customer Country,Customer Id,Customer Segment,Customer State,Department Id,Latitude,Longitude,Market,Order Country,Order date,Order Id,Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Profit Ratio,Order Item Quantity,Sales,Order Item Total,Order Region,Order Status,Product Card Id,Product Price,Product Status,Shipping date,Shipping Mode
48,PAYMENT,5,2,-30.750000,115.180000,Late delivery,1,17,Puerto Rico,9083,Home Office,PR,4,18.380119,-66.183128,Pacific Asia,India,2/24/2016 13:57,28744,4.800000,0.04,71956,-0.27,2,119.980003,115.180000,South Asia,PENDING_PAYMENT,365,59.990002,0,2/29/2016 13:57,Second Class
49,PAYMENT,2,2,-122.730003,79.180000,Shipping on time,0,29,Puerto Rico,4741,Home Office,PR,5,18.235573,-66.370613,Pacific Asia,Turquía,10/25/2016 14:39,45461,0.800000,0.01,113598,-1.55,2,79.980003,79.180000,West Asia,PENDING_PAYMENT,627,39.990002,0,10/27/2016 14:39,Second Class
50,PAYMENT,6,2,33.599998,96.000000,Late delivery,1,24,Puerto Rico,639,Home Office,PR,5,18.025368,-66.613037,Pacific Asia,Australia,3/30/2016 4:37,31115,4.000000,0.04,77757,0.35,2,100.000000,96.000000,Oceania,PENDING_PAYMENT,502,50.000000,0,4/5/2016 4:37,Second Class
51,PAYMENT,2,2,24.690001,75.980003,Shipping on time,0,29,Puerto Rico,9702,Home Office,PR,5,18.273838,-66.370636,Pacific Asia,Turquía,10/30/2016 1:31,45766,4.000000,0.05,114401,0.33,2,79.980003,75.980003,West Asia,PENDING_PAYMENT,627,39.990002,0,11/1/2016 1:31,Second Class
52,PAYMENT,3,2,9.100000,91.000000,Late delivery,1,24,Puerto Rico,9114,Home Office,PR,5,18.284805,-66.370590,Pacific Asia,Mongolia,11/28/2016 1:18,47752,9.000000,0.09,119405,0.10,2,100.000000,91.000000,Eastern Asia,PENDING_PAYMENT,502,50.000000,0,12/1/2016 1:18,Second Class
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179626,TRANSFER,3,4,92.110001,248.960007,Advance shipping,0,17,Puerto Rico,10240,Consumer,PR,4,18.281187,-66.370613,LATAM,Honduras,5/6/2017 14:34,58682,50.990002,0.17,146878,0.37,5,299.950012,248.960007,Central America,PENDING,365,59.990002,0,5/9/2017 14:34,Standard Class
179627,TRANSFER,4,4,76.989998,245.960007,Shipping on time,0,17,Puerto Rico,3,Consumer,PR,4,18.025375,-66.615082,LATAM,Honduras,3/31/2017 1:18,56178,53.990002,0.18,140507,0.31,5,299.950012,245.960007,Central America,PENDING,365,59.990002,0,4/4/2017 1:18,Standard Class
179628,TRANSFER,3,4,67.489998,224.960007,Advance shipping,0,17,Puerto Rico,7024,Consumer,PR,4,18.229071,-66.370583,LATAM,México,5/19/2017 3:51,59542,74.989998,0.25,149029,0.30,5,299.950012,224.960007,Central America,PENDING,365,59.990002,0,5/22/2017 3:51,Standard Class
179629,TRANSFER,4,4,37.590000,199.949997,Shipping on time,0,29,Puerto Rico,482,Consumer,PR,5,18.227577,-66.045624,LATAM,República Dominicana,4/4/2017 5:09,56463,0.000000,0.00,141241,0.19,5,199.949997,199.949997,Caribbean,PENDING,627,39.990002,0,4/8/2017 5:09,Standard Class


In [138]:
store_locations_df = pd.DataFrame({'Latitude': df['Latitude'],
                                   'Longitude': df['Longitude']})

In [140]:
store_locations_df = store_locations_df.drop_duplicates()

In [27]:
df.loc[df['Product Status'] == 1]

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Customer Country,Customer Id,Customer Segment,Customer State,Department Id,Latitude,Longitude,Market,Order Country,Order date,Order Id,Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Profit Ratio,Order Item Quantity,Sales,Order Item Total,Order Region,Order Status,Product Card Id,Product Price,Product Status,Shipping date,Shipping Mode


In [21]:
df.to_csv('data/supply_chain_cleaned.csv')