# OptimizingDelivery Analysis

## Import libraries

In [1]:
import os 
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


## Import Data

In [2]:
%store -r optimizingdelivery_data_dir

In [3]:
# Define dataframes and dataframe names
dataframe_names = ['customers', 'dates_2022', 'products', 'target_orders', 'order_lines', 'orders_aggregate']

In [4]:
# Reset dataframes
def reset_dataframes():

    # Read parquet and overwrite local instance, effectively resetting the dataframes
    for name in dataframe_names:

        file_path = os.path.join(optimizingdelivery_data_dir, f"{name}.parquet")

        globals()[name] = pd.read_parquet(file_path)

In [5]:
reset_dataframes()

In [6]:
print(customers.shape)
print(dates_2022.shape)
print(products.shape)
print(target_orders.shape)
print(order_lines.shape)
print(orders_aggregate.shape)

(35, 3)
(365, 4)
(18, 3)
(35, 4)
(57096, 11)
(31729, 6)


## Analysis

In [7]:
order_lines.columns

Index(['order_id', 'order_placement_date', 'customer_id', 'product_id',
       'order_qty', 'agreed_delivery_date', 'actual_delivery_date',
       'delivery_qty', 'in_full', 'on_time', 'on_time_in_full'],
      dtype='object')

In [8]:
print('order_lines')
print('ot_perc: ' + str((order_lines['on_time'].sum() / len(order_lines)) * 100))
print('if_perc: ' + str((order_lines['in_full'].sum() / len(order_lines)) * 100))
print('otif_perc: ' + str((order_lines['on_time_in_full'].sum() / len(order_lines)) * 100))


order_lines
ot_perc: 71.11706599411517
if_perc: 65.96083788706738
otif_perc: 47.95432254448648


In [9]:
print('orders_aggregate')
print('ot_perc: ' + str((orders_aggregate['on_time'].sum() / len(orders_aggregate)) * 100))
print('if_perc: ' + str((orders_aggregate['in_full'].sum() / len(orders_aggregate)) * 100))
print('otif_perc: ' + str((orders_aggregate['otif'].sum() / len(orders_aggregate)) * 100))


orders_aggregate
ot_perc: 59.03117022282455
if_perc: 52.78136720350468
otif_perc: 29.020769642913425


In [10]:
order_lines.sample(5)

Unnamed: 0,order_id,order_placement_date,customer_id,product_id,order_qty,agreed_delivery_date,actual_delivery_date,delivery_qty,in_full,on_time,on_time_in_full
24311,FMY519721501,2022-05-17,789721,25891402,478,2022-05-19,2022-05-19,430,0,1,0
47277,FAUG81202502,2022-07-29,789202,25891402,460,2022-08-01,2022-08-01,460,1,1,1
1148,FMR36220601,2022-03-04,789220,25891302,84,2022-03-06,2022-03-06,67,0,1,0
32181,FJUN612703603,2022-06-11,789703,25891203,241,2022-06-12,2022-06-12,241,1,1,1
46181,FJUL728320501,2022-07-26,789320,25891102,447,2022-07-28,2022-07-28,447,1,1,1


In [11]:
# Calculate the mean percentages
customer_split = order_lines.groupby('customer_id')[['in_full', 'on_time', 'on_time_in_full']].mean() * 100

# Reset index to turn 'customer_id' into a regular column
customer_split = customer_split.reset_index().rename(columns={'in_full': 'if_pct',
                                                             'on_time': 'ot_pct', 
                                                             'on_time_in_full': 'otif_pct'})

customer_split = customer_split.round(2)

In [12]:
customer_split = pd.merge(customer_split, target_orders, how='left', on='customer_id')

In [13]:
customer_split.columns

Index(['customer_id', 'if_pct', 'ot_pct', 'otif_pct', 'ontime_target%',
       'infull_target%', 'otif_target%'],
      dtype='object')

In [14]:
customer_split.rename(columns={ 'infull_target%': 'if_target_pct',
                                'otif_target%': 'of_target_pct', 
                                'ontime_target%': 'otif_target_pct'}, inplace=True)

In [15]:
customer_split.columns

Index(['customer_id', 'if_pct', 'ot_pct', 'otif_pct', 'otif_target_pct',
       'if_target_pct', 'of_target_pct'],
      dtype='object')

In [16]:
customer_split = customer_split[['customer_id', 'if_pct', 'if_target_pct', 
                                 'ot_pct', 'of_target_pct', 
                                 'otif_pct', 'otif_target_pct']]

## Machine Learning Model

##### Create Predictive Models:
  
Building predictive models to determine whether an order line meets targets like 'in_full', 'on_time', or 'otif' is a fundamental step. You can use various machine learning algorithms like logistic regression, random forests, or gradient boosting, depending on the complexity of your data and the desired accuracy of predictions.
Ensure you have labeled data where each order line is marked as 'true' or 'false' for meeting the targets. Also, pay attention to feature engineering to extract relevant information from your dataset.
  
##### Identify Associated Variables:
  
Analyzing the importance of variables associated with failure for 'in_full', 'on_time', or 'otif' is crucial for understanding the root causes of delivery failures. You can use techniques like feature importance analysis from tree-based models or permutation importance to identify the most influential features.
Understanding these variables can provide insights into areas of improvement in your production process. For example, if a specific product category or supplier consistently results in delivery failures, you can focus on improving those aspects.
Predict Targets and Evaluate Model Performance:
  
After training your predictive models, assess their performance using appropriate evaluation metrics such as accuracy, precision, recall, F1-score, or area under the ROC curve (AUC). Ensure you use proper validation techniques like cross-validation to estimate the model's generalization performance.
It's essential to have reliable models that accurately predict whether an order line meets the specified targets. These models will serve as the basis for making informed decisions and implementing improvements in your production process.
  
##### Extract Insights for Improvement:
  
Once you have reliable predictive models, leverage the insights gained from them to suggest improvements in your production process. For example, if certain factors consistently lead to delivery failures, you can take corrective actions such as optimizing inventory management, improving supplier relationships, enhancing transportation logistics, or refining order processing workflows.
Continuously monitor the performance of your production process and iteratively refine your strategies based on the insights provided by the predictive models.

### Prepare model data

#### Merge dataframes

In [17]:
model_data = order_lines.copy()

In [18]:
# Add as much quantifiable data into the data model.

In [19]:
# Merge date

In [20]:
model_data = pd.merge(model_data, dates_2022, how='left', left_on='order_placement_date', right_on='date')

model_data['order_placement_month'] = model_data['month'].astype('int64')
model_data['order_placement_day'] = model_data['order_placement_date'].dt.day.astype('int64')
model_data['order_placement_week_no'] = model_data['week_number'].astype('int64')

model_data = model_data[['order_id', 'order_placement_date', 'order_placement_month', 'order_placement_day', 'order_placement_week_no', 
                        'customer_id', 'product_id','order_qty', 'agreed_delivery_date', 'actual_delivery_date',
                        'delivery_qty', 'in_full', 'on_time', 'on_time_in_full']].copy()

In [21]:
model_data = pd.merge(model_data, dates_2022, how='left', left_on='agreed_delivery_date', right_on='date')

model_data['agreed_delivery_month'] = model_data['month'].astype('int64')
model_data['agreed_delivery_day'] = model_data['agreed_delivery_date'].dt.day.astype('int64')
model_data['agreed_delivery_week_no'] = model_data['week_number'].astype('int64')

model_data = model_data[['order_id', 'order_placement_date', 'order_placement_month', 'order_placement_day', 'order_placement_week_no', 
                        'customer_id', 'product_id','order_qty', 
                        'agreed_delivery_date', 'agreed_delivery_month', 'agreed_delivery_day', 'agreed_delivery_week_no', 
                        'actual_delivery_date',
                        'delivery_qty', 'in_full', 'on_time', 'on_time_in_full',]].copy()

In [22]:
model_data = pd.merge(model_data, dates_2022, how='left', left_on='actual_delivery_date', right_on='date')

model_data['actual_delivery_month'] = model_data['month'].astype('int64')
model_data['actual_delivery_day'] = model_data['actual_delivery_date'].dt.day.astype('int64')
model_data['actual_delivery_week_no'] = model_data['week_number'].astype('int64')

model_data = model_data[['order_id', 'order_placement_date', 'order_placement_month', 'order_placement_day', 'order_placement_week_no', 
                        'customer_id', 'product_id','order_qty', 
                        'agreed_delivery_date', 'agreed_delivery_month', 'agreed_delivery_day', 'agreed_delivery_week_no', 
                        'actual_delivery_date', 'actual_delivery_month', 'actual_delivery_day', 'actual_delivery_week_no',
                        'delivery_qty', 'in_full', 'on_time', 'on_time_in_full',]].copy()

In [23]:
# Merge custopmer

In [24]:
model_data = pd.merge(model_data, customers, how='left', on='customer_id')

In [25]:
# Merge products

In [26]:
model_data = pd.merge(model_data, products, how='left', on='product_id')

#### Refine model data datatypes

In [27]:
model_data.dtypes

order_id                           object
order_placement_date       datetime64[ns]
order_placement_month               int64
order_placement_day                 int64
order_placement_week_no             int64
customer_id                         int64
product_id                          int64
order_qty                           int64
agreed_delivery_date       datetime64[ns]
agreed_delivery_month               int64
agreed_delivery_day                 int64
agreed_delivery_week_no             int64
actual_delivery_date       datetime64[ns]
actual_delivery_month               int64
actual_delivery_day                 int64
actual_delivery_week_no             int64
delivery_qty                        int64
in_full                             int64
on_time                             int64
on_time_in_full                     int64
customer_name                      object
city                               object
product_name                       object
category                          

In [28]:
model_data['order_code'] = model_data['order_id'].astype('category').cat.codes
model_data['customer_name_code'] = model_data['customer_name'].astype('category').cat.codes
model_data['city_code'] = model_data['city'].astype('category').cat.codes
model_data['product_name_code'] = model_data['product_name'].astype('category').cat.codes
model_data['category_code'] = model_data['category'].astype('category').cat.codes

In [29]:
model_data = model_data[[   'order_id',
                            'order_code',
                            'order_placement_date',
                            'order_placement_month',
                            'order_placement_day',
                            'order_placement_week_no',
                            'customer_id',
                            'customer_name',
                            'customer_name_code',
                            'city',
                            'city_code',
                            'product_id',
                            'product_name',
                            'product_name_code',
                            'category',
                            'category_code',
                            'order_qty',
                            'agreed_delivery_date',
                            'agreed_delivery_month',
                            'agreed_delivery_day',
                            'agreed_delivery_week_no',
                            'actual_delivery_date',
                            'actual_delivery_month',
                            'actual_delivery_day',
                            'actual_delivery_week_no',
                            'delivery_qty',
                            'in_full',
                            'on_time',
                            'on_time_in_full']]

In [30]:
model_data.shape

(57096, 29)

### Multi-Output Random Forest Classifier

In [31]:
list(model_data.columns)

['order_id',
 'order_code',
 'order_placement_date',
 'order_placement_month',
 'order_placement_day',
 'order_placement_week_no',
 'customer_id',
 'customer_name',
 'customer_name_code',
 'city',
 'city_code',
 'product_id',
 'product_name',
 'product_name_code',
 'category',
 'category_code',
 'order_qty',
 'agreed_delivery_date',
 'agreed_delivery_month',
 'agreed_delivery_day',
 'agreed_delivery_week_no',
 'actual_delivery_date',
 'actual_delivery_month',
 'actual_delivery_day',
 'actual_delivery_week_no',
 'delivery_qty',
 'in_full',
 'on_time',
 'on_time_in_full']

In [32]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [33]:
from sklearn.metrics import accuracy_score, hamming_loss, precision_score, recall_score, f1_score
from sklearn.metrics import jaccard_score, zero_one_loss, coverage_error, average_precision_score

# Assuming y_pred and y_test are defined after making predictions
# y_pred = multi_output_classifier.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)
print("Hamming Loss:", hamming_loss_value)

# Precision
precision = precision_score(y_test, y_pred, average='micro')  # You can change the average parameter as needed
print("Precision:", precision)

# Recall
recall = recall_score(y_test, y_pred, average='micro')  # You can change the average parameter as needed
print("Recall:", recall)

# F1-score
f1 = f1_score(y_test, y_pred, average='micro')  # You can change the average parameter as needed
print("F1-score:", f1)

# Jaccard Similarity Score
jaccard = jaccard_score(y_test, y_pred, average='micro')  # You can change the average parameter as needed
print("Jaccard Similarity Score:", jaccard)

# Zero-One Loss
zero_one_loss_value = zero_one_loss(y_test, y_pred)
print("Zero-One Loss:", zero_one_loss_value)

# Coverage Error
coverage_error_value = coverage_error(y_test, y_pred)
print("Coverage Error:", coverage_error_value)

# Average Precision Score
average_precision = average_precision_score(y_test, y_pred, average='micro')  # You can change the average parameter as needed
print("Average Precision Score:", average_precision)


Accuracy: 0.6199649737302977
Hamming Loss: 0.20722416812609457
Precision: 0.8106465174411999
Recall: 0.9106516050800945
F1-score: 0.857743981244928
Jaccard Similarity Score: 0.7509209556888748
Zero-One Loss: 0.3800350262697023
Coverage Error: 1.615586690017513
Average Precision Score: 0.7995125242394017
