# OptimizingDelivery Analysis

## Import libraries

In [1]:
import os 
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


## Import Data

In [2]:
%store -r optimizingdelivery_data_dir

In [3]:
# Define dataframes and dataframe names
dataframe_names = ['customers', 'dates_2022', 'products', 'target_orders', 'order_lines', 'orders_aggregate']

In [4]:
# Reset dataframes
def reset_dataframes():

    # Read parquet and overwrite local instance, effectively resetting the dataframes
    for name in dataframe_names:

        file_path = os.path.join(optimizingdelivery_data_dir, f"{name}.parquet")

        globals()[name] = pd.read_parquet(file_path)

In [5]:
reset_dataframes()

In [6]:
print(customers.shape)
print(dates_2022.shape)
print(products.shape)
print(target_orders.shape)
print(order_lines.shape)
print(orders_aggregate.shape)

(35, 3)
(365, 4)
(18, 3)
(35, 4)
(57096, 11)
(31729, 6)


## Analysis

In [7]:
order_lines.columns

Index(['order_id', 'order_placement_date', 'customer_id', 'product_id',
       'order_qty', 'agreed_delivery_date', 'actual_delivery_date',
       'delivery_qty', 'in_full', 'on_time', 'on_time_in_full'],
      dtype='object')

In [8]:
print('order_lines')
print('ot_perc: ' + str((order_lines['on_time'].sum() / len(order_lines)) * 100))
print('if_perc: ' + str((order_lines['in_full'].sum() / len(order_lines)) * 100))
print('otif_perc: ' + str((order_lines['on_time_in_full'].sum() / len(order_lines)) * 100))


order_lines
ot_perc: 71.11706599411517
if_perc: 65.96083788706738
otif_perc: 47.95432254448648


In [9]:
print('orders_aggregate')
print('ot_perc: ' + str((orders_aggregate['on_time'].sum() / len(orders_aggregate)) * 100))
print('if_perc: ' + str((orders_aggregate['in_full'].sum() / len(orders_aggregate)) * 100))
print('otif_perc: ' + str((orders_aggregate['otif'].sum() / len(orders_aggregate)) * 100))


orders_aggregate
ot_perc: 59.03117022282455
if_perc: 52.78136720350468
otif_perc: 29.020769642913425


In [10]:
order_lines.sample(5)

Unnamed: 0,order_id,order_placement_date,customer_id,product_id,order_qty,agreed_delivery_date,actual_delivery_date,delivery_qty,in_full,on_time,on_time_in_full
47290,FJUL731221501,2022-07-29,789221,25891501,132,2022-07-31,2022-07-31,132,1,1,1
15884,FAP421301401,2022-04-20,789301,25891401,429,2022-04-21,2022-04-20,429,1,1,1
43326,FJUL720121603,2022-07-17,789121,25891501,158,2022-07-20,2022-07-22,158,1,0,0
28820,FJUN61721602,2022-05-31,789721,25891301,83,2022-06-01,2022-06-01,83,1,1,1
14341,FAP417303503,2022-04-15,789303,25891503,220,2022-04-17,2022-04-20,209,0,0,0


In [11]:
# Calculate the mean percentages
customer_split = order_lines.groupby('customer_id')[['in_full', 'on_time', 'on_time_in_full']].mean() * 100

# Reset index to turn 'customer_id' into a regular column
customer_split = customer_split.reset_index().rename(columns={'in_full': 'if_pct',
                                                             'on_time': 'ot_pct', 
                                                             'on_time_in_full': 'otif_pct'})

customer_split = customer_split.round(2)

In [12]:
customer_split = pd.merge(customer_split, target_orders, how='left', on='customer_id')

In [13]:
customer_split.columns

Index(['customer_id', 'if_pct', 'ot_pct', 'otif_pct', 'ontime_target%',
       'infull_target%', 'otif_target%'],
      dtype='object')

In [14]:
customer_split.rename(columns={ 'infull_target%': 'if_target_pct',
                                'otif_target%': 'of_target_pct', 
                                'ontime_target%': 'otif_target_pct'}, inplace=True)

In [15]:
customer_split.columns

Index(['customer_id', 'if_pct', 'ot_pct', 'otif_pct', 'otif_target_pct',
       'if_target_pct', 'of_target_pct'],
      dtype='object')

In [16]:
customer_split = customer_split[['customer_id', 'if_pct', 'if_target_pct', 
                                 'ot_pct', 'of_target_pct', 
                                 'otif_pct', 'otif_target_pct']]

## Machine Learning Model

##### Create Predictive Models:
  
Building predictive models to determine whether an order line meets targets like 'in_full', 'on_time', or 'otif' is a fundamental step. You can use various machine learning algorithms like logistic regression, random forests, or gradient boosting, depending on the complexity of your data and the desired accuracy of predictions.
Ensure you have labeled data where each order line is marked as 'true' or 'false' for meeting the targets. Also, pay attention to feature engineering to extract relevant information from your dataset.
  
##### Identify Associated Variables:
  
Analyzing the importance of variables associated with failure for 'in_full', 'on_time', or 'otif' is crucial for understanding the root causes of delivery failures. You can use techniques like feature importance analysis from tree-based models or permutation importance to identify the most influential features.
Understanding these variables can provide insights into areas of improvement in your production process. For example, if a specific product category or supplier consistently results in delivery failures, you can focus on improving those aspects.
Predict Targets and Evaluate Model Performance:
  
After training your predictive models, assess their performance using appropriate evaluation metrics such as accuracy, precision, recall, F1-score, or area under the ROC curve (AUC). Ensure you use proper validation techniques like cross-validation to estimate the model's generalization performance.
It's essential to have reliable models that accurately predict whether an order line meets the specified targets. These models will serve as the basis for making informed decisions and implementing improvements in your production process.
  
##### Extract Insights for Improvement:
  
Once you have reliable predictive models, leverage the insights gained from them to suggest improvements in your production process. For example, if certain factors consistently lead to delivery failures, you can take corrective actions such as optimizing inventory management, improving supplier relationships, enhancing transportation logistics, or refining order processing workflows.
Continuously monitor the performance of your production process and iteratively refine your strategies based on the insights provided by the predictive models.

### Prepare model data

#### Incorporate as much relevant data into the training model

In [140]:
model_data = order_lines.copy()

##### Merge dates

In [141]:
model_data = pd.merge(model_data, dates_2022, how='left', left_on='order_placement_date', right_on='date')

model_data['order_placement_month'] = model_data['month'].astype('int64')
model_data['order_placement_day'] = model_data['order_placement_date'].dt.day.astype('int64')
model_data['order_placement_week_no'] = model_data['week_number'].astype('int64')

model_data = model_data[['order_id', 'order_placement_date', 'order_placement_month', 'order_placement_day', 'order_placement_week_no', 
                        'customer_id', 'product_id','order_qty', 'agreed_delivery_date', 'actual_delivery_date',
                        'delivery_qty', 'in_full', 'on_time', 'on_time_in_full']].copy()

In [142]:
model_data = pd.merge(model_data, dates_2022, how='left', left_on='agreed_delivery_date', right_on='date')

model_data['agreed_delivery_month'] = model_data['month'].astype('int64')
model_data['agreed_delivery_day'] = model_data['agreed_delivery_date'].dt.day.astype('int64')
model_data['agreed_delivery_week_no'] = model_data['week_number'].astype('int64')

model_data = model_data[['order_id', 'order_placement_date', 'order_placement_month', 'order_placement_day', 'order_placement_week_no', 
                        'customer_id', 'product_id','order_qty', 
                        'agreed_delivery_date', 'agreed_delivery_month', 'agreed_delivery_day', 'agreed_delivery_week_no', 
                        'actual_delivery_date',
                        'delivery_qty', 'in_full', 'on_time', 'on_time_in_full',]].copy()

In [143]:
model_data = pd.merge(model_data, dates_2022, how='left', left_on='actual_delivery_date', right_on='date')

model_data['actual_delivery_month'] = model_data['month'].astype('int64')
model_data['actual_delivery_day'] = model_data['actual_delivery_date'].dt.day.astype('int64')
model_data['actual_delivery_week_no'] = model_data['week_number'].astype('int64')

model_data = model_data[['order_id', 'order_placement_date', 'order_placement_month', 'order_placement_day', 'order_placement_week_no', 
                        'customer_id', 'product_id','order_qty', 
                        'agreed_delivery_date', 'agreed_delivery_month', 'agreed_delivery_day', 'agreed_delivery_week_no', 
                        'actual_delivery_date', 'actual_delivery_month', 'actual_delivery_day', 'actual_delivery_week_no',
                        'delivery_qty', 'in_full', 'on_time', 'on_time_in_full',]].copy()

##### Merge customer

In [144]:
model_data = pd.merge(model_data, customers, how='left', on='customer_id')

##### Merge customer

In [145]:
model_data = pd.merge(model_data, products, how='left', on='product_id')

#### Refine model data datatypes

In [146]:
model_data.dtypes

order_id                           object
order_placement_date       datetime64[ns]
order_placement_month               int64
order_placement_day                 int64
order_placement_week_no             int64
customer_id                         int64
product_id                          int64
order_qty                           int64
agreed_delivery_date       datetime64[ns]
agreed_delivery_month               int64
agreed_delivery_day                 int64
agreed_delivery_week_no             int64
actual_delivery_date       datetime64[ns]
actual_delivery_month               int64
actual_delivery_day                 int64
actual_delivery_week_no             int64
delivery_qty                        int64
in_full                             int64
on_time                             int64
on_time_in_full                     int64
customer_name                      object
city                               object
product_name                       object
category                          

In [147]:
# Change object datatyes into categorical codes
model_data['order_code'] = model_data['order_id'].astype('category').cat.codes
model_data['customer_name_code'] = model_data['customer_name'].astype('category').cat.codes
model_data['city_code'] = model_data['city'].astype('category').cat.codes
model_data['product_name_code'] = model_data['product_name'].astype('category').cat.codes
model_data['category_code'] = model_data['category'].astype('category').cat.codes

In [148]:
# Refine columns
model_data = model_data[[   'order_id',
                            'order_code',
                            'order_placement_date',
                            'order_placement_month',
                            'order_placement_day',
                            'order_placement_week_no',
                            'customer_id',
                            'customer_name',
                            'customer_name_code',
                            'city',
                            'city_code',
                            'product_id',
                            'product_name',
                            'product_name_code',
                            'category',
                            'category_code',
                            'order_qty',
                            'agreed_delivery_date',
                            'agreed_delivery_month',
                            'agreed_delivery_day',
                            'agreed_delivery_week_no',
                            'actual_delivery_date',
                            'actual_delivery_month',
                            'actual_delivery_day',
                            'actual_delivery_week_no',
                            'delivery_qty',
                            'in_full',
                            'on_time',
                            'on_time_in_full']]

# Sort model data by order placement date and reset the index
model_data.sort_values('order_placement_date', inplace=True)
model_data.reset_index(drop=True, inplace=True)

In [149]:
model_data.shape

(57096, 29)

In [150]:
model_data.isna().sum().sum()

0

### Multi-Output Random Forest Classifier

#### Model 1
  
First model, Random split model is more accurate than sequential split model  
  
Random:  
Mean Cross-validation score: 0.5855006584300525  
  
Sequential:  
Mean Cross-validation score: 0.365746872700515

##### Random Model

In [44]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [47]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.54045534 0.57758319 0.56987741 0.65043783 0.56497373 0.60700525
 0.64938704 0.56987741 0.57968476 0.64238179 0.63257443 0.60105079
 0.60980736 0.64763573 0.5821366  0.59754816 0.65557113 0.58058865
 0.58023826 0.27119832]
Mean Cross-validation score: 0.5855006584300525


In [None]:
from sklearn.metrics import accuracy_score, hamming_loss, precision_score, recall_score, f1_score
from sklearn.metrics import jaccard_score, zero_one_loss, coverage_error, average_precision_score

# Assuming y_pred and y_test are defined after making predictions
# y_pred = multi_output_classifier.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)
print("Hamming Loss:", hamming_loss_value)

# Precision
precision = precision_score(y_test, y_pred, average='micro')  # You can change the average parameter as needed
print("Precision:", precision)

# Recall
recall = recall_score(y_test, y_pred, average='micro')  # You can change the average parameter as needed
print("Recall:", recall)

# F1-score
f1 = f1_score(y_test, y_pred, average='micro')  # You can change the average parameter as needed
print("F1-score:", f1)

# Jaccard Similarity Score
jaccard = jaccard_score(y_test, y_pred, average='micro')  # You can change the average parameter as needed
print("Jaccard Similarity Score:", jaccard)

# Zero-One Loss
zero_one_loss_value = zero_one_loss(y_test, y_pred)
print("Zero-One Loss:", zero_one_loss_value)

# Coverage Error
coverage_error_value = coverage_error(y_test, y_pred)
print("Coverage Error:", coverage_error_value)

# Average Precision Score
average_precision = average_precision_score(y_test, y_pred, average='micro')  # You can change the average parameter as needed
print("Average Precision Score:", average_precision)


##### Sequential Model

In [None]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Split the data into features (X) and targets (y)
X = model_data[features]
y = model_data[targets]

# Perform train-test split while preserving temporal order
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import make_scorer, accuracy_score

X = model_data[features]
y = model_data[targets]

# Define the sequential cross-validation splitter
tscv = TimeSeriesSplit(n_splits=20)

# Define a custom scorer for multi-output classification
scorer = make_scorer(accuracy_score)

# Perform cross-validation
cv_scores = cross_val_score(multi_output, X, y, cv=tscv, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())

#### Model 2
  
Random Search to tune hyperparameters
  
Mean Cross-validation score: 0.5975163932514909  
Paramters tuned:  
    max_depth,   
    max_features,   
    min_samples_leaf,  
    min_samples_split=,    
    n_estimators

##### Random Search 1
  
param_dist = {  
    'estimator__n_estimators': randint(10, 200),  
    'estimator__max_depth': randint(2, 20),  
    'estimator__min_samples_split': randint(2, 20),  
    'estimator__min_samples_leaf': randint(1, 20),  
    'estimator__max_features': ['auto', 'sqrt', 'log2']   }

Best Parameters: RandomForestClassifier(  
    max_depth=15,   
    max_features='log2',   
    min_samples_leaf=2,  
    min_samples_split=15,    
    n_estimators=192)

In [61]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from scipy.stats import randint

# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Define paramater ranges
param_dist = {
    'estimator__n_estimators': randint(10, 200),
    'estimator__max_depth': randint(2, 20),
    'estimator__min_samples_split': randint(2, 20),
    'estimator__min_samples_leaf': randint(1, 20),
    'estimator__max_features': ['auto', 'sqrt', 'log2']
}

# Fit model
random_search = RandomizedSearchCV(estimator=multi_output, param_distributions=param_dist, n_iter=100, cv=5, random_state=9, n_jobs=-1, scoring='accuracy')

random_search.fit(X_train, y_train)

# Best estimator
best_rf_estimator = random_search.best_estimator_.estimators_[0]



215 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
131 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 273, in _wrap_func_call
    return func()
           ^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  F

In [67]:
print("Best Parameters:", best_rf_estimator)

Best Parameters: RandomForestClassifier(max_depth=15, max_features='log2', min_samples_leaf=2,
                       min_samples_split=15, n_estimators=192, random_state=9)


##### Random Search 2
  
param_dist = {  
    'estimator__n_estimators': randint(150, 250),  
    'estimator__max_depth': randint(10, 20),  
    'estimator__min_samples_split': randint(10, 20),  
    'estimator__min_samples_leaf': randint(1, 5),  
    'estimator__max_features': ['auto', 'sqrt', 'log2']  
}
  
Best Parameters: RandomForestClassifier(  
    max_depth=19,  
    max_features='log2',   
    min_samples_leaf=2,  
    min_samples_split=18,  
    n_estimators=189)

In [68]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from scipy.stats import randint

# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Define paramater ranges
param_dist = {
    'estimator__n_estimators': randint(150, 250),
    'estimator__max_depth': randint(10, 20),
    'estimator__min_samples_split': randint(10, 20),
    'estimator__min_samples_leaf': randint(1, 5),
    'estimator__max_features': ['auto', 'sqrt', 'log2']
}

# Fit model
random_search = RandomizedSearchCV(estimator=multi_output, param_distributions=param_dist, n_iter=100, cv=5, random_state=9, n_jobs=-1, scoring='accuracy')

random_search.fit(X_train, y_train)

# Best estimator
best_rf_estimator = random_search.best_estimator_.estimators_[0]

175 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
82 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 273, in _wrap_func_call
    return func()
           ^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  Fi

In [69]:
print("Best Parameters:", best_rf_estimator)

Best Parameters: RandomForestClassifier(max_depth=19, max_features='log2', min_samples_split=18,
                       n_estimators=189, random_state=9)


##### Random Search 3
  
param_dist = {  
    'estimator__n_estimators': randint(170, 200),  
    'estimator__max_depth': randint(15, 25),  
    'estimator__min_samples_split': randint(15, 20),  
    'estimator__min_samples_leaf': randint(1, 5),  
    'estimator__max_features': ['auto', 'sqrt', 'log2']  
}  
    
Best Parameters: RandomForestClassifier(  
    max_depth=24,   
    max_features='log2',   
    min_samples_leaf=2,  
    min_samples_split=15,  
    n_estimators=191)  

In [71]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from scipy.stats import randint

# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Define paramater ranges
param_dist = {
    'estimator__n_estimators': randint(170, 200),
    'estimator__max_depth': randint(15, 25),
    'estimator__min_samples_split': randint(15, 20),
    'estimator__min_samples_leaf': randint(1, 5),
    'estimator__max_features': ['auto', 'sqrt', 'log2']
}

# Fit model
random_search = RandomizedSearchCV(estimator=multi_output, param_distributions=param_dist, n_iter=100, cv=5, random_state=9, n_jobs=-1, scoring='accuracy')

random_search.fit(X_train, y_train)

# Best estimator
best_rf_estimator = random_search.best_estimator_.estimators_[0]

185 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
91 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 273, in _wrap_func_call
    return func()
           ^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  Fi

In [72]:
print("Best Parameters:", best_rf_estimator)

Best Parameters: RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15,
                       n_estimators=191, random_state=9)


##### Model 2
  
With updated paramaters
  
Mean Cross-validation score: 0.5975163932514909

In [82]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [83]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.5530648  0.60350263 0.5943958  0.64903678 0.60175131 0.6150613
 0.64063047 0.59474606 0.58178634 0.64763573 0.62802102 0.61155867
 0.61015762 0.63782837 0.59544658 0.59964974 0.67028732 0.61142256
 0.59810792 0.30623686]
Mean Cross-validation score: 0.5975163932514909


In [133]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=7)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=7)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [134]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.54325744 0.60210158 0.59649737 0.64238179 0.60805604 0.61541156
 0.63922942 0.59474606 0.58388792 0.64728546 0.62416813 0.61050788
 0.61085814 0.63817863 0.59229422 0.60105079 0.66398038 0.61107218
 0.60196216 0.28065872]
Mean Cross-validation score: 0.5953792937555303


#### Model 3

Added columns to model_data

In [151]:
model_data.sample(10)

Unnamed: 0,order_id,order_code,order_placement_date,order_placement_month,order_placement_day,order_placement_week_no,customer_id,customer_name,customer_name_code,city,...,agreed_delivery_day,agreed_delivery_week_no,actual_delivery_date,actual_delivery_month,actual_delivery_day,actual_delivery_week_no,delivery_qty,in_full,on_time,on_time_in_full
37882,FJUL71321601,11322,2022-06-29,6,29,27,789321,Chiptec Stores,2,Ahmedabad,...,1,27,2022-07-01,7,1,27,87,1,1,1
33645,FJUN617401302,17467,2022-06-16,6,16,25,789401,Propel Mart,10,Surat,...,17,25,2022-06-16,6,16,25,41,1,1,1
23570,FMY516421501,27623,2022-05-14,5,14,20,789421,Lotus Mart,9,Ahmedabad,...,16,21,2022-05-17,5,17,21,143,0,0,0
21356,FMY510401502,26379,2022-05-07,5,7,19,789401,Propel Mart,10,Surat,...,10,20,2022-05-10,5,10,20,476,1,1,1
27422,FMY530202302,30206,2022-05-27,5,27,22,789202,Rel Fresh,11,Ahmedabad,...,30,23,2022-05-30,5,30,23,49,1,1,1
42416,FJUL717101103,12061,2022-07-14,7,14,29,789101,Vijay Stores,13,Surat,...,17,30,2022-07-17,7,17,30,450,1,1,1
6883,FMR325320502,23971,2022-03-22,3,22,13,789320,Chiptec Stores,2,Surat,...,25,13,2022-03-25,3,25,13,344,0,1,0
45840,FJUL728122502,14145,2022-07-25,7,25,31,789122,Coolblue,3,Vadodara,...,28,31,2022-07-29,7,29,31,126,0,0,0
13151,FAP412122603,415,2022-04-11,4,11,16,789122,Coolblue,3,Vadodara,...,12,16,2022-04-13,4,13,16,135,0,0,0
12897,FAP414102301,780,2022-04-11,4,11,16,789102,Vijay Stores,13,Ahmedabad,...,14,16,2022-04-17,4,17,17,58,1,0,0


In [154]:
list(model_data.columns)

['order_id',
 'order_code',
 'order_placement_date',
 'order_placement_month',
 'order_placement_day',
 'order_placement_week_no',
 'customer_id',
 'customer_name',
 'customer_name_code',
 'city',
 'city_code',
 'product_id',
 'product_name',
 'product_name_code',
 'category',
 'category_code',
 'order_qty',
 'agreed_delivery_date',
 'agreed_delivery_month',
 'agreed_delivery_day',
 'agreed_delivery_week_no',
 'actual_delivery_date',
 'actual_delivery_month',
 'actual_delivery_day',
 'actual_delivery_week_no',
 'delivery_qty',
 'in_full',
 'on_time',
 'on_time_in_full',
 'delivery_duration',
 'order_placement_weekday',
 'agreed_delivery_deekday',
 'actual_delivery_weekday']

In [164]:
model_data['delivery_duration'] = (model_data['order_placement_date'] - model_data['actual_delivery_date']).dt.days.astype('int64')
model_data['order_placement_weekday'] = model_data['order_placement_date'].dt.weekday.astype('int64')
model_data['agreed_delivery_weekday'] = model_data['agreed_delivery_date'].dt.weekday.astype('int64')
model_data['actual_delivery_weekday'] = model_data['actual_delivery_date'].dt.weekday.astype('int64')

In [155]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [156]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.68231173 0.69176883 0.67425569 0.70402802 0.69281961 0.69527145
 0.69352014 0.67110333 0.69492119 0.70017513 0.69211909 0.69877408
 0.69457093 0.69001751 0.68196147 0.69562172 0.7091801  0.6899089
 0.69446391 0.68114926]
Mean Cross-validation score: 0.6913971051168545


In [157]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_deekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full','on_time_in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [158]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.62311734 0.67740806 0.66725044 0.67880911 0.66619965 0.68406305
 0.66935201 0.65709282 0.67810858 0.68686515 0.68021016 0.68791594
 0.68231173 0.67880911 0.67145359 0.68301226 0.68920813 0.66958655
 0.68044849 0.64120533]
Mean Cross-validation score: 0.6726213738790426


In [159]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full', 'on_time_in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [160]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.52784588 0.59019264 0.56987741 0.60770578 0.54921191 0.60910683
 0.58563923 0.56742557 0.5408056  0.59964974 0.59929947 0.58704028
 0.60210158 0.61996497 0.58423818 0.56637478 0.6250876  0.53468816
 0.57428171 0.25122635]
Mean Cross-validation score: 0.5645881836044168


In [None]:
# Add additional columns to my model data

In [None]:
rolling_windows = [3, 5]

cols_to_roll = ['order_qty',
                'delivery_qty',
                'in_full',
                'on_time',
                'on_time_in_full']

new_cols = [f'{c}_rolling' for c in cols_to_roll]

# Define function to add rolling averages
def rolling_averages(group, cols_to_roll, rolling_windows):
    group = group.sort_values('order_placement_date')
    for window in rolling_windows:
        rolling_stats = group[cols_to_roll].rolling(window, closed='left').mean()
        rolling_cols = [f'{c}_rolling_{window}' for c in cols_to_roll]
        group[rolling_cols] = rolling_stats
        group = group.dropna(subset=rolling_cols)
    return group

# Apply function
rolling_customers = model_data.groupby('product_id').apply(lambda x: rolling_averages(x, cols_to_roll, rolling_windows))

In [174]:
from sklearn.ensemble import GradientBoostingClassifier

# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define gradient boosting classifier model, wrap in multi output classifier
gradient_boosting = GradientBoostingClassifier(max_depth=3, n_estimators=100, random_state=9)
multi_output_gbm = MultiOutputClassifier(gradient_boosting, n_jobs=-1)

# Fit model
multi_output_gbm.fit(X_train, y_train)

# Predict
y_pred = multi_output_gbm.predict(X_test)


In [175]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
scorer = make_scorer(accuracy_score)

# Perform cross-validation
cv_scores = cross_val_score(multi_output_gbm, model_data[features], model_data[targets], cv=20, scoring='accuracy')

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.57443082 0.67390543 0.65464098 0.68721541 0.68791594 0.67355517
 0.67285464 0.6647986  0.69281961 0.68721541 0.68441331 0.69772329
 0.68721541 0.67460595 0.66024518 0.67635727 0.69796776 0.67379117
 0.67344078 0.68325158]
Mean Cross-validation score: 0.6739181865375905


In [176]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split

# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define XGBoost classifier model, wrap in multi output classifier
xgb_classifier = xgb.XGBClassifier(max_depth=3, n_estimators=100, random_state=9)
multi_output_xgb = MultiOutputClassifier(xgb_classifier, n_jobs=-1)

# Fit model
multi_output_xgb.fit(X_train, y_train)

# Predict
y_pred = multi_output_xgb.predict(X_test)

In [177]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(xgb_classifier, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.69947461 0.71768827 0.71383538 0.73870403 0.72399299 0.72644483
 0.73695271 0.71033275 0.73099825 0.74255692 0.72784588 0.7408056
 0.73870403 0.72889667 0.73274956 0.71628722 0.74912404 0.7428171
 0.70532586 0.72529783]
Mean Cross-validation score: 0.727441726179007


In [182]:
from sklearn.ensemble import GradientBoostingClassifier

# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full', 'on_time_in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define gradient boosting classifier model, wrap in multi output classifier
gradient_boosting = GradientBoostingClassifier(max_depth=3, n_estimators=100, random_state=9)
multi_output_gbm = MultiOutputClassifier(gradient_boosting, n_jobs=-1)

# Fit model
multi_output_gbm.fit(X_train, y_train)

# Predict
y_pred = multi_output_gbm.predict(X_test)


In [183]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
scorer = make_scorer(accuracy_score)

# Perform cross-validation
cv_scores = cross_val_score(multi_output_gbm, model_data[features], model_data[targets], cv=20, scoring='accuracy')

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.52434326 0.66269702 0.64658494 0.67145359 0.66935201 0.66339755
 0.66304729 0.64518389 0.66409807 0.67250438 0.6676007  0.67775832
 0.66584939 0.66269702 0.6525394  0.6676007  0.67659425 0.668185
 0.66748423 0.66222845]
Mean Cross-validation score: 0.6575599735891617


In [184]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split

# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full', 'on_time_in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define XGBoost classifier model, wrap in multi output classifier
xgb_classifier = xgb.XGBClassifier(max_depth=3, n_estimators=100, random_state=9)
multi_output_xgb = MultiOutputClassifier(xgb_classifier, n_jobs=-1)

# Fit model
multi_output_xgb.fit(X_train, y_train)

# Predict
y_pred = multi_output_xgb.predict(X_test)

In [185]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(xgb_classifier, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.72504378 0.71313485 0.72084063 0.73590193 0.73765324 0.70647986
 0.73064799 0.68721541 0.68126095 0.74115587 0.73450088 0.7323993
 0.70542907 0.71383538 0.73099825 0.70367776 0.74632095 0.72459706
 0.69866854 0.72284513]
Mean Cross-validation score: 0.719630340309542


In [188]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier


In [190]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Initialize individual classifiers
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

gradient_boosting = GradientBoostingClassifier(max_depth=3, n_estimators=100, random_state=9)
multi_output_gbm = MultiOutputClassifier(gradient_boosting, n_jobs=-1)

xgb_classifier = xgb.XGBClassifier(max_depth=3, n_estimators=100, random_state=9)
multi_output_xgb = MultiOutputClassifier(xgb_classifier, n_jobs=-1)


pipeline = Pipeline([
    ('preprocessor', StandardScaler()),  # Preprocessing steps if needed
    ('classifier_gbc', multi_output_gbm),  # Gradient Boosting classifier
    ('classifier_xgb', multi_output_xgb)  # XGBoost classifier
])

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'MultiOutputClassifier(estimator=GradientBoostingClassifier(random_state=9),
                      n_jobs=-1)' (type <class 'sklearn.multioutput.MultiOutputClassifier'>) doesn't

In [None]:
# Perform cross-validation on the pipeline
scorer = make_scorer(accuracy_score)

# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(pipeline, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())